def scrape_faculty():
    """Scrape the Dalhousie faculties page and save the result as ``faculty.xml``.

    Every ``div`` with class ``text parbase section`` that ``is_faculty``
    accepts yields one ``Faculty``: its name is the stripped text of the next
    ``h2`` after the div, and its link is the ``href`` of the first anchor
    following that heading. Links that do not start with ``http`` are
    prefixed with ``https://dal.ca``.
    """
    print("*************** Scraping Faculty *********************")
    page = get_soup('https://www.dal.ca/academics/faculties.html')
    faculties = []

    for section in page.find_all("div", class_="text parbase section"):
        if not is_faculty(section):
            continue
        heading = section.find_next("h2")
        title = heading.get_text().strip()
        link = heading.find_next('a').get("href").strip()
        if not link.startswith("http"):
            # Site-relative link: make it absolute.
            link = "https://dal.ca" + link
        faculties.append(Faculty(title, link))

    # IDs are handed out once, after the full list is known.
    generate_id(faculties)
    XmlList().from_list(faculties).save("faculty.xml")
Пример #2
0
def scrape_buildings():
    """Collect buildings and write ``campus.xml`` and ``building.xml``.

    ``campus_ids`` starts empty and is passed to ``get_buildings``
    (presumably filled in by it -- confirm against that helper); it is then
    exported as ``campus`` rows. Each building is converted with
    ``to_xml_obj`` and added to the XML list that is saved as
    ``building.xml``. Amenity names seen on any building are gathered into a
    dict and passed to ``build_ids_from_dict``.
    """
    campus_ids = {}
    buildings = get_buildings(campus_ids)
    xml_buildings = XmlList()
    # Used as an ordered set of amenity names; the value 1 is a placeholder.
    amenities: Dict[str, int] = {}
    generate_id(buildings)
    for building in buildings:
        xml_buildings.add(building.to_xml_obj())
        amenities.update(dict.fromkeys(building.amenities.keys(), 1))
    build_ids_from_dict(amenities)
    # The amenity export is currently disabled:
    # save_to_file(dict_to_xml_rows(amenities, "amenity"), "amenity.xml")
    save_to_file(dict_to_xml_rows(campus_ids, "campus"), "campus.xml")
    xml_buildings.save("building.xml")
Пример #3
0
def _parse_degree_overview(overview_text):
    """Turn the "Degree overview" text into a ``{label: value}`` dict.

    Only lines that contain a colon (and are not the "Degree overview"
    heading itself) are kept. Each line is split at its FIRST colon, so a
    value that itself contains a colon is preserved whole. Labels are
    stripped and lower-cased; values are stripped.
    """
    details = dict()
    for line in overview_text.split("\n"):
        if ("Degree overview" in line) or (":" not in line):
            continue
        label, _, value = line.partition(":")
        details[label.strip().lower()] = value.strip()
    return details


def scrapeUndergraduatePrograms():
    """Scrape undergraduate programs from dal.ca and save them to ``program.xml``.

    For every ``dt`` entry of the programs tab, the linked program page is
    fetched and its "Degree overview" section is parsed for faculty, program
    length, program start, department and campus. Program pages without a
    "Degree overview" heading are skipped, as are entries without a link.
    Fields missing from the overview are left unset on the ``Program``.
    """
    print("*************** Scraping UG Programs *********************")
    url = "https://www.dal.ca/academics/programs.html"
    soup = get_soup(url)
    # NOTE: the container is located by a hard-coded element id; if the page
    # markup changes, ``find`` returns None and this line raises.
    programs_tag = soup.find("div", {'id':'node_9d9b8bcabfe64055a607401063a8f68econtentPartabcontainerentriestabentry'}).find_all("dt")
    p_list: List[Program] = list()
    for tag in programs_tag:
        link = tag.find("a")
        if link is None:
            # Entry without a link: there is no program page to visit.
            continue
        name = link.get_text()
        web_page = dal_prefix(link.get('href'))
        p_soup = get_soup(web_page)
        overview_heading = p_soup.find("h3", string="Degree overview")
        if overview_heading is None:
            continue
        local_dict = _parse_degree_overview(overview_heading.parent.get_text())

        p = Program(name, "Undergraduate", web_page)
        faculty = local_dict.get("faculty")
        # The overview may have no "Faculty:" line; previously this crashed
        # with AttributeError on None.
        if faculty is not None:
            if faculty.startswith("Faculty of "):
                faculty = faculty.replace("Faculty of ", "")
            p.faculty_id = get_faculty_id(faculty)
        p.program_length = local_dict.get("program length")
        p.program_start = local_dict.get("program start")
        department = local_dict.get("department")
        if department is not None:
            p.department_id = get_department_id(department)
        campus = local_dict.get("campus")
        if campus is not None:
            # Several campuses may be listed; only the first one is used.
            p.campus_id = get_campus_id(campus.strip().split(", ")[0])
        p_list.append(p)
    # Assign IDs once over the complete list, as scrape_faculty does, rather
    # than re-running over the growing list on every iteration.
    generate_id(p_list)
    XmlList().from_list(p_list).save("program.xml")
def scrapeCampusServices():
    """Scrape campus services and their links into two XML files.

    Each ``h4.c-title`` heading on the faculty/staff page produces one
    ``CampusService`` (numbered from 1 in page order) plus one ``WebLinks``
    entry per ``li`` of the next ``ul`` after the heading. Services are saved
    to ``campus_service.xml`` and links to ``web_links.xml``.
    """
    print("*************** Scraping Campus Services *********************")
    page = get_soup('https://www.dal.ca/faculty_staff.html')
    headings = page.find_all("h4", class_="c-title")
    service_list: List[CampusService] = []
    web_link_list: List[WebLinks] = []

    for service_id, heading in enumerate(headings, start=1):
        list_items = heading.find_next("ul").find_all("li")
        # NOTE(review): find_next("h4") searches AFTER the current heading,
        # so the service link is taken from the following h4 -- confirm this
        # matches the page layout.
        anchor = heading.find_next("h4").find_next("a")
        service_url = dal_prefix(anchor.get("href"))
        service_name = anchor.get_text()

        campus_service = CampusService(service_name, service_url)
        campus_service.id = service_id
        service_list.append(campus_service)

        for item in list_items:
            item_anchor = item.find_next("a")
            target = dal_prefix(item_anchor.get('href'))
            label = item_anchor.get_text()
            web_link = WebLinks(label, target, service_name)
            # Tie the link back to the service it was listed under.
            web_link.service_id = service_id
            web_link_list.append(web_link)

    services_xml = XmlList()
    services_xml.from_list(service_list)
    services_xml.save("campus_service.xml")

    links_xml = XmlList()
    links_xml.from_list(web_link_list)
    links_xml.save("web_links.xml")