def scrape_faculty():
    """Scrape the Dalhousie faculties index page and persist all faculties.

    Fetches the faculties listing, extracts each faculty's name and home
    page URL, assigns sequential ids via ``generate_id``, and writes the
    result to ``faculty.xml``. No return value; side effect is the XML file.
    """
    print("*************** Scraping Faculty *********************")
    url = 'https://www.dal.ca/academics/faculties.html'
    soup = get_soup(url)
    main_div = soup.find_all("div", class_="text parbase section")
    f_list = list()
    for child in main_div:
        if is_faculty(child):
            # Hoist the heading lookup: the original resolved the same
            # <h2> node twice with back-to-back find_next("h2") calls.
            heading = child.find_next("h2")
            name = heading.get_text().strip()
            href = heading.find_next('a').get("href").strip()
            # Links on the page may be site-relative; prepend the domain.
            if not href.startswith("http"):
                href = "https://dal.ca" + href
            f_list.append(Faculty(name, href))
    generate_id(f_list)
    XmlList().from_list(f_list).save("faculty.xml")
def scrape_buildings():
    """Scrape campus buildings and persist buildings and campuses to XML.

    Delegates the actual page scraping to ``get_buildings`` (which also
    fills ``campus_ids``), assigns building ids, collects the distinct
    amenity names seen across all buildings, and writes ``campus.xml``
    and ``building.xml``. No return value; side effects are the XML files.
    """
    campus_ids = dict()
    buildings = get_buildings(campus_ids)
    b_list = XmlList()
    # Used as an ordered "set" of amenity names: value is always 1 and
    # build_ids_from_dict later replaces it with a generated id.
    amenities: Dict[str, int] = dict()
    generate_id(buildings)
    for b in buildings:
        b_list.add(b.to_xml_obj())
        # Iterate the dict directly instead of the redundant .keys() view.
        for key in b.amenities:
            amenities[key] = 1
    build_ids_from_dict(amenities)
    # save_to_file(dict_to_xml_rows(amenities, "amenity"), "amenity.xml")
    save_to_file(dict_to_xml_rows(campus_ids, "campus"), "campus.xml")
    b_list.save("building.xml")
def scrapeUndergraduatePrograms():
    """Scrape every undergraduate program page and persist to program.xml.

    Walks the programs index, follows each program link, parses the
    "Degree overview" key/value block (faculty, program length, start,
    department, campus), resolves foreign-key ids via the ``get_*_id``
    helpers, and writes the collected programs to ``program.xml``.
    Programs whose page lacks a "Degree overview" section are skipped.
    """
    print("*************** Scraping UG Programs *********************")
    url = "https://www.dal.ca/academics/programs.html"
    soup = get_soup(url)
    programs_tag = soup.find("div", {'id':'node_9d9b8bcabfe64055a607401063a8f68econtentPartabcontainerentriestabentry'}).find_all("dt")
    p_list: List[Program] = list()
    for tag in programs_tag:
        anchor = tag.find("a")
        name = anchor.get_text()
        web_page = dal_prefix(anchor.get('href'))
        p_soup = get_soup(web_page)
        degree_overview = p_soup.find("h3", string="Degree overview")
        if degree_overview is None:
            continue
        local_dict = dict()
        for sentence in degree_overview.parent.get_text().split("\n"):
            if ("Degree overview" in sentence) or (":" not in sentence):
                continue
            # Split on the FIRST colon only; the original split(":") threw
            # away everything after a second colon inside the value.
            key, _, val = sentence.partition(":")
            local_dict[key.strip().lower()] = val.strip()
        p = Program(name, "Undergraduate", web_page)
        faculty = local_dict.get("faculty")
        # Bug fix: the original called faculty.startswith(...) unguarded and
        # raised AttributeError whenever the overview had no "Faculty" row.
        if faculty is not None:
            if faculty.startswith("Faculty of "):
                faculty = faculty.replace("Faculty of ", "")
            p.faculty_id = get_faculty_id(faculty)
        p.program_length = local_dict.get("program length")
        p.program_start = local_dict.get("program start")
        if local_dict.get("department") is not None:
            p.department_id = get_department_id(local_dict.get("department"))
        campus = local_dict.get("campus")
        if campus is not None:
            # A program may list several campuses; keep only the first.
            p.campus_id = get_campus_id(campus.strip().split(", ")[0])
        p_list.append(p)
    generate_id(p_list)
    XmlList().from_list(p_list).save("program.xml")
def scrapeCampusServices():
    """Scrape the faculty/staff services page and persist services + links.

    Each "c-title" heading yields one CampusService (id assigned 1..N in
    page order) plus a list of WebLinks rows tied back to it via
    ``service_id``. Writes ``campus_service.xml`` and ``web_links.xml``.
    No return value; side effects are the two XML files.
    """
    print("*************** Scraping Campus Services *********************")
    soup = get_soup('https://www.dal.ca/faculty_staff.html')
    service_nodes = soup.find_all("h4", class_="c-title")
    service_list: List[CampusService] = list()
    web_link_list: List[WebLinks] = list()
    # enumerate replaces the hand-maintained counter; ids start at 1.
    for service_id, node in enumerate(service_nodes, start=1):
        link_nodes = node.find_next("ul").find_all("li")
        # NOTE(review): node is itself an <h4>, so find_next("h4") skips
        # ahead to the FOLLOWING h4 before grabbing the anchor — confirm
        # against the live markup that this is intentional. Behavior is
        # preserved here exactly as in the original.
        service = node.find_next("h4").find_next("a")
        service_url = dal_prefix(service.get("href"))
        service_name = service.get_text()
        campus_service = CampusService(service_name, service_url)
        campus_service.id = service_id
        service_list.append(campus_service)
        for link_node in link_nodes:
            link = link_node.find_next("a")
            web_link = WebLinks(link.get_text(), dal_prefix(link.get('href')), service_name)
            web_link.service_id = service_id
            web_link_list.append(web_link)
    xml_camp_service = XmlList()
    xml_camp_service.from_list(service_list)
    xml_camp_service.save("campus_service.xml")
    xml_web_links = XmlList()
    xml_web_links.from_list(web_link_list)
    xml_web_links.save("web_links.xml")