def extract_organizers_info(url):
    """Extracts organizer groups (members, position, institution) from the given page."""
    organizers_dummy = {
        attribute: None
        for attribute in ["members", "position", "institution"]
    }
    organizers_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return organizers_info_list
    organizers = BeautifulSoup(page, 'html.parser').find(
        "section", {"class": "content"})
    for child in organizers.findChildren('p'):
        counter = 0
        for tag in child.contents:
            # Both tag children and bare strings can carry organizer information;
            # normalise them to plain text and skip fragments without word characters.
            if isinstance(tag, element.Tag):
                text = tag.text
                if re.sub(r"[\W]", "", text) == "":
                    continue
            elif isinstance(tag, element.NavigableString):
                text = re.sub(r"[^\w\s]", "", tag).strip()
                if text == "":
                    continue
            else:
                continue
            authors_str = util.basic_string_clean(text)
            authors_list = [
                author.strip()
                for author in re.split(r",|\sand\s", authors_str)
            ]
            attribute = fill_dummy(counter)
            if attribute == "members":
                organizers_dummy[attribute] = authors_list
            else:
                organizers_dummy[attribute] = util.basic_string_clean(text)
            counter += 1
        # A complete organizer group needs at least members and an institution.
        if (counter > 1 and organizers_dummy["members"] is not None
                and organizers_dummy["institution"] is not None):
            organizers_info_list.append(copy.copy(organizers_dummy))
            organizers_dummy["members"] = None
            organizers_dummy["institution"] = None
        organizers_dummy["members"] = util.basic_string_clean(child.text.strip())
    return organizers_info_list
def extract_workshops(url):
    """
    Extracts all information available for workshops provided at
    https://coling2020.org/pages/workshops

    :param url: the url where the workshops are listed
    :return: list of dictionaries with a workshop represented as one dictionary.
    """
    workshops = []
    try:
        page = request.urlopen(url)
    except Exception:
        print("Could not connect to url.")
        return workshops
    soup = BeautifulSoup(page, 'html.parser').find("section",
                                                   {"id": "main_content"})
    # Each <h3> names a workshop day; the <ul> that follows lists the workshops
    # held on that day.
    for child in soup.findChildren('h3'):
        for i in child.findNext('ul').find_all('li'):
            workshop = {
                attribute: None
                for attribute in [
                    "workshop_name", "workshop_organizer",
                    "workshop_description", "workshop_day",
                    "workshop_location", "workshop_link"
                ]
            }
            workshop['workshop_day'] = child.text
            workshop['workshop_name'] = util.basic_string_clean(
                i.find('a').text)
            workshop['workshop_link'] = i.find('a')['href']
            workshops.append(copy.copy(workshop))
    return workshops
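# A hedged usage sketch for extract_workshops (not part of the crawler itself):
# the helper name `_demo_extract_workshops` is illustrative, the URL is the one
# named in the docstring above, and running it requires network access.
def _demo_extract_workshops():
    for workshop in extract_workshops("https://coling2020.org/pages/workshops"):
        print(workshop["workshop_day"], "|", workshop["workshop_name"],
              "|", workshop["workshop_link"])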
def extract_paper_info(url):
    paper_dummy = {
        attribute: None
        for attribute in [
            "paper_title", "paper_authors", "paper_type", "paper_link",
            "paper_time", "paper_keywords"
        ]
    }
    paper_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return paper_info_list
    papers = BeautifulSoup(page, 'html.parser').findAll("p",
                                                        class_="paper-item")
    for paper in papers:
        # The first node of a paper item is the title; the third holds the author string.
        paper_dummy["paper_title"] = util.basic_string_clean(paper.contents[0])
        authors_str = paper.contents[2].text
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        paper_dummy["paper_authors"] = authors_list
        paper_info_list.append(copy.copy(paper_dummy))
    return paper_info_list
def extract_tutorial_info(url):
    tutorial_dummy = {
        attribute: None
        for attribute in [
            "tutorial_name", "tutorial_author", "tutorial_abstract",
            "tutorial_time", "tutorial_location", "tutorial_link"
        ]
    }
    tutorial_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return tutorial_info_list
    datetimes = get_timestamps(url)
    tutorials = BeautifulSoup(page, 'html.parser').findAll(
        "h3", {"class": "tutorials-anchor"})
    for tutorial in tutorials:
        tutorial_dummy["tutorial_name"] = util.basic_string_clean(
            tutorial.text)
        # Pick the first or second extracted time slot depending on the hour
        # that leads the anchor text.
        if tutorial.text[:2] < datetimes[1][1][:2]:
            tutorial_dummy["tutorial_time"] = datetimes[0][0]
        else:
            tutorial_dummy["tutorial_time"] = datetimes[1][0]
        next_node = tutorial.findNext("div")
        authors_str = tutorial.findNext("p", {"class": "tutorials-tutors"}).text
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        tutorial_dummy["tutorial_author"] = authors_list
        tutorial_dummy["tutorial_location"] = tutorial.findNext(
            "p", {"class": "tutorials-room"}).text.split('.')[1].strip()
        tutorial_dummy["tutorial_link"] = tutorial.findNext(
            "a", {"class": "tutorials-materials"})["href"]
        # The abstract is spread over the unclassed <p> tags of the following <div>.
        abstract_p = next_node.findAll("p", class_=None)
        abstract = "".join(p.text for p in abstract_p)
        tutorial_dummy["tutorial_abstract"] = abstract
        tutorial_info_list.append(copy.copy(tutorial_dummy))
    return tutorial_info_list
def extract_workshop_info(url):
    workshop_dummy = {
        attribute: None
        for attribute in [
            "workshop_name", "workshop_organizer", "workshop_description",
            "workshop_day", "workshop_location", "workshop_link"
        ]
    }
    workshop_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return workshop_info_list
    datetimes = get_timestamps(url)
    current_date = datetimes[0][0]
    workshop_div = BeautifulSoup(page, 'html.parser').findAll(
        "div", class_="workshops")[0]
    workshops = workshop_div.findAll("h3")
    for workshop in workshops:
        workshop_dummy["workshop_name"] = util.basic_string_clean(
            workshop.text)
        # Resolve the day for this workshop from the extracted timestamps.
        current_date = get_timestamp_for_event(datetimes,
                                               (current_date, workshop.text))
        workshop_dummy["workshop_day"] = current_date
        workshop_dummy["workshop_link"] = workshop.contents[0]["href"]
        authors_str = util.basic_string_clean(
            workshop.findNext("p", {"class": "tutorials-tutors"}).text)
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        workshop_dummy["workshop_organizer"] = authors_list
        workshop_dummy["workshop_location"] = workshop.findNext(
            "p", {"class": "tutorials-room"}).text.split('.')[1].strip()
        workshop_info_list.append(copy.copy(workshop_dummy))
    return workshop_info_list
def extract_tutorials_info(url):
    tutorial_dummy = {
        attribute: None
        for attribute in [
            "tutorial_name", "tutorial_author", "tutorial_abstract",
            "tutorial_time", "tutorial_location", "tutorial_link"
        ]
    }
    tutorial_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return tutorial_info_list
    tutorials = BeautifulSoup(page, 'html.parser').find(
        "section", {"id": "main_content"})
    # Each <p> carries the tutorial name on its first line and the authors on the second.
    for child in tutorials.findChildren('p'):
        text = child.text.split('\n')
        tutorial_dummy['tutorial_name'] = util.basic_string_clean(text[0])
        tutorial_dummy['tutorial_author'] = text[1]
        tutorial_info_list.append(copy.copy(tutorial_dummy))
    return tutorial_info_list
def extract_tutorials(tutorials_url=None, schedule_url=None):
    """
    Extracts the basic information available for tutorials from the tutorial
    site of the conference and, if the interactive schedule of the conference
    is specified, extracts and merges the schedule data for each tutorial.
    One of the two urls must be provided. If only one is specified, the
    crawler tries to extract as much information as it can from that site.
    It is recommended to specify both urls: the crawler then extracts all
    available data starting with the tutorials_url and afterwards merges it
    with the data crawled from the schedule url.

    :param tutorials_url: the url where the tutorials are listed (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/tutorials/ )
    :param schedule_url: the url of the interactive schedule if available (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/ischedule/ )
    :return: list of dictionaries with a tutorial represented as one dictionary.
    """
    logger.info('Start crawling TUTORIALS...')
    tutorials = []
    # These lookups are needed to merge already extracted tutorials with the
    # schedule data, by title and by author set respectively.
    tutorial_reference = {}
    author_reference = []

    if tutorials_url is not None:
        logger.info('Crawling data from: %s', tutorials_url)
        # extract information from the tutorial site
        try:
            page = request.urlopen(tutorials_url)
        except Exception:
            logger.warning("URL could not be crawled!")
            return tutorials
        soup = BeautifulSoup(page, 'html.parser').find(
            "section", {"class": "page__content"})
        reference_counter = 0
        # tutorials can either be in <h2> or <h3> tags
        for item in soup.find_all(['h2', 'h3'], id=re.compile(r"^t\d+")):
            tutorial = {
                attribute: None
                for attribute in [
                    "tutorial_name", "tutorial_author", "tutorial_abstract",
                    "tutorial_time", "tutorial_location", "tutorial_link"
                ]
            }
            tutorial["tutorial_name"] = pretty_title(item.text)
            tutorial["tutorial_author"] = pretty_organizers(
                item.findNext("p").text)
            next_node = item.findNext("p")
            tagbreak = item.name
            # Collect the abstract from all sibling tags up to the next tutorial heading.
            abstract = ""
            for tag in next_node.next_siblings:
                if tag.name == tagbreak:
                    break
                elif tag.name in ["p", "div", "ul"]:
                    abstract += tag.text
            tutorial["tutorial_abstract"] = basic_string_clean(abstract)
            tutorials.append(tutorial)
            tutorial_reference[clean_title(item.text)] = reference_counter
            author_reference.append(set(clean_authors(item.findNext("p").text)))
            reference_counter += 1

    # gather and merge with the information available in the interactive schedule
    if schedule_url is None:
        logger.info('Crawling DONE: no schedule url specified')
        return tutorials
    logger.info('Crawling data from: %s', schedule_url)
    try:
        page = request.urlopen(schedule_url)
    except Exception:
        logger.warning("URL could not be crawled!")
        return tutorials
    tutorial_sessions = BeautifulSoup(page, 'html.parser')\
        .findAll("div", {"class": "session session-expandable session-tutorials"})
    for session in tutorial_sessions:
        time = session.find("span", {"class": "session-time"})
        datetime = ""
        if time is not None:
            datetime = time["title"] + ", " + time.text
        for child in session.findChildren(class_="tutorial-title"):
            title_parent = child.find("strong")
            if title_parent is None:
                continue
            title = title_parent.text
            authors = ""
            authors_parent = title_parent.next_sibling
            if authors_parent is not None:
                authors = str(authors_parent)
            search_title = clean_title(title)
            search_authors = clean_authors(authors)
            # if the tutorial already exists, merge the schedule data into it
            if (search_title in tutorial_reference) or (search_authors in author_reference):
                logger.debug('Merge existing tutorial: **%s', title)
                try:
                    existing_index = tutorial_reference[search_title]
                except KeyError:
                    existing_index = author_reference.index(search_authors)
                location_parent = child.findNext(class_="btn")
                location = None
                if location_parent is not None:
                    location = location_parent.text
                if tutorials[existing_index]["tutorial_time"] is None:
                    tutorials[existing_index]["tutorial_time"] = datetime
                    tutorials[existing_index]["tutorial_location"] = location
                else:
                    cur_datetime = (tutorials[existing_index]["tutorial_time"]
                                    + ", " + time.text)
                    tutorials[existing_index]["tutorial_time"] = cur_datetime
            # if the tutorial does not exist, add it to tutorials
            else:
                logger.debug('Tutorial does not exist already: Create new: *%s', title)
                tutorial = {
                    attribute: None
                    for attribute in [
                        "tutorial_name", "tutorial_author", "tutorial_abstract",
                        "tutorial_time", "tutorial_location", "tutorial_link"
                    ]
                }
                tutorial["tutorial_name"] = pretty_title(title)
                tutorial["tutorial_author"] = pretty_organizers(authors)
                location_parent = child.findNext(class_="btn")
                if location_parent is not None:
                    tutorial["tutorial_location"] = location_parent.text
                tutorial["tutorial_time"] = datetime
                tutorials.append(tutorial)
    logger.info('Crawling TUTORIALS DONE')
    return tutorials
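# A hedged usage sketch for extract_tutorials (illustrative helper, not part of
# the crawler): both example URLs come from the docstring above, so the schedule
# data is merged into the tutorial-site data. Requires network access.
def _demo_extract_tutorials():
    tutorials = extract_tutorials(
        tutorials_url="https://www.emnlp-ijcnlp2019.org/program/tutorials/",
        schedule_url="https://www.emnlp-ijcnlp2019.org/program/ischedule/")
    for tutorial in tutorials:
        print(tutorial["tutorial_name"], "-", tutorial["tutorial_time"])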
def extract_keynotes(keynotes_url=None, schedule_url=None):
    """
    Extracts the basic information available for keynotes from the keynote
    site of the conference and, if the interactive schedule of the conference
    is specified, extracts and merges the schedule data for each keynote.
    One of the two urls must be provided. If only one is specified, the
    crawler tries to extract as much information as it can from that site.
    It is recommended to specify both urls: the crawler then extracts all
    available data starting with the keynotes_url and afterwards merges it
    with the data crawled from the schedule url.

    :param keynotes_url: the url where the keynotes are listed (default None)
        (for example https://naacl2019.org/program/keynotes/ )
    :param schedule_url: the url of the interactive schedule if available (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/ischedule/ )
    :return: list of dictionaries with a keynote represented as one dictionary.
    """
    logger.info('Start crawling KEYNOTES...')
    keynotes = []
    # This lookup is needed to merge already extracted keynotes with the schedule data.
    keynote_reference = {}

    # extract information from the keynote site
    if keynotes_url is not None:
        logger.info('Crawling data from: %s', keynotes_url)
        try:
            page = request.urlopen(keynotes_url)
        except Exception:
            logger.warning("URL could not be crawled!")
            return keynotes
        soup = BeautifulSoup(page, 'html.parser').find(
            "section", {"class": "page__content"})
        reference_counter = 0
        items = soup.find_all(class_="archive__item-body")
        for item in items:
            keynote = {
                attribute: None
                for attribute in [
                    "keynote_title", "keynote_speaker", "keynote_abstract",
                    "keynote_time", "keynote_location", "keynote_link",
                    "keynote_speaker_bio"
                ]
            }
            # Title and abstract follow <strong> labels such as "Title:" and "Abstract:".
            title_parent = item.find(class_="archive__item-excerpt").find(
                "strong", text=re.compile(r"^\s?[Tt]itle\s?:\s?"))
            abstract_parent = item.find(class_="archive__item-excerpt").find(
                "strong", text=re.compile(r"^\s?[Aa]bstract\s?:\s?"))
            author_bio = item.find(class_="archive__item-small-excerpt")
            if title_parent is not None:
                keynote["keynote_title"] = basic_string_clean(
                    str(title_parent.next_sibling))
            if abstract_parent is not None:
                keynote["keynote_abstract"] = basic_string_clean(
                    str(abstract_parent.next_sibling))
            if author_bio is not None:
                keynote["keynote_speaker_bio"] = basic_string_clean(
                    author_bio.text)
            keynotes.append(keynote)
            if keynote["keynote_title"] is not None:
                keynote_reference[clean_title(
                    keynote["keynote_title"])] = reference_counter
            reference_counter += 1

    if schedule_url is None:
        logger.info('Crawling DONE: no schedule url specified')
        return keynotes

    # extract information from the interactive schedule
    logger.info('Crawling data from: %s', schedule_url)
    try:
        page = request.urlopen(schedule_url)
    except Exception:
        logger.warning("URL could not be crawled!")
        return keynotes
    plenary_sessions = BeautifulSoup(page, 'html.parser')\
        .findAll("div", {"class": "session session-expandable session-plenary"})
    for session in plenary_sessions:
        title = session.findNext("a", {"class": "session-title"}).text
        if title.startswith("Keynote"):
            search_title = clean_title(title)
            # if the keynote already exists, merge the schedule results into it
            if search_title in keynote_reference:
                logger.debug('Merge existing keynote: **%s', title)
                existing_index = keynote_reference[search_title]
                authors = session.find("span", {"class": "session-people"})
                if authors is None:
                    authors = session.find("span", {"class": "session-person"})
                if authors is not None:
                    keynotes[existing_index]["keynote_speaker"] = authors.text
                time = session.find("span", {"class": "session-time"})
                keynotes[existing_index][
                    "keynote_time"] = time["title"] + ", " + time.text
                keynotes[existing_index]["keynote_location"] = session.findNext(
                    "span", {"class": "btn"}).text
            # if the keynote does not exist, add a new keynote
            else:
                logger.debug('Keynote does not exist already: Create new: *%s', title)
                keynote = {
                    attribute: None
                    for attribute in [
                        "keynote_title", "keynote_speaker", "keynote_abstract",
                        "keynote_time", "keynote_location", "keynote_link",
                        "keynote_speaker_bio"
                    ]
                }
                if title:
                    keynote["keynote_title"] = pretty_title(title)
                authors = session.find("span", {"class": "session-people"})
                if authors is None:
                    authors = session.find("span", {"class": "session-person"})
                if authors is not None:
                    keynote["keynote_speaker"] = authors.text
                time_parent = session.find("span", {"class": "session-time"})
                if time_parent is not None:
                    keynote["keynote_time"] = time_parent[
                        "title"] + ", " + time_parent.text
                location_parent = session.findNext("span", {"class": "btn"})
                if location_parent is not None:
                    keynote["keynote_location"] = location_parent.text
                abstract_parent = session.findNext(
                    "div", {"class": "session-abstract"})
                if abstract_parent is not None:
                    keynote["keynote_abstract"] = basic_string_clean(
                        abstract_parent.text)
                keynotes.append(keynote)
    logger.info('Crawling KEYNOTES DONE')
    return keynotes
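# A hedged usage sketch for extract_keynotes (illustrative helper, not part of
# the crawler): the example URLs are the ones listed in the docstring above; in
# practice both should point to the same conference so the merge step can match
# titles. Requires network access.
def _demo_extract_keynotes():
    keynotes = extract_keynotes(
        keynotes_url="https://naacl2019.org/program/keynotes/",
        schedule_url="https://www.emnlp-ijcnlp2019.org/program/ischedule/")
    for keynote in keynotes:
        print(keynote["keynote_title"], "-", keynote["keynote_speaker"])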