def extract_organizers_info(url):
    """Extracts organizer groups (members, position, institution) from the given page."""
    organizers_dummy = {
        attribute: None
        for attribute in ["members", "position", "institution"]
    }
    organizers_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return organizers_info_list
    organizers = BeautifulSoup(page, 'html.parser').find(
        "section", {"class": "content"})
    for child in organizers.findChildren('p'):
        counter = 0
        for tag in child.contents:
            # Both tag children and bare strings can carry organizer information;
            # normalise them to plain text and skip fragments without word characters.
            if isinstance(tag, element.Tag):
                text = tag.text
                if re.sub(r"[\W]", "", text) == "":
                    continue
            elif isinstance(tag, element.NavigableString):
                text = re.sub(r"[^\w\s]", "", tag).strip()
                if text == "":
                    continue
            else:
                continue
            authors_str = util.basic_string_clean(text)
            authors_list = [
                author.strip()
                for author in re.split(r",|\sand\s", authors_str)
            ]
            attribute = fill_dummy(counter)
            if attribute == "members":
                organizers_dummy[attribute] = authors_list
            else:
                organizers_dummy[attribute] = util.basic_string_clean(text)
            counter += 1
        # A complete organizer group needs at least members and an institution.
        if (counter > 1 and organizers_dummy["members"] is not None
                and organizers_dummy["institution"] is not None):
            organizers_info_list.append(copy.copy(organizers_dummy))
            organizers_dummy["members"] = None
            organizers_dummy["institution"] = None
        organizers_dummy["members"] = util.basic_string_clean(child.text.strip())
    return organizers_info_list
def extract_workshops(url):
    """
    Extracts all information available for workshops provided at
    https://coling2020.org/pages/workshops

    :param url: the url where the workshops are listed
    :return: list of dictionaries with a workshop represented as one dictionary.
    """
    workshops = []
    try:
        page = request.urlopen(url)
    except Exception:
        print("Could not connect to url.")
        return workshops
    soup = BeautifulSoup(page, 'html.parser').find("section",
                                                   {"id": "main_content"})
    # Each <h3> names a workshop day; the <ul> that follows lists the workshops
    # held on that day.
    for child in soup.findChildren('h3'):
        for i in child.findNext('ul').find_all('li'):
            workshop = {
                attribute: None
                for attribute in [
                    "workshop_name", "workshop_organizer",
                    "workshop_description", "workshop_day",
                    "workshop_location", "workshop_link"
                ]
            }
            workshop['workshop_day'] = child.text
            workshop['workshop_name'] = util.basic_string_clean(
                i.find('a').text)
            workshop['workshop_link'] = i.find('a')['href']
            workshops.append(copy.copy(workshop))
    return workshops
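# A hedged usage sketch for extract_workshops (not part of the crawler itself):
# the helper name `_demo_extract_workshops` is illustrative, the URL is the one
# named in the docstring above, and running it requires network access.
def _demo_extract_workshops():
    for workshop in extract_workshops("https://coling2020.org/pages/workshops"):
        print(workshop["workshop_day"], "|", workshop["workshop_name"],
              "|", workshop["workshop_link"])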
def extract_paper_info(url):
    paper_dummy = {
        attribute: None
        for attribute in [
            "paper_title", "paper_authors", "paper_type", "paper_link",
            "paper_time", "paper_keywords"
        ]
    }
    paper_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return paper_info_list
    papers = BeautifulSoup(page, 'html.parser').findAll("p",
                                                        class_="paper-item")
    for paper in papers:
        # The first node of a paper item is the title; the third holds the author string.
        paper_dummy["paper_title"] = util.basic_string_clean(paper.contents[0])
        authors_str = paper.contents[2].text
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        paper_dummy["paper_authors"] = authors_list
        paper_info_list.append(copy.copy(paper_dummy))
    return paper_info_list
def extract_tutorial_info(url):
    tutorial_dummy = {
        attribute: None
        for attribute in [
            "tutorial_name", "tutorial_author", "tutorial_abstract",
            "tutorial_time", "tutorial_location", "tutorial_link"
        ]
    }
    tutorial_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return tutorial_info_list
    datetimes = get_timestamps(url)
    tutorials = BeautifulSoup(page, 'html.parser').findAll(
        "h3", {"class": "tutorials-anchor"})
    for tutorial in tutorials:
        tutorial_dummy["tutorial_name"] = util.basic_string_clean(
            tutorial.text)
        # Pick the first or second extracted time slot depending on the hour
        # that leads the anchor text.
        if tutorial.text[:2] < datetimes[1][1][:2]:
            tutorial_dummy["tutorial_time"] = datetimes[0][0]
        else:
            tutorial_dummy["tutorial_time"] = datetimes[1][0]
        next_node = tutorial.findNext("div")
        authors_str = tutorial.findNext("p", {"class": "tutorials-tutors"}).text
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        tutorial_dummy["tutorial_author"] = authors_list
        tutorial_dummy["tutorial_location"] = tutorial.findNext(
            "p", {"class": "tutorials-room"}).text.split('.')[1].strip()
        tutorial_dummy["tutorial_link"] = tutorial.findNext(
            "a", {"class": "tutorials-materials"})["href"]
        # The abstract is spread over the unclassed <p> tags of the following <div>.
        abstract_p = next_node.findAll("p", class_=None)
        abstract = "".join(p.text for p in abstract_p)
        tutorial_dummy["tutorial_abstract"] = abstract
        tutorial_info_list.append(copy.copy(tutorial_dummy))
    return tutorial_info_list
def extract_workshop_info(url):
    workshop_dummy = {
        attribute: None
        for attribute in [
            "workshop_name", "workshop_organizer", "workshop_description",
            "workshop_day", "workshop_location", "workshop_link"
        ]
    }
    workshop_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return workshop_info_list
    datetimes = get_timestamps(url)
    current_date = datetimes[0][0]
    workshop_div = BeautifulSoup(page, 'html.parser').findAll(
        "div", class_="workshops")[0]
    workshops = workshop_div.findAll("h3")
    for workshop in workshops:
        workshop_dummy["workshop_name"] = util.basic_string_clean(
            workshop.text)
        # Resolve the day for this workshop from the extracted timestamps.
        current_date = get_timestamp_for_event(datetimes,
                                               (current_date, workshop.text))
        workshop_dummy["workshop_day"] = current_date
        workshop_dummy["workshop_link"] = workshop.contents[0]["href"]
        authors_str = util.basic_string_clean(
            workshop.findNext("p", {"class": "tutorials-tutors"}).text)
        authors_list = [
            author.strip() for author in re.split(r",|\sand\s", authors_str)
        ]
        workshop_dummy["workshop_organizer"] = authors_list
        workshop_dummy["workshop_location"] = workshop.findNext(
            "p", {"class": "tutorials-room"}).text.split('.')[1].strip()
        workshop_info_list.append(copy.copy(workshop_dummy))
    return workshop_info_list
def extract_tutorials_info(url):
    tutorial_dummy = {
        attribute: None
        for attribute in [
            "tutorial_name", "tutorial_author", "tutorial_abstract",
            "tutorial_time", "tutorial_location", "tutorial_link"
        ]
    }
    tutorial_info_list = []
    try:
        page = request.urlopen(url)
    except OSError:  # urlopen raises URLError, a subclass of OSError
        print("Could not connect to url.")
        return tutorial_info_list
    tutorials = BeautifulSoup(page, 'html.parser').find(
        "section", {"id": "main_content"})
    # Each <p> carries the tutorial name on its first line and the authors on the second.
    for child in tutorials.findChildren('p'):
        text = child.text.split('\n')
        tutorial_dummy['tutorial_name'] = util.basic_string_clean(text[0])
        tutorial_dummy['tutorial_author'] = text[1]
        tutorial_info_list.append(copy.copy(tutorial_dummy))
    return tutorial_info_list
def extract_tutorials(tutorials_url=None, schedule_url=None):
    """
    Extracts the basic information available for tutorials from the tutorial
    site of the conference and, if the interactive schedule of the conference
    is specified, extracts and merges the schedule data for each tutorial.
    One of the two urls must be provided. If only one is specified, the
    crawler tries to extract as much information as it can from that site.
    It is recommended to specify both urls: the crawler then extracts all
    available data starting with the tutorials_url and afterwards merges it
    with the data crawled from the schedule url.

    :param tutorials_url: the url where the tutorials are listed (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/tutorials/ )
    :param schedule_url: the url of the interactive schedule if available (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/ischedule/ )
    :return: list of dictionaries with a tutorial represented as one dictionary.
    """
    logger.info('Start crawling TUTORIALS...')
    tutorials = []
    # These lookups are needed to merge already extracted tutorials with the
    # schedule data, by title and by author set respectively.
    tutorial_reference = {}
    author_reference = []

    if tutorials_url is not None:
        logger.info('Crawling data from: %s', tutorials_url)
        # extract information from the tutorial site
        try:
            page = request.urlopen(tutorials_url)
        except Exception:
            logger.warning("URL could not be crawled!")
            return tutorials
        soup = BeautifulSoup(page, 'html.parser').find(
            "section", {"class": "page__content"})
        reference_counter = 0
        # tutorials can either be in <h2> or <h3> tags
        for item in soup.find_all(['h2', 'h3'], id=re.compile(r"^t\d+")):
            tutorial = {
                attribute: None
                for attribute in [
                    "tutorial_name", "tutorial_author", "tutorial_abstract",
                    "tutorial_time", "tutorial_location", "tutorial_link"
                ]
            }
            tutorial["tutorial_name"] = pretty_title(item.text)
            tutorial["tutorial_author"] = pretty_organizers(
                item.findNext("p").text)
            next_node = item.findNext("p")
            tagbreak = item.name
            # Collect the abstract from all sibling tags up to the next tutorial heading.
            abstract = ""
            for tag in next_node.next_siblings:
                if tag.name == tagbreak:
                    break
                elif tag.name in ["p", "div", "ul"]:
                    abstract += tag.text
            tutorial["tutorial_abstract"] = basic_string_clean(abstract)
            tutorials.append(tutorial)
            tutorial_reference[clean_title(item.text)] = reference_counter
            author_reference.append(set(clean_authors(item.findNext("p").text)))
            reference_counter += 1

    # gather and merge with the information available in the interactive schedule
    if schedule_url is None:
        logger.info('Crawling DONE: no schedule url specified')
        return tutorials
    logger.info('Crawling data from: %s', schedule_url)
    try:
        page = request.urlopen(schedule_url)
    except Exception:
        logger.warning("URL could not be crawled!")
        return tutorials
    tutorial_sessions = BeautifulSoup(page, 'html.parser')\
        .findAll("div", {"class": "session session-expandable session-tutorials"})
    for session in tutorial_sessions:
        time = session.find("span", {"class": "session-time"})
        datetime = ""
        if time is not None:
            datetime = time["title"] + ", " + time.text
        for child in session.findChildren(class_="tutorial-title"):
            title_parent = child.find("strong")
            if title_parent is None:
                continue
            title = title_parent.text
            authors = ""
            authors_parent = title_parent.next_sibling
            if authors_parent is not None:
                authors = str(authors_parent)
            search_title = clean_title(title)
            search_authors = clean_authors(authors)
            # if the tutorial already exists, merge the schedule data into it
            if (search_title in tutorial_reference) or (search_authors in author_reference):
                logger.debug('Merge existing tutorial: **%s', title)
                try:
                    existing_index = tutorial_reference[search_title]
                except KeyError:
                    existing_index = author_reference.index(search_authors)
                location_parent = child.findNext(class_="btn")
                location = None
                if location_parent is not None:
                    location = location_parent.text
                if tutorials[existing_index]["tutorial_time"] is None:
                    tutorials[existing_index]["tutorial_time"] = datetime
                    tutorials[existing_index]["tutorial_location"] = location
                else:
                    cur_datetime = (tutorials[existing_index]["tutorial_time"]
                                    + ", " + time.text)
                    tutorials[existing_index]["tutorial_time"] = cur_datetime
            # if the tutorial does not exist, add it to tutorials
            else:
                logger.debug('Tutorial does not exist already: Create new: *%s', title)
                tutorial = {
                    attribute: None
                    for attribute in [
                        "tutorial_name", "tutorial_author", "tutorial_abstract",
                        "tutorial_time", "tutorial_location", "tutorial_link"
                    ]
                }
                tutorial["tutorial_name"] = pretty_title(title)
                tutorial["tutorial_author"] = pretty_organizers(authors)
                location_parent = child.findNext(class_="btn")
                if location_parent is not None:
                    tutorial["tutorial_location"] = location_parent.text
                tutorial["tutorial_time"] = datetime
                tutorials.append(tutorial)
    logger.info('Crawling TUTORIALS DONE')
    return tutorials
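# A hedged usage sketch for extract_tutorials (illustrative helper, not part of
# the crawler): both example URLs come from the docstring above, so the schedule
# data is merged into the tutorial-site data. Requires network access.
def _demo_extract_tutorials():
    tutorials = extract_tutorials(
        tutorials_url="https://www.emnlp-ijcnlp2019.org/program/tutorials/",
        schedule_url="https://www.emnlp-ijcnlp2019.org/program/ischedule/")
    for tutorial in tutorials:
        print(tutorial["tutorial_name"], "-", tutorial["tutorial_time"])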
def extract_keynotes(keynotes_url=None, schedule_url=None):
    """
    Extracts the basic information available for keynotes from the keynote
    site of the conference and, if the interactive schedule of the conference
    is specified, extracts and merges the schedule data for each keynote.
    One of the two urls must be provided. If only one is specified, the
    crawler tries to extract as much information as it can from that site.
    It is recommended to specify both urls: the crawler then extracts all
    available data starting with the keynotes_url and afterwards merges it
    with the data crawled from the schedule url.

    :param keynotes_url: the url where the keynotes are listed (default None)
        (for example https://naacl2019.org/program/keynotes/ )
    :param schedule_url: the url of the interactive schedule if available (default None)
        (for example https://www.emnlp-ijcnlp2019.org/program/ischedule/ )
    :return: list of dictionaries with a keynote represented as one dictionary.
    """
    logger.info('Start crawling KEYNOTES...')
    keynotes = []
    # This lookup is needed to merge already extracted keynotes with the schedule data.
    keynote_reference = {}

    # extract information from the keynote site
    if keynotes_url is not None:
        logger.info('Crawling data from: %s', keynotes_url)
        try:
            page = request.urlopen(keynotes_url)
        except Exception:
            logger.warning("URL could not be crawled!")
            return keynotes
        soup = BeautifulSoup(page, 'html.parser').find(
            "section", {"class": "page__content"})
        reference_counter = 0
        items = soup.find_all(class_="archive__item-body")
        for item in items:
            keynote = {
                attribute: None
                for attribute in [
                    "keynote_title", "keynote_speaker", "keynote_abstract",
                    "keynote_time", "keynote_location", "keynote_link",
                    "keynote_speaker_bio"
                ]
            }
            # Title and abstract follow <strong> labels such as "Title:" and "Abstract:".
            title_parent = item.find(class_="archive__item-excerpt").find(
                "strong", text=re.compile(r"^\s?[Tt]itle\s?:\s?"))
            abstract_parent = item.find(class_="archive__item-excerpt").find(
                "strong", text=re.compile(r"^\s?[Aa]bstract\s?:\s?"))
            author_bio = item.find(class_="archive__item-small-excerpt")
            if title_parent is not None:
                keynote["keynote_title"] = basic_string_clean(
                    str(title_parent.next_sibling))
            if abstract_parent is not None:
                keynote["keynote_abstract"] = basic_string_clean(
                    str(abstract_parent.next_sibling))
            if author_bio is not None:
                keynote["keynote_speaker_bio"] = basic_string_clean(
                    author_bio.text)
            keynotes.append(keynote)
            if keynote["keynote_title"] is not None:
                keynote_reference[clean_title(
                    keynote["keynote_title"])] = reference_counter
            reference_counter += 1

    if schedule_url is None:
        logger.info('Crawling DONE: no schedule url specified')
        return keynotes

    # extract information from the interactive schedule
    logger.info('Crawling data from: %s', schedule_url)
    try:
        page = request.urlopen(schedule_url)
    except Exception:
        logger.warning("URL could not be crawled!")
        return keynotes
    plenary_sessions = BeautifulSoup(page, 'html.parser')\
        .findAll("div", {"class": "session session-expandable session-plenary"})
    for session in plenary_sessions:
        title = session.findNext("a", {"class": "session-title"}).text
        if title.startswith("Keynote"):
            search_title = clean_title(title)
            # if the keynote already exists, merge the schedule results into it
            if search_title in keynote_reference:
                logger.debug('Merge existing keynote: **%s', title)
                existing_index = keynote_reference[search_title]
                authors = session.find("span", {"class": "session-people"})
                if authors is None:
                    authors = session.find("span", {"class": "session-person"})
                if authors is not None:
                    keynotes[existing_index]["keynote_speaker"] = authors.text
                time = session.find("span", {"class": "session-time"})
                keynotes[existing_index][
                    "keynote_time"] = time["title"] + ", " + time.text
                keynotes[existing_index]["keynote_location"] = session.findNext(
                    "span", {"class": "btn"}).text
            # if the keynote does not exist, add a new keynote
            else:
                logger.debug('Keynote does not exist already: Create new: *%s', title)
                keynote = {
                    attribute: None
                    for attribute in [
                        "keynote_title", "keynote_speaker", "keynote_abstract",
                        "keynote_time", "keynote_location", "keynote_link",
                        "keynote_speaker_bio"
                    ]
                }
                if title:
                    keynote["keynote_title"] = pretty_title(title)
                authors = session.find("span", {"class": "session-people"})
                if authors is None:
                    authors = session.find("span", {"class": "session-person"})
                if authors is not None:
                    keynote["keynote_speaker"] = authors.text
                time_parent = session.find("span", {"class": "session-time"})
                if time_parent is not None:
                    keynote["keynote_time"] = time_parent[
                        "title"] + ", " + time_parent.text
                location_parent = session.findNext("span", {"class": "btn"})
                if location_parent is not None:
                    keynote["keynote_location"] = location_parent.text
                abstract_parent = session.findNext(
                    "div", {"class": "session-abstract"})
                if abstract_parent is not None:
                    keynote["keynote_abstract"] = basic_string_clean(
                        abstract_parent.text)
                keynotes.append(keynote)
    logger.info('Crawling KEYNOTES DONE')
    return keynotes
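# A hedged usage sketch for extract_keynotes (illustrative helper, not part of
# the crawler): the example URLs are the ones listed in the docstring above; in
# practice both should point to the same conference so the merge step can match
# titles. Requires network access.
def _demo_extract_keynotes():
    keynotes = extract_keynotes(
        keynotes_url="https://naacl2019.org/program/keynotes/",
        schedule_url="https://www.emnlp-ijcnlp2019.org/program/ischedule/")
    for keynote in keynotes:
        print(keynote["keynote_title"], "-", keynote["keynote_speaker"])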