Example #1
def _process_authors(author_list):
    """Extract and process author data.

    Parameters
    ----------
    author_list : bs4.element.Tag
        AuthorList tag, which contains tags related to author data.

    Returns
    -------
    out : list of tuple of (str, str, str, str)
        List of authors, each as (LastName, FirstName, Initials, Affiliation).
    """

    # Pull out all author tags from the input
    authors = extract(author_list, 'Author', 'all')

    # Initialize list to return
    out = []

    # Extract data for each author
    for author in authors:
        out.append((extract(author, 'LastName', 'str'),
                    extract(author, 'ForeName', 'str'),
                    extract(author, 'Initials', 'str'),
                    extract(author, 'Affiliation', 'str')))

    return out
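
A minimal usage sketch for the function above, assuming the project's extract helper is in scope (a possible implementation is sketched at the end of Example #2). The XML string is a hypothetical fragment that loosely mimics PubMed's AuthorList structure:

from bs4 import BeautifulSoup

# Hypothetical XML, loosely mimicking a PubMed AuthorList record
xml = ('<AuthorList><Author>'
       '<LastName>Smith</LastName><ForeName>Jane</ForeName>'
       '<Initials>J</Initials><Affiliation>Example University</Affiliation>'
       '</Author></AuthorList>')

author_list = BeautifulSoup(xml, 'xml').find('AuthorList')
print(_process_authors(author_list))
# [('Smith', 'Jane', 'J', 'Example University')]
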
Example #2
def test_extract():
    """Test the extract function."""

    # Create a complex tag
    out = bs4.element.Tag(name='Out')
    inn1 = bs4.element.Tag(name='Inn')
    inn2 = bs4.element.Tag(name='Inn')

    inn1.append('words words')
    inn2.append('more words')

    out.append(inn1)
    out.append(inn2)

    # Test error - bad how
    with raises(ValueError):
        extract(out, 'Inn', 'bad')

    # Test how = 'raw'
    out_raw = extract(out, 'Inn', 'raw')
    assert type(out_raw) is bs4.element.Tag

    # Test how = 'str'
    out_str = extract(out, 'Inn', 'str')
    # TODO: figure out what the expected return type is here
    #assert isinstance(out_str, str)
    #assert out_str == 'words words'

    # Test how = 'all'
    out_all = extract(out, 'Inn', 'all')
    assert type(out_all) is bs4.element.ResultSet

    # Test with non-existent tag name
    out_none = extract(out, 'bad', 'raw')
    assert out_none is None
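
The assertions above pin down a contract for extract: 'raw' returns the first matching tag, 'all' returns every match, 'str' returns the matched tag's text, an unknown mode raises a ValueError, and a missing tag yields None. The following is a minimal sketch consistent with that contract, inferred from the test rather than taken from the project's actual implementation:

def extract(dat, tag, how):
    """Extract data from a bs4 tag - a sketch inferred from test_extract."""

    # Reject unknown extraction modes
    if how not in ('raw', 'str', 'all'):
        raise ValueError('Value for how is not understood.')

    # 'all': return every matching tag, as a bs4 ResultSet
    if how == 'all':
        return dat.find_all(tag)

    # 'raw' / 'str': operate on the first matching tag, or None if absent
    found = dat.find(tag)
    if found is None:
        return None

    return found if how == 'raw' else found.text
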
Example #3
def _get_count(req, url):
    """Get the count of how many articles listed on search results URL.

    Parameters
    ----------
    url : str
        URL to search with.

    Returns
    -------
    count : int
        Count of the number of articles found.
    """

    # Request page from URL
    page = req.get_url(url)
    page_soup = BeautifulSoup(page.content, 'lxml')

    # Get all count tags
    counts = extract(page_soup, 'count', 'all')

    try:
        count = int(counts[0].text)
    except IndexError:
        count = 0

    return count
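
A usage sketch with a stand-in requester. The project's Requester() object is assumed to expose a compatible get_url method, and the URL is a standard EUtils ESearch query, whose response includes a Count tag:

import requests

class StubRequester:
    """Stand-in for the project's Requester() object."""

    def get_url(self, url):
        return requests.get(url)

url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
       'esearch.fcgi?db=pubmed&term=erp')
print(_get_count(StubRequester(), url))
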
Example #4
def _scrape_papers(req, art_url, cur_dat):
    """Scrape information for each article found for a given term.

    Parameters
    ----------
    req : Requester() object
        Manages requests.
    art_url : str
        URL for the articles to be scraped.
    cur_dat : Data() object
        Object to store information for the current term.

    Returns
    -------
    cur_dat : Data() object
        Object to store information for the current term.
    """

    # Get page of all articles
    art_page = req.get_url(art_url)
    art_page_soup = BeautifulSoup(art_page.content, "xml")

    # Pull out articles
    articles = art_page_soup.findAll('PubmedArticle')

    # Loop through each article, extracting relevant information
    for art in articles:

        # Get ID of current article
        new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

        # Extract and add all relevant info from the current article to the Data object
        cur_dat = _extract_add_info(cur_dat, new_id, art)

    return cur_dat
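
Calling this function end-to-end requires the project's Requester() and Data() objects, but the fetch-and-parse step it is built on can be sketched independently. The EFetch URL below is a standard EUtils query, with an arbitrary placeholder article ID:

import requests
from bs4 import BeautifulSoup

art_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
           'efetch.fcgi?db=pubmed&id=28000000&retmode=xml')
art_page = requests.get(art_url)
art_page_soup = BeautifulSoup(art_page.content, 'xml')

# Each PubmedArticle tag holds the records for one article
print(len(art_page_soup.findAll('PubmedArticle')))
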
Example #5
def _get_db_info(req, info_url):
    """Call EInfo to get the info and status of the database to be used for scraping.

    Parameters
    ----------
    req : Requester() object
        Manages requests.
    info_url : str
        URL to request database information from.

    Returns
    -------
    db_info : dict
        Database information.
    """

    # Get the info page and parse with BeautifulSoup
    info_page = req.get_url(info_url)
    info_page_soup = BeautifulSoup(info_page.content, 'lxml')

    # Set list of fields to extract from eInfo
    fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

    # Extract basic information into a dictionary
    db_info = dict()
    for field in fields:
        db_info[field] = extract(info_page_soup, field, 'str')

    return db_info
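
A usage sketch, reusing the stand-in requester from Example #3. The EInfo URL is a standard EUtils query, and its response carries the dbname, count, lastupdate, and related fields listed above:

info_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
            'einfo.fcgi?db=pubmed')
db_info = _get_db_info(StubRequester(), info_url)
print(db_info['dbname'])    # 'pubmed'
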
Example #6
def _process_pub_date(pub_date):
    """Extract and process publication date data.

    Parameters
    ----------
    pub_date : bs4.element.Tag
        PubDate tag, which contains tags with publication date information.

    Returns
    -------
    year : int or None
        Year the article was published.
    month : str or None
        Month the article was published.
    """

    # Extract year, convert to int if not None
    year = extract(pub_date, 'Year', 'str')
    year = int(year) if year else year

    # Extract month
    month = extract(pub_date, 'Month', 'str')

    return year, month
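
A minimal usage sketch, again assuming extract is in scope. The XML mimics a PubMed PubDate tag:

from bs4 import BeautifulSoup

xml = '<PubDate><Year>2016</Year><Month>Oct</Month></PubDate>'
pub_date = BeautifulSoup(xml, 'xml').find('PubDate')
print(_process_pub_date(pub_date))
# (2016, 'Oct')
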
Example #7
def _extract_add_info(cur_dat, new_id, art):
    """Extract information from article web page and add to

    Parameters
    ----------
    cur_dat : Data() object
        Object to store information for the current term.
    new_id : int
        Paper ID of the new paper.
    art : bs4.element.Tag() object
        Extracted pubmed article.

    Returns
    -------
    cur_dat : Data() object
        Object to store data from the current term.

    NOTES
    -----
    Data extraction is all in try/except statements in order to
        deal with missing data, since fields may be missing.
    """

    # Add the ID of the current article, then extract and add each relevant field
    cur_dat.add_id(new_id)
    cur_dat.add_title(extract(art, 'ArticleTitle', 'str'))
    cur_dat.add_authors(_process_authors(extract(art, 'AuthorList', 'raw')))
    cur_dat.add_journal(extract(art, 'Title', 'str'),
                        extract(art, 'ISOAbbreviation', 'str'))
    cur_dat.add_words(_process_words(extract(art, 'AbstractText', 'str')))
    cur_dat.add_kws(_process_kws(extract(art, 'Keyword', 'all')))
    cur_dat.add_pub_date(_process_pub_date(extract(art, 'PubDate', 'raw')))
    cur_dat.add_doi(_process_ids(extract(art, 'ArticleId', 'all'), 'doi'))

    # Increment number of articles included in Data
    cur_dat.increment_n_articles()

    return cur_dat
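
Running this function requires the project's Data() object and the other _process_* helpers, but the underlying field extraction can be sketched on its own. The XML is a hypothetical fragment of a PubmedArticle record, and extract is assumed in scope:

from bs4 import BeautifulSoup

xml = ('<PubmedArticle><Article>'
       '<ArticleTitle>An example title</ArticleTitle>'
       '<Journal><Title>An Example Journal</Title>'
       '<ISOAbbreviation>Ex. J.</ISOAbbreviation></Journal>'
       '</Article></PubmedArticle>')

art = BeautifulSoup(xml, 'xml').find('PubmedArticle')
print(extract(art, 'ArticleTitle', 'str'))       # 'An example title'
print(extract(art, 'ISOAbbreviation', 'str'))    # 'Ex. J.'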