Example #1
0
def test_extract_add_info():
    """Tset the extract_add_info method."""

    words = Words()

    # Check page with all fields defined - check data extraction
    erp_word = Data('test')
    page = requests.get(("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
                         "efetch.fcgi?&db=pubmed&retmode=xml&id=28000963"))
    page_soup = BeautifulSoup(page.content, "xml")
    art = page_soup.findAll('PubmedArticle')[0]
    words.extract_add_info(erp_word, 111111, art)

    assert erp_word.ids[0] == 111111
    assert erp_word.titles[0] == ("A Neurocomputational Model of the N400"
                                  " and the P600 in Language Processing.")
    assert erp_word.words[0][0] == "ten"
    assert erp_word.kws[0][0] == "computational modeling"
    assert erp_word.years[0] == 2017
    assert erp_word.months[0] == 'May'
    assert erp_word.dois[0] == '10.1111/cogs.12461'

    # Check page with all fields missing - check error handling
    page = requests.get('http://www.google.com')
    erp_word = words.extract_add_info(erp_word, 999999, page)

    assert erp_word.ids[1] == 999999
    assert erp_word.titles[1] is None
    assert erp_word.words[1] is None
    assert erp_word.kws[1] is None
    assert erp_word.years[1] is None
    assert erp_word.months[1] is None
    assert erp_word.dois[1] is None
Example #2
0
def test_add_results():

    words = Words()

    words.add_results(Data(['test']))

    assert words.results
Example #3
0
def test_add_results():
    """Test the add_results method."""

    words = Words()

    words.add_results(Data(['test']))

    assert words.results
Example #4
0
def test_get_item():
    """Test the __getitem__ method."""

    words = Words()

    # Test error for empty object
    with raises(IndexError):
        words['not a thing']

    words.add_results(Data('test', ['test']))

    # Test error for wrong key
    with raises(IndexError):
        words['wrong']

    # Test properly extracting item
    assert words['test']
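
# A minimal usage sketch (not from the library) of the dict-style access tested above;
# it assumes the label passed as the first argument to Data() is the lookup key.
words = Words()
words.add_results(Data('N400', ['N400']))
assert words['N400']    # the stored Data object is returned for its label
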
def scrape_words(terms_lst, exclusions_lst=[], db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each search term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the given term.
    It then loops through all of the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles,
    looping through each article tag in turn.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax, retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for the current term's papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer papers are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
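
# A minimal usage sketch for scrape_words (not part of the library code above).
# The nested term-list format follows the docstring; the small retmax just keeps
# the example quick, and save_n_clear=False leaves the collected data on each object.
terms = [['N400'], ['P600']]
exclusions = [['protein'], []]

results, meta_dat = scrape_words(terms, exclusions_lst=exclusions, db='pubmed',
                                 retmax=5, save_n_clear=False, verbose=True)

for dat in results:
    # 'titles' is one of the per-article attributes shown in the tests above
    print(len(dat.titles), 'articles scraped')
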
def load_data(add_dat=False, n=1):
    """Helper function to load Data() object for testing."""

    dat = Data('test', ['test'])

    if add_dat:
        for i in range(n):
            dat.add_id(1)
            dat.add_title('title')
            dat.add_journal('science', 'sc')
            dat.add_authors([('A', 'B', 'C', 'D')])
            dat.add_words(['new', 'dat'])
            dat.add_kws(['lots', 'of', 'erps'])
            dat.add_pub_date((2112, 'Jan'))
            dat.add_doi('doi_str')
            dat.increment_n_articles()

    return dat
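
# A minimal test sketch (not from the library) for the load_data helper above; it
# assumes each add_* call appends one entry, as the indexing in the earlier tests implies.
def test_load_data():
    """Check that load_data fills the Data object with n entries."""

    dat = load_data(add_dat=True, n=2)

    assert len(dat.ids) == 2
    assert len(dat.titles) == 2
    assert dat.kws[0] == ['lots', 'of', 'erps']
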
Example #7
0
def scrape_words(terms_lst,
                 exclusions_lst=[],
                 db='pubmed',
                 retmax=None,
                 use_hist=False,
                 save_n_clear=True,
                 verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each search term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the given term.
    It then loops through all of the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles,
    looping through each article tag in turn.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db,
                usehistory=hist_val,
                retmax=retmax,
                retmode='xml',
                field='TIAB',
                auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for the current term's papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(
                exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer papers are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
def load_data(add_dat=False, n=1):
    """Helper function to load Data() object for testing."""

    dat = Data('test', ['test'])

    if add_dat:
        for i in range(n):
            dat.add_id(1)
            dat.add_title('title')
            dat.add_journal('science', 'sc')
            dat.add_authors([('A', 'B', 'C', 'D')])
            dat.add_words(['new', 'dat'])
            dat.add_kws(['lots', 'of', 'erps'])
            dat.add_pub_date((2112, 'Jan'))
            dat.add_doi('doi_str')
            dat.increment_n_articles()

    return dat