Example #1
def test_comb_terms():

    out = comb_terms(['one', 'two'], 'or')
    assert out == '("one"OR"two")'

    out = comb_terms(['one', 'two'], 'not')
    assert out == 'NOT"one"NOT"two"'
def test_comb_terms():
    """Test the comb_terms function."""

    out = comb_terms(['one', 'two'], 'or')
    assert out == '("one"OR"two")'

    out = comb_terms(['one', 'two'], 'not')
    assert out == 'NOT"one"NOT"two"'
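
# The two tests above pin down the output format expected from comb_terms.
# A minimal sketch that satisfies those assertions (hypothetical - the actual
# lisc implementation may differ in its details):
def comb_terms(terms, joiner):
    """Combine a list of terms into a single search-term string (sketch only).

    'or'  : wrap the quoted terms in parentheses, joined by OR.
    'not' : prefix each quoted term with NOT.
    """

    if joiner == 'or':
        return '("' + '"OR"'.join(terms) + '")'
    elif joiner == 'not':
        return ''.join('NOT"' + term + '"' for term in terms)
    else:
        raise ValueError("joiner must be 'or' or 'not'")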
def _mk(t_lst, cm=''):
    """Create search term component.

    Parameters
    ----------
    t_lst : list of str
        List of words to connect together.
    cm : str
        Connector word to append to front of search term.

    Returns
    -------
    str
        Search term.
    """

    if t_lst and t_lst[0]:
        return cm + comb_terms(t_lst, 'or')
    else:
        return ''
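
# Usage sketch for _mk (hypothetical inputs), following the comb_terms output
# format asserted in the tests above:
#   _mk(['alpha', 'beta'], 'AND')  ->  'AND("alpha"OR"beta")'
#   _mk([''])                      ->  ''   (an empty first term yields no component)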
Example #4
def _mk(t_lst, cm=''):
    """Create search term component.

    Parameters
    ----------
    t_lst : list of str
        List of words to connect together.
    cm : str
        Connector word to append to front of search term.

    Returns
    -------
    str
        Search term.
    """

    if t_lst and t_lst[0]:
        return cm + comb_terms(t_lst, 'or')
    else:
        return ''
def scrape_words(terms_lst, exclusions_lst=[], db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from scraping the data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the term given.
    It then loops through all of the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles,
    looping through each article tag in turn.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax, retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])
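    # The built URLs point at the NCBI e-utilities endpoints; with the settings
    # above, urls.search and urls.fetch are expected to resemble (illustrative,
    # not verified here):
    #   .../entrez/eutils/esearch.fcgi?db=pubmed&usehistory=n&retmode=xml&field=TIAB&term=
    #   .../entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml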

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')
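        # For example, with terms = ['alpha', 'alpha rhythm'] and exclusions
        # ['beta'] (hypothetical values), term_arg would be
        # '("alpha"OR"alpha rhythm")NOT"beta"', per the comb_terms output
        # format asserted in the tests above.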

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer papers are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)
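            # ids_str is expected to be a comma-separated string of PubMed IDs
            # (e.g. '28123456,27123456', illustrative values), which is the
            # format the e-utilities efetch 'id' parameter accepts.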

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
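
# Usage sketch (hypothetical search terms; requires network access and the
# Requester, URLS and Data objects that this module uses):
#   results, meta_dat = scrape_words([['autism'], ['adhd']],
#                                    exclusions_lst=[['gene'], []],
#                                    retmax=5, save_n_clear=False, verbose=True)
#   meta_dat['date']  # timestamp recorded at the start of the scrape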
Example #6
def scrape_words(terms_lst,
                 exclusions_lst=[],
                 db='pubmed',
                 retmax=None,
                 use_hist=False,
                 save_n_clear=True,
                 verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from scraping the data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the term given.
    It then loops through all of the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles,
    looping through each article tag in turn.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db,
                usehistory=hist_val,
                retmax=retmax,
                retmode='xml',
                field='TIAB',
                auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(
                exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text
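            # WebEnv and query_key identify the stored result set on the NCBI
            # history server; each fetch call below passes both back, together
            # with retstart/retmax, to page through the results.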

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer papers are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat