Пример #1
0
def collect_words(terms,
                  inclusions=None,
                  exclusions=None,
                  db='pubmed',
                  retmax=None,
                  field='TIAB',
                  usehistory=False,
                  api_key=None,
                  save_and_clear=False,
                  logging=None,
                  directory=None,
                  verbose=False):
    """Collect text data and metadata from EUtils using specified search term(s).

    Parameters
    ----------
    terms : list of list of str
        Search terms.
    inclusions : list of list of str, optional
        Inclusion words for search terms.
        If not provided, no inclusion words are used for any term.
    exclusions : list of list of str, optional
        Exclusion words for search terms.
        If not provided, no exclusion words are used for any term.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    retmax : int, optional
        Maximum number of articles to return.
        If None while using history, all articles found for a term are collected.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    usehistory : bool, optional, default: False
        Whether to use EUtils history, storing results on their server.
    api_key : str, optional
        An API key for a NCBI account.
    save_and_clear : bool, optional, default: False
        Whether to save words data to disk per term as it goes, instead of holding in memory.
    logging : {None, 'print', 'store', 'file'}
        What kind of logging, if any, to do for requested URLs.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.

    Returns
    -------
    results : list of Articles
        Results from collecting data for each term.
    meta_data : MetaData
        Meta data from the data collection.

    Notes
    -----
    The collection does an exact word search for the term given. It then loops through all
    the articles found for that term.

    For each article, it pulls and saves out data (including title, abstract, authors, etc),
    using the hierarchical tag structure that organizes the articles.

    When using history, articles are fetched in batches of at most 100 per request.
    """

    # Get EUtils URLS object, with desired settings, and build required utility URLs
    urls = EUtils(db=db,
                  usehistory='y' if usehistory else 'n',
                  retmax=retmax,
                  retmode='xml',
                  field=field,
                  api_key=api_key)
    urls.build_url('info', settings=['db'])
    urls.build_url('search',
                   settings=['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_url('fetch', settings=['db', 'retmode'])

    # Initialize results, meta data & requester
    results = []
    meta_data = MetaData()
    req = Requester(wait_time=get_wait_time(urls.authenticated),
                    logging=logging,
                    directory=directory)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Check inclusions & exclusions
    #   Defaults are built as independent lists, so that terms do not share one list object
    inclusions = inclusions if inclusions else [[] for _ in terms]
    exclusions = exclusions if exclusions else [[] for _ in terms]

    # Loop through all the terms
    for search, incl, excl in zip(terms, inclusions, exclusions):

        # Collect term information and make search term argument
        term = Term(search[0], search, incl, excl)
        term_arg = mk_term(term)

        if verbose:
            print('Collecting data for: ', term.label)

        # Initialize object to store data for current term articles
        arts = Articles(term)

        # Request web page
        url = urls.get_url('search', settings={'term': term_arg})
        page = req.request_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        if usehistory:

            # Get number of articles, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Set the total number of articles to collect for this term
            #   Without a retmax there is no cap, so collect everything that was found
            max_arts = count if retmax is None else int(retmax)

            # Loop through, collecting article data, using history
            ret_start_it = 0
            while ret_start_it < count:

                # Set the number of articles per iteration (the ret_max per call)
                #  This defaults to 100, but will set to less if fewer needed to reach the cap
                ret_end_it = min(100, max_arts - ret_start_it)

                # Get article page, collect data, update position
                url_settings = {
                    'WebEnv': web_env,
                    'query_key': query_key,
                    'retstart': str(ret_start_it),
                    'retmax': str(ret_end_it)
                }
                art_url = urls.get_url('fetch', settings=url_settings)
                arts = get_articles(req, art_url, arts)
                ret_start_it += ret_end_it

                # Stop once the requested maximum number of articles has been reached
                if ret_start_it >= max_arts:
                    break

        # Without using history
        else:

            ids = page_soup.find_all('id')
            art_url = urls.get_url('fetch', settings={'id': ids_to_str(ids)})
            arts = get_articles(req, art_url, arts)

        arts._check_results()

        if save_and_clear:
            arts.save_and_clear(directory=directory)
        results.append(arts)

    meta_data.add_requester(req)

    return results, meta_data
Пример #2
0
def collect_counts(terms_a,
                   inclusions_a=None,
                   exclusions_a=None,
                   labels_a=None,
                   terms_b=None,
                   inclusions_b=None,
                   exclusions_b=None,
                   labels_b=None,
                   db='pubmed',
                   field='TIAB',
                   api_key=None,
                   collect_coocs=True,
                   logging=None,
                   directory=None,
                   verbose=False,
                   **eutils_kwargs):
    """Collect count and term co-occurrence data from EUtils.

    Parameters
    ----------
    terms_a : list of list of str
        Search terms.
    inclusions_a : list of list of str, optional
        Inclusion words for search terms.
    exclusions_a : list of list of str, optional
        Exclusion words for search terms.
    labels_a : list of str, optional
        Labels for the search terms.
        If not provided, the first search word of each term is used.
    terms_b : list of list of str, optional
        Secondary list of search terms.
        If not provided, co-occurrences are collected within `terms_a`.
    inclusions_b : list of list of str, optional
        Inclusion words for the second list of search terms.
    exclusions_b : list of list of str, optional
        Exclusion words for the second list of search terms.
    labels_b : list of str, optional
        Labels for the second list of search terms.
        If not provided, the first search word of each term is used.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    api_key : str, optional
        An API key for a NCBI account.
    collect_coocs : bool, optional, default: True
        Whether to collect co-occurrence data.
        If False, only collects the counts for first term list.
    logging : {None, 'print', 'store', 'file'} or Requester, optional
        What kind of logging, if any, to do for requested URLs.
        If a Requester object is passed in, it is used for the requests directly.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.
    **eutils_kwargs
        Additional settings for the EUtils API.

    Returns
    -------
    co_occurences : 2d array
        The numbers of articles found for each combination of terms.
        Only returned if `collect_coocs` is True.
    counts : 1d array or list of 1d array
        Number of articles for each term independently.
        A list of two arrays is returned only when a second list of terms is used.
    meta_data : MetaData
        Meta data from the data collection.

    Notes
    -----
    The collection does an exact word search for search terms.

    The page returned by the EUtils search includes a 'count' field.
    This field contains the number of articles with both terms. This is extracted.

    Values of -1 in the outputs mark entries for which no count was collected.

    Examples
    --------
    Collect counts and co-occurrences for a single set of two search terms:

    >>> coocs, counts, meta_data = collect_counts([['frontal lobe'], ['temporal lobe']])

    Collect counts and co-occurrences for two sets of search terms:

    >>> coocs, counts, meta_data = collect_counts(terms_a=[['frontal lobe'], ['temporal lobe']],
    ...                                           terms_b=[['attention'], ['perception']])
    """

    # Get e-utils URLS object. Set retmax as 0, since not using UIDs for counts
    urls = EUtils(db=db,
                  retmax='0',
                  field=field,
                  retmode='xml',
                  **eutils_kwargs,
                  api_key=api_key)

    # Define the settings for the search utility, adding a default for datetype if not provided
    #   Each setting name is checked on its own, so a match can not span two names
    search_settings = ['db', 'retmax', 'retmode', 'field']
    has_date_setting = any('date' in key for key in eutils_kwargs)
    if has_date_setting and 'datetype' not in eutils_kwargs:
        search_settings.append('datetype')

    # Build the URLs for the utilities that will be used
    urls.build_url('info', settings=['db'])
    urls.build_url('search',
                   settings=search_settings + list(eutils_kwargs))

    # Initialize meta data object
    meta_data = MetaData()

    # Check for a Requester object to be passed in as logging, otherwise initialize
    req = logging if isinstance(logging, Requester) else \
        Requester(wait_time=get_wait_time(urls.authenticated),
                  logging=logging, directory=directory)

    # Sort out terms for list a
    #   Default inclusions / exclusions are independent lists, not one shared list object
    n_terms_a = len(terms_a)
    counts_a = np.ones([n_terms_a], dtype=int) * -1
    labels_a = labels_a if labels_a else [term[0] for term in terms_a]
    inclusions_a = inclusions_a if inclusions_a else [[] for _ in range(n_terms_a)]
    exclusions_a = exclusions_a if exclusions_a else [[] for _ in range(n_terms_a)]

    # If collecting co-occurrences, sort out terms for list b and initialize co-occurrence stores
    if collect_coocs:

        if not terms_b:
            square = True
            terms_b, inclusions_b, exclusions_b = terms_a, inclusions_a, exclusions_a
        else:
            square = False
        n_terms_b = len(terms_b)

        counts_b = np.ones([n_terms_b], dtype=int) * -1
        labels_b = labels_b if labels_b else [term[0] for term in terms_b]
        inclusions_b = inclusions_b if inclusions_b else [[] for _ in range(n_terms_b)]
        exclusions_b = exclusions_b if exclusions_b else [[] for _ in range(n_terms_b)]

        # Initialize matrices to store co-occurrence data
        co_occurences = np.ones([n_terms_a, n_terms_b], dtype=int) * -1

        # Set diagonal to zero if square (term co-occurrence with itself)
        if square:
            np.fill_diagonal(co_occurences, 0)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Loop through each term (list-A)
    for a_ind, (label_a, search_a, incl_a, excl_a) in \
        enumerate(zip(labels_a, terms_a, inclusions_a, exclusions_a)):

        # Make term arguments
        term_a = Term(label_a, search_a, incl_a, excl_a)
        term_a_arg = make_term(term_a)

        if verbose:
            print('Running counts for: ', term_a.label)

        # Get number of results for current term search
        url = urls.get_url('search', settings={'term': term_a_arg})
        counts_a[a_ind] = get_count(req, url)

        if collect_coocs:

            # For each term in list a, loop through each term in list b
            for b_ind, (label_b, search_b, incl_b, excl_b) in \
                enumerate(zip(labels_b, terms_b, inclusions_b, exclusions_b)):

                # Skip collections of equivalent term combinations - if single term list
                #  This will skip the diagonal row, and any combinations already collected
                if square and co_occurences[a_ind, b_ind] != -1:
                    continue

                # Make term arguments
                term_b = Term(label_b, search_b, incl_b, excl_b)
                term_b_arg = make_term(term_b)
                full_term_arg = join(term_a_arg, term_b_arg, 'AND')

                # Get number of results for current term search
                #   This count does not depend on the list-a term, so it is only requested
                #   on the first pass through list b, rather than once per list-a term
                if not square and a_ind == 0:
                    url = urls.get_url('search', settings={'term': term_b_arg})
                    counts_b[b_ind] = get_count(req, url)

                # Get number of results for combination of terms
                url = urls.get_url('search', settings={'term': full_term_arg})
                count = get_count(req, url)

                # Store the result, mirroring it across the diagonal for a single term list
                co_occurences[a_ind, b_ind] = count
                if square:
                    co_occurences[b_ind, a_ind] = count

    if collect_coocs:
        counts = counts_a if square else [counts_a, counts_b]
    else:
        counts = counts_a

    meta_data.add_requester(req)

    if not collect_coocs:
        return counts, meta_data
    else:
        return co_occurences, counts, meta_data
Пример #3
0
def collect_counts(terms_a, inclusions_a=None, exclusions_a=None,
                   terms_b=None, inclusions_b=None, exclusions_b=None,
                   db='pubmed', field='TIAB', api_key=None,
                   logging=None, directory=None, verbose=False):
    """Collect count and term co-occurrence data from EUtils.

    Parameters
    ----------
    terms_a : list of list of str
        Search terms.
    inclusions_a : list of list of str, optional
        Inclusion words for search terms.
    exclusions_a : list of list of str, optional
        Exclusion words for search terms.
    terms_b : list of list of str, optional
        Secondary list of search terms.
        If not provided, co-occurrences are collected within `terms_a`.
    inclusions_b : list of list of str, optional
        Inclusion words for secondary list of search terms.
    exclusions_b : list of list of str, optional
        Exclusion words for secondary list of search terms.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    api_key : str, optional
        An API key for a NCBI account.
    logging : {None, 'print', 'store', 'file'}, optional
        What kind of logging, if any, to do for requested URLs.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.

    Returns
    -------
    co_occurences : 2d array
        The numbers of articles found for each combination of terms.
    counts : 1d array or list of 1d array
        Number of articles for each term independently.
        A list of two arrays is returned only when a second list of terms is used.
    meta_data : MetaData
        Meta data from the data collection.

    Notes
    -----
    The collection does an exact word search for two terms.

    The page returned by the EUtils search includes a 'count' field.
    This field contains the number of articles with both terms. This is extracted.

    Values of -1 in the outputs mark entries for which no count was collected.

    Examples
    --------
    Collect counts and co-occurrences for a single set of two search terms:

    >>> coocs, counts, meta_data = collect_counts([['frontal lobe'], ['temporal lobe']])

    Collect counts and co-occurrences for two sets of search terms:

    >>> coocs, counts, meta_data = collect_counts(terms_a=[['frontal lobe'], ['temporal lobe']],
    ...                                           terms_b=[['attention'], ['perception']])
    """

    # Get e-utils URLS object. Set retmax as 0, since not using UIDs for counts
    urls = EUtils(db=db, retmax='0', field=field, retmode='xml', api_key=api_key)
    urls.build_url('info', settings=['db'])
    urls.build_url('search', settings=['db', 'retmax', 'retmode', 'field'])

    # Initialize meta data & requester
    meta_data = MetaData()
    req = Requester(wait_time=get_wait_time(urls.authenticated),
                    logging=logging, directory=directory)

    # Sort out terms
    n_terms_a = len(terms_a)
    if not terms_b:
        square = True
        terms_b, inclusions_b, exclusions_b = terms_a, inclusions_a, exclusions_a
    else:
        square = False
    n_terms_b = len(terms_b)

    # Check inclusions & exclusions
    #   Defaults are built as independent lists, so that terms do not share one list object
    inclusions_a = inclusions_a if inclusions_a else [[] for _ in range(n_terms_a)]
    inclusions_b = inclusions_b if inclusions_b else [[] for _ in range(n_terms_b)]
    exclusions_a = exclusions_a if exclusions_a else [[] for _ in range(n_terms_a)]
    exclusions_b = exclusions_b if exclusions_b else [[] for _ in range(n_terms_b)]

    # Initialize count variables to the correct length
    counts_a = np.ones([n_terms_a], dtype=int) * -1
    counts_b = np.ones([n_terms_b], dtype=int) * -1

    # Initialize right size matrices to store co-occurrence data
    co_occurences = np.ones([n_terms_a, n_terms_b], dtype=int) * -1

    # Set diagonal to zero if square (term co-occurrence with itself)
    if square:
        np.fill_diagonal(co_occurences, 0)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Loop through each term (list-A)
    for a_ind, (search_a, incl_a, excl_a) in enumerate(zip(terms_a, inclusions_a, exclusions_a)):

        # Make term arguments
        term_a = Term(search_a[0], search_a, incl_a, excl_a)
        term_a_arg = mk_term(term_a)

        if verbose:
            print('Running counts for: ', term_a.label)

        # Get number of results for current term search
        url = urls.get_url('search', settings={'term' : term_a_arg})
        counts_a[a_ind] = get_count(req, url)

        # For each term in list a, loop through each term in list b
        for b_ind, (search_b, incl_b, excl_b) in enumerate(zip(terms_b, inclusions_b, exclusions_b)):

            # Skip collections of equivalent term combinations - if single term list
            #  This will skip the diagonal row, and any combinations already collected
            if square and co_occurences[a_ind, b_ind] != -1:
                continue

            # Make term arguments
            term_b = Term(search_b[0], search_b, incl_b, excl_b)
            term_b_arg = mk_term(term_b)
            full_term_arg = join(term_a_arg, term_b_arg, 'AND')

            # Get number of results for current term search
            #   This count does not depend on the list-a term, so it is only requested
            #   on the first pass through list b, rather than once per list-a term
            if not square and a_ind == 0:
                url = urls.get_url('search', settings={'term' : term_b_arg})
                counts_b[b_ind] = get_count(req, url)

            # Get number of results for combination of terms
            url = urls.get_url('search', settings={'term' : full_term_arg})
            count = get_count(req, url)

            # Store the result, mirroring it across the diagonal for a single term list
            co_occurences[a_ind, b_ind] = count
            if square:
                co_occurences[b_ind, a_ind] = count

    # A single term list has one set of counts, two term lists have one set each
    if square:
        counts = counts_a
    else:
        counts = [counts_a, counts_b]

    meta_data.add_requester(req)

    return co_occurences, counts, meta_data
Пример #4
0
def collect_words(terms, inclusions=None, exclusions=None, labels=None,
                  db='pubmed', retmax=100, field='TIAB', usehistory=False,
                  api_key=None, save_and_clear=False, logging=None, directory=None,
                  verbose=False, **eutils_kwargs):
    """Collect text data and metadata from EUtils using specified search term(s).

    Parameters
    ----------
    terms : list of list of str
        Search terms.
    inclusions : list of list of str, optional
        Inclusion words for search terms.
    exclusions : list of list of str, optional
        Exclusion words for search terms.
    labels : list of str, optional
        Labels for the search terms.
        If not provided, the first search word of each term is used.
    db : str, optional, default: 'pubmed'
        Which database to access from EUtils.
    retmax : int, optional, default: 100
        Maximum number of articles to return.
        If None while using history, all articles found for a term are collected.
    field : str, optional, default: 'TIAB'
        Field to search for term within.
        Defaults to 'TIAB', which is Title/Abstract.
    usehistory : bool, optional, default: False
        Whether to use EUtils history, storing results on their server.
    api_key : str, optional
        An API key for a NCBI account.
    save_and_clear : bool, optional, default: False
        Whether to save words data to disk per term as it goes, instead of holding in memory.
    logging : {None, 'print', 'store', 'file'} or Requester
        What kind of logging, if any, to do for requested URLs.
        If a Requester object is passed in, it is used for the requests directly.
    directory : str or SCDB, optional
        Folder or database object specifying the save location.
    verbose : bool, optional, default: False
        Whether to print out updates.
    **eutils_kwargs
        Additional settings for the EUtils API.

    Returns
    -------
    results : list of Articles
        Results from collecting data for each term.
    meta_data : MetaData
        Meta data from the data collection.

    Raises
    ------
    NotImplementedError
        If a database other than 'pubmed' is requested.

    Notes
    -----
    The collection does an exact word search for the term given. It then loops through all
    the articles found for that term.

    For each article, it pulls and saves out data (including title, abstract, authors, etc),
    using the hierarchical tag structure that organizes the articles.

    When using history, articles are fetched in groups of at most 100 per request.

    Examples
    --------
    Collect words data for two terms, limiting the results to 5 articles per term:

    >>> results, meta_data = collect_words([['frontal lobe'], ['temporal lobe']], retmax=5)
    """

    # Check for valid database based on what words is set up to collect
    if db != 'pubmed':
        msg = 'Only the `pubmed` database is currently supported for words collection.'
        raise NotImplementedError(msg)

    # Get EUtils URLS object, with desired settings, and build required utility URLs
    urls = EUtils(db=db, retmax=retmax, usehistory='y' if usehistory else 'n',
                  field=field, retmode='xml', **eutils_kwargs, api_key=api_key)

    # Define the settings for the search utility, adding a default for datetype if not provided
    #   Each setting name is checked on its own, so a match can not span two names
    search_settings = ['db', 'usehistory', 'retmax', 'retmode', 'field']
    has_date_setting = any('date' in key for key in eutils_kwargs)
    if has_date_setting and 'datetype' not in eutils_kwargs:
        search_settings.append('datetype')

    # Build the URLs for the utilities that will be used
    urls.build_url('info', settings=['db'])
    urls.build_url('search', settings=search_settings + list(eutils_kwargs))
    urls.build_url('fetch', settings=['db', 'retmode'])

    # Initialize results & meta data
    results = []
    meta_data = MetaData()

    # Check for a Requester object to be passed in as logging, otherwise initialize
    req = logging if isinstance(logging, Requester) else \
        Requester(wait_time=get_wait_time(urls.authenticated),
                  logging=logging, directory=directory)

    # Get current information about database being used
    meta_data.add_db_info(get_db_info(req, urls.get_url('info')))

    # Check labels, inclusions & exclusions
    #   Default inclusions / exclusions are independent lists, not one shared list object
    labels = labels if labels else [term[0] for term in terms]
    inclusions = inclusions if inclusions else [[] for _ in terms]
    exclusions = exclusions if exclusions else [[] for _ in terms]

    # Loop through all the terms
    for label, search, incl, excl in zip(labels, terms, inclusions, exclusions):

        # Collect term information and make search term argument
        term = Term(label, search, incl, excl)
        term_arg = make_term(term)

        if verbose:
            print('Collecting data for: ', term.label)

        # Initialize object to store data for current term articles
        arts = Articles(term)

        # Request web page
        url = urls.get_url('search', settings={'term' : term_arg})
        page = req.request_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Get number of articles
        count = int(page_soup.find('count').text)

        # Collect articles, using history
        if usehistory:

            # Get the information from the page for using history
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Set default retmax per history iteration
            retmax_hist = 100

            # Set the total number of articles to collect for this term
            #   Converting once here keeps the arithmetic and the stopping check consistent
            #   Without a retmax there is no cap, so collect everything that was found
            retmax_total = count if retmax is None else int(retmax)

            # Loop through, using history to collect groups of articles at a time
            retstart_it = 0
            while retstart_it < count:

                # Set the retmax for the current iteration
                retmax_it = min(retmax_total - retstart_it, retmax_hist)

                # Get article page, collect data
                url_settings = {'WebEnv' : web_env, 'query_key' : query_key,
                                'retstart' : str(retstart_it), 'retmax' : str(retmax_it)}
                art_url = urls.get_url('fetch', settings=url_settings)
                arts = get_articles(req, art_url, arts)

                # Update position for counting, and break out if more than global retmax
                retstart_it += retmax_hist
                if retstart_it >= retmax_total:
                    break

        # Without using history
        else:

            ids = page_soup.find_all('id')
            ids_str = ','.join([el.text for el in ids])
            art_url = urls.get_url('fetch', settings={'id' : ids_str})
            arts = get_articles(req, art_url, arts)

        arts._check_results()

        if save_and_clear:
            arts.save_and_clear(directory=directory)
        results.append(arts)

    meta_data.add_requester(req)

    return results, meta_data