def scrape_pr_data(url, path):
    """Retrieve the press release from Eurekalert and extract the info.

    Parameters
    ----------
    url : str
        Fetch URL for the desired press release.
    path : str
        Path to the save location for scraped data.
        Note: not used in this minimized demo version.

    Returns
    -------
    pr_dict : dict
        Extracted press release data.
    """

    # Initialize Requester object for URL requests
    req = Requester()

    # Use Requester() to open the press release URL
    art_page = req.get_url(url)

    # Get press release into a more convenient format for info extraction
    page_soup = BeautifulSoup(art_page.content, 'lxml')

    # Initialize a dictionary to store the scraped data
    #   For this demo, this is a minimized stand-in for the data extraction
    #   The full procedure would pull out all of the relevant fields from this page
    pr_dict = {'data': 1, 'data2': 2}

    # Close the URL request
    req.close()

    return pr_dict
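# Usage sketch (not part of the original code): the URL and save path below are
# placeholders, and the call assumes the same dependencies as scrape_pr_data
# above (Requester, BeautifulSoup) are available in this module.
def demo_scrape_pr_data():
    """Illustrative call to scrape_pr_data, with placeholder arguments."""

    pr_info = scrape_pr_data('https://www.eurekalert.org/news-releases/000000',
                             'data/press_releases/')
    print(pr_info)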
def test_check():
    """Test the check method."""

    req = Requester()
    req.check()

    assert True
def test_open():
    """Test the open method."""

    req = Requester()

    req.open()

    assert req.is_active
def test_wait():
    """Test the wait method."""

    req = Requester()

    req.wait(0.01)

    assert True
def test_get_url():
    """Test the get_url method."""

    req = Requester()

    web_page = req.get_url('http://www.google.com')

    assert web_page
def test_close():
    """Test the close method."""

    req = Requester()

    req.open()
    req.close()

    assert not req.is_active
def test_throttle():
    """Test the throttle method."""

    req = Requester()
    req.time_last_req = time.time()

    req.throttle()

    assert True
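# The tests above only assert that these Requester methods run without error.
# The class below is NOT the lisc implementation - it is a minimal, self-contained
# sketch of the wait/throttle pattern that test_wait and test_throttle exercise:
# sleep whenever the previous request happened less than wait_time seconds ago.
import time

class MinimalThrottler:
    """Illustrative stand-in for the throttling behaviour assumed of Requester."""

    def __init__(self, wait_time=0.0):
        self.wait_time = wait_time
        self.time_last_req = -float('inf')

    def wait(self, n_seconds):
        """Pause for the requested number of seconds."""
        time.sleep(n_seconds)

    def throttle(self):
        """Sleep if the previous request was made less than wait_time ago."""
        elapsed = time.time() - self.time_last_req
        if elapsed < self.wait_time:
            self.wait(self.wait_time - elapsed)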
def scrape_counts(terms_lst_a, excls_lst_a=[], terms_lst_b=[], excls_lst_b=[], db='pubmed', verbose=False):
    """Search through pubmed for all abstracts for co-occurence.

    Parameters
    ----------
    terms_lst_a : list of list of str
        Search terms.
    excls_lst_a : list of list of str, optional
        Exclusion words for search terms.
    terms_lst_b : list of list of str, optional
        Secondary list of search terms.
    excls_lst_b : list of list of str, optional
        Exclusion words for secondary list of search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    dat_numbers : 2d array
        The numbers of papers found for each combination of terms.
    dat_percent : 2d array
        The percentage of papers for each term that include the corresponding term.
    term_a_counts : 1d array
        Number of papers for each term.
    term_b_counts : 1d array
        Number of papers for each term, in the secondary list of terms.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for each pair of terms.

    The XML page returned by the PubMed search includes a 'count' field.
    This field contains the number of papers with both terms, which is what gets extracted.
    """

    # Initialize meta data
    meta_dat = dict()

    # Initialize Requester object
    req = Requester()

    # Set date of when data was scraped
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object. Set retmax as 0, since not using UIDs in this analysis
    urls = URLS(db=db, retmax='0', retmode='xml', field='TIAB')
    urls.build_info(['db'])
    urls.build_search(['db', 'retmax', 'retmode', 'field'])

    # Sort out terms
    n_terms_a = len(terms_lst_a)
    if len(terms_lst_b) == 0:
        square = True
        terms_lst_b = terms_lst_a
        excls_lst_b = excls_lst_a
    else:
        square = False
    n_terms_b = len(terms_lst_b)

    # Check exclusions
    if not excls_lst_a:
        excls_lst_a = [[]] * n_terms_a
    if not excls_lst_b:
        excls_lst_b = [[]] * n_terms_b

    # Initialize count variables to the correct length
    term_a_counts = np.ones([n_terms_a], dtype=int) * -1
    term_b_counts = np.ones([n_terms_b], dtype=int) * -1

    # Initialize right size matrices to store data
    dat_numbers = np.ones([n_terms_a, n_terms_b], dtype=int) * -1
    dat_percent = np.ones([n_terms_a, n_terms_b]) * -1

    # Set diagonal to zero if square (term co-occurence with itself)
    if square:
        np.fill_diagonal(dat_numbers, 0)
        np.fill_diagonal(dat_percent, 0)

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Loop through each term (list-A)
    for a_ind, term_a in enumerate(terms_lst_a):

        # Print out status
        if verbose:
            print('Running counts for: ', terms_lst_a[a_ind][0])

        # Get number of results for current term search
        url = urls.search + _mk(terms_lst_a[a_ind]) + \
              _mk(excls_lst_a[a_ind], 'NOT')
        term_a_counts[a_ind] = _get_count(req, url)

        # Loop through each term (list-b)
        for b_ind, term_b in enumerate(terms_lst_b):

            # Skip scrapes of equivalent term combinations - if single term list
            #  This will skip the diagonal, and any combinations already scraped
            if square and dat_numbers[a_ind, b_ind] != -1:
                continue

            # Get number of results for just term search
            url = urls.search + _mk(terms_lst_b[b_ind]) + \
                  _mk(excls_lst_b[b_ind], 'NOT')
            term_b_counts[b_ind] = _get_count(req, url)

            # Make URL - Exact Term Version, using double quotes, & exclusions
            url = urls.search + _mk(terms_lst_a[a_ind]) + \
                    _mk(excls_lst_a[a_ind], 'NOT') + \
                    _mk(terms_lst_b[b_ind], 'AND') + \
                    _mk(excls_lst_b[b_ind], 'NOT')

            count = _get_count(req, url)

            dat_numbers[a_ind, b_ind] = count
            dat_percent[a_ind, b_ind] = count / term_a_counts[a_ind]

            if square:
                dat_numbers[b_ind, a_ind] = count
                dat_percent[b_ind, a_ind] = count / term_b_counts[b_ind]

        # Optionally save out intermediate results (left commented out)
        #np.save('dat_numbers_' + term_a[0] + '.npy', dat_numbers)
        #np.save('dat_percent_' + term_a[0] + '.npy', dat_percent)

    # Close the Requester object, and save it into the meta data
    req.close()
    meta_dat['req'] = req

    return dat_numbers, dat_percent, term_a_counts, term_b_counts, meta_dat
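# Usage sketch (not from the original source): each inner list groups synonyms
# for one search term. The terms below are placeholders, chosen only to show
# the expected input shapes and return values of scrape_counts above.
def demo_scrape_counts():
    """Illustrative call to scrape_counts, with placeholder terms."""

    terms_a = [['frontal lobe'], ['temporal lobe']]
    terms_b = [['attention'], ['memory']]

    dat_numbers, dat_percent, counts_a, counts_b, meta_dat = scrape_counts(
        terms_lst_a=terms_a, terms_lst_b=terms_b, db='pubmed', verbose=True)

    # dat_numbers & dat_percent have shape (len(terms_a), len(terms_b))
    print(dat_numbers.shape, dat_percent.shape)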
def scrape_words(terms_lst, exclusions_lst=[], db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return. Required if 'use_hist' is True.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each term is scraped.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for each given term.
    It then loops through all the articles found for that term.
    For each article, it pulls and saves out data (including title, abstract, authors, etc.),
    using the hierarchical tag structure that organizes the articles.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax, retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Close the Requester object, and save it into the meta data
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
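# Usage sketch (not from the original source): all values below are placeholders,
# meant only to show how scrape_words above is called and what it returns.
def demo_scrape_words():
    """Illustrative call to scrape_words, with placeholder terms."""

    terms = [['alpha oscillation'], ['theta oscillation']]

    results, meta_dat = scrape_words(terms, retmax=5, use_hist=False,
                                     save_n_clear=False, verbose=True)

    # One Data object is returned per term in the input list
    print(len(results))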
def test_requester():
    """Test initializing a Requester object."""

    assert Requester()
def test_set_wait_time():
    """Test the set_wait_time method."""

    req = Requester()
    req.set_wait_time(1)

    assert req.wait_time == 1
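# Taken together, the tests above exercise a simple lifecycle for Requester:
# initialize, configure the wait time, open a session, fetch pages, then close.
# The sketch below strings those calls together, using only the attributes and
# methods that the tests themselves rely on.
def demo_requester_lifecycle():
    """Illustrative walk-through of the Requester methods covered by the tests."""

    req = Requester()
    req.set_wait_time(0.1)

    req.open()
    page = req.get_url('http://www.google.com')
    req.close()

    assert page and not req.is_active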