Example #1
def test_build_query():
    """Test the build_query() method from URLS()."""

    urls = URLS(db='pubmed')

    urls.build_query(['db'])

    assert urls.query
Example #2
def test_build_fetch():
    """Test the build_fetch() method from URLS()."""

    urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')

    urls.build_fetch(['db', 'retmode'])

    assert urls.fetch
Example #3
def test_build_info():
    """Test the build_info() method from URLS()."""

    urls = URLS()

    urls.build_info([])

    assert urls.info
Example #4
def test_check_args():
    """Test the check_args() method from URLS()."""

    urls = URLS(db='pubmed', field='id')

    urls.check_args(['db', 'field'])

    # Check error
    with raises(InconsistentDataError):
        urls.check_args(['db', 'retmax', 'field'])
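
For orientation: the four tests above exercise an object that assembles NCBI E-utilities request URLs piece by piece (info, query, search, fetch). A minimal sketch of that idea, assuming the standard einfo/esearch/efetch endpoints and making no claims about the real URLS internals, might look like:

from urllib.parse import urlencode

# Standard NCBI E-utilities base URL (an assumption; the real URLS class may differ)
EUTILS_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

def build_url(util, settings):
    """Build an E-utilities URL for a utility ('einfo', 'esearch', 'efetch')
    from a dict of settings such as db, retmax, retmode, and field."""
    return EUTILS_BASE + util + '.fcgi?' + urlencode(settings)

# Roughly the shape of urls.search / urls.fetch for the settings used in these tests
search = build_url('esearch', {'db': 'pubmed', 'retmax': '500', 'retmode': 'xml', 'field': 'id'})
fetch = build_url('efetch', {'db': 'pubmed', 'retmode': 'xml'})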
Example #5
File: count.py  Project: njr175/ERP_SCANR
    def scrape_data(self, db=None, verbose=False):
        """Search through pubmed for all abstracts with co-occurence of ERP & terms.

        The scraping does an exact word search for two terms (one ERP and one term)
        The HTML page returned by the pubmed search includes a 'count' field.
        This field contains the number of papers with both terms. This is extracted.
        """

        # Set date of when data was scraped
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Get e-utils URLS object. Set retmax as 0, since not using UIDs in this analysis
        urls = URLS(db=db, retmax='0', retmode='xml', field='TIAB')
        urls.build_info(['db'])
        urls.build_search(['db', 'retmax', 'retmode', 'field'])

        # Get current information about database being used
        self.get_db_info(urls.info)

        # Initialize count variables to the correct length
        self.term_counts = np.zeros([self.n_terms])
        self.erp_counts = np.zeros([self.n_erps])

        # Initialize right size matrices to store data
        self.dat_numbers = np.zeros([self.n_erps, self.n_terms], dtype=int)
        self.dat_percent = np.zeros([self.n_erps, self.n_terms])

        # Loop through each ERP term
        for erp_ls in self.erps:

            # Get the index of the current erp
            erp_ind = self.erps.index(erp_ls)

            # Print out status
            if verbose:
                print('Running counts for: ', self.labels[erp_ind])

            # Get number of results for just ERP search
            #url = urls.search + '"' + erp_ls[0] + '"'
            url = urls.search + _mk(self.erps[erp_ind]) + \
                  _mk(self.exclusions[erp_ind], 'NOT')
            self.erp_counts[erp_ind] = self._get_count(url)

            # For each ERP, loop through each term
            for term_ls in self.terms:

                # Get the indices of the current term
                term_ind = self.terms.index(term_ls)

                # Get number of results for just term search
                #url = urls.search + '"' + term_ls[0] + '"'
                url = urls.search + _mk(self.terms[term_ind])
                self.term_counts[term_ind] = self._get_count(url)

                # Make URL - Exact Term Version, using double quotes, & exclusions
                url = urls.search + _mk(self.erps[erp_ind]) + \
                        _mk(self.exclusions[erp_ind], 'NOT') + \
                        _mk(self.terms[term_ind], 'AND')
                # Alternate (legacy) URL constructions, kept for reference:
                #url = urls.search + '"' + erp_ls[0] + '"AND"' + term_ls[0] + '"'
                #url = urls.search + comb_terms(erp_ls, 'or') + 'AND' + comb_terms(term_ls, 'or')

                # Make URL - Non-exact term version
                #url = self.eutils_search + erp + ' erp ' + term

                count = self._get_count(url)
                self.dat_numbers[erp_ind, term_ind] = count
                self.dat_percent[erp_ind, term_ind] = count / self.erp_counts[erp_ind]


                """
                # Pull the page, and parse with Beautiful Soup
                page = self.req.get_url(url)
                page_soup = BeautifulSoup(page.content, 'lxml')

                # Get all 'count' tags
                counts = extract(page_soup, 'count', 'all')

                # Initialize empty temp vector to hold counts
                vec = []

                # Loop through counts, extracting into vec
                # There should be n+1 count fields, where n is the number of search terms
                #   The number of search terms includes all of them, including 'OR's & 'NOT's
                # Example: term=("N400"OR"N4")AND("language")NOT("cancer"OR"histone")
                #   Here there are 5 search terms, and so 6 count tags
                # The 1st count tag is the number of articles meeting the full search term
                #   Each subsequent count tag is each search term, in order.
                for count in counts:
                    vec.append(int(count.text))

                # Add the total number of papers for erp & term
                self.erp_counts[erp_ind] = vec[1]
                self.term_counts[term_ind] = vec[2]

                # Add the number & percent of overlapping papers
                self.dat_numbers[erp_ind, term_ind] = vec[0]
                self.dat_percent[erp_ind, term_ind] = vec[0]/vec[1]
                """

        # Close the Requester object, now that data collection is finished
        self.req.close()
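
The search-URL construction above leans on a private _mk() helper that is not shown in this listing. Based on the example query in the commented block ('term=("N400"OR"N4")AND("language")NOT("cancer"OR"histone")'), a hedged sketch of what such a helper likely does (an illustrative stand-in, not the project's actual implementation):

def _mk(t_lst, cm=''):
    """Quote each synonym, join them with OR, wrap the result in parentheses,
    and prefix an optional boolean operator; return '' for an empty entry."""
    if t_lst and t_lst[0]:
        return cm + '("' + '"OR"'.join(t_lst) + '")'
    return ''

# _mk(['N400', 'N4'])               -> '("N400"OR"N4")'
# _mk(['language'], 'AND')          -> 'AND("language")'
# _mk(['cancer', 'histone'], 'NOT') -> 'NOT("cancer"OR"histone")'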
Example #6
File: words.py  Project: njr175/ERP_SCANR
    def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
        """Search through pubmed for all abstracts referring to a given ERP.

        The scraping does an exact word search for the ERP term given.
        It then loops through all the artciles found about that data.
        For each article, pulls title, year and word data.

        Notes
        -----
        - Pulls data using the hierarchical tag structure that organizes the articles.
        - Initially, the procedure was to pull all tags of a certain type.
            For example: extract all 'DateCreated' tags.
            This procedure fails (or badly organizes data) when an article is
                missing a particular tag.
            Now: take advantage of the hierarchy, loop through each article tag.
                From here, pull out the data, if available.
                This way, can deal with cases of missing data.
        """

        # Set date of when data was collected
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Get e-utils URLS object
        hist_val = 'y' if use_hist else 'n'
        urls = URLS(db=db, usehistory=hist_val, retmax=retmax, retmode='xml', field='TIAB', auto_gen=False)
        urls.build_info(['db'])
        urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
        urls.build_fetch(['db', 'retmode'])

        # Get current information about database being used
        self.get_db_info(urls.info)

        # Loop through all the erps
        #for ind, erp in enumerate(self.erps):
        for ind, lab in enumerate(self.labels):

            # Print out status
            print('Scraping words for: ', lab)

            # Initialize object to store data for current ERP papers
            cur_erp = ERPData(lab, self.erps[ind])

            # Set up search terms - add exclusions, if there are any
            if self.exclusions[ind][0]:
                #term_arg = '"' + erp[0] + '"' + 'NOT' + '"' + self.exclusions[ind][0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or') + comb_terms(self.exclusions[ind], 'not')
            else:
                #term_arg = '"' + erp[0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or')

            # Create the url for the erp search term
            url = urls.search + term_arg

            # Get page and parse
            page = self.req.get_url(url)
            page_soup = BeautifulSoup(page.content, 'lxml')

            # Using history
            if use_hist:

                # Settings for paginating through the results in batches
                ret_start = 0
                ret_max = 100

                # Extract the result count and history-server tokens from the search page
                count = int(page_soup.find('count').text)
                web_env = page_soup.find('webenv').text
                query_key = page_soup.find('querykey').text

                # Update History
                cur_erp.update_history('Start Scrape')

                # Step through the results, ret_max articles at a time
                while ret_start < count:

                    # Build and fetch the URL for the current batch of articles
                    art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                              '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                    art_page = self.req.get_url(art_url)
                    art_page_soup = BeautifulSoup(art_page.content, "xml")

                    # Pull out articles
                    articles = art_page_soup.findAll('PubmedArticle')

                    # Loop through each article, extracting relevant information
                    for ind, art in enumerate(articles):

                        # Get ID of current article
                        new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')
                        #new_id = int(ids[ind].text)

                        # Extract and add all relevant info from current articles to ERPData object
                        cur_erp = self.extract_add_info(cur_erp, new_id, art)

                    # Advance to the next batch of results
                    ret_start += ret_max

            # Without using history
            else:

                # Get all ids
                ids = page_soup.find_all('id')

                # Convert ids to string
                ids_str = _ids_to_str(ids)

                # Get article page
                art_url = urls.fetch + '&id=' + ids_str
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Update History
                cur_erp.update_history('Start Scrape')

                # Loop through each article, extracting relevant information
                for ind, art in enumerate(articles):

                    # Get ID of current article
                    new_id = int(ids[ind].text)

                    # Extract and add all relevant info from current articles to ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

            # Check consistency of extracted results
            cur_erp.check_results()
            cur_erp.update_history('End Scrape')

            # Save out and clear data
            cur_erp.save_n_clear()

            # Add the object with current erp data to results list
            self.add_results(cur_erp)

        # Close the Requester object, now that data collection is finished
        self.req.close()
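
The non-history branch above joins the collected <id> tags with a private _ids_to_str() helper that is not shown here. Since efetch accepts a comma-separated list of UIDs in its 'id' parameter, a plausible sketch (again an illustrative stand-in, not the project's actual implementation):

def _ids_to_str(ids):
    """Collapse a list of BeautifulSoup <id> tags into the comma-separated
    string of UIDs that the efetch 'id' parameter expects."""
    return ','.join(tag.text for tag in ids)

# e.g. three <id> tags containing 1001, 1002, 1003 -> '1001,1002,1003'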
Example #7
def test_urls_settings_args():
    """Tests URLS() returns properly with settings provided, and args defined.
    This triggers save_settings() and save_args() methods with inputs from __init__.
    """

    assert URLS(db='pubmed', retmax='500', field='id', retmode='xml')
Example #8
def test_urls():
    """Test the URLS object returns properly."""

    assert URLS(auto_gen=False)
    assert URLS(auto_gen=True)