def test_build_query():
    """Test the build_query() method from URLS()."""

    urls = URLS(db='pubmed')
    urls.build_query(['db'])

    assert urls.query
def test_build_fetch():
    """Test the build_fetch() method from URLS()."""

    urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')
    urls.build_fetch(['db', 'retmode'])

    assert urls.fetch
def test_build_info():
    """Test the build_info() method from URLS()."""

    urls = URLS()
    urls.build_info([])

    assert urls.info
def test_check_args():
    """Test the check_args() method from URLS()."""

    urls = URLS(db='pubmed', field='id')
    urls.check_args(['db', 'field'])

    # Check that requesting an argument that was not provided raises an error
    with raises(InconsistentDataError):
        urls.check_args(['db', 'retmax', 'field'])
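# The build/check calls exercised by the tests above are used together in the
# scraping code below. A minimal sketch of the expected workflow, assuming URLS
# is importable from this package (the module path is not shown here, so treat
# the import as an assumption):
#
#     urls = URLS(db='pubmed', retmax='0', retmode='xml', field='TIAB')
#     urls.build_info(['db'])                                  # sets urls.info
#     urls.build_search(['db', 'retmax', 'retmode', 'field'])  # sets urls.search
#     urls.build_fetch(['db', 'retmode'])                      # sets urls.fetch
#
# Each build_* call is expected to populate the corresponding attribute, which
# is what the assertions in the tests above check.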
def scrape_data(self, db=None, verbose=False):
    """Search through PubMed for all abstracts with co-occurrence of ERPs & terms.

    The scraping does an exact word search for two terms (one ERP and one term).
    The HTML page returned by the PubMed search includes a 'count' field.
    This field contains the number of papers with both terms, and is extracted.
    """

    # Set date of when data was scraped
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object. Set retmax as 0, since not using UIDs in this analysis
    urls = URLS(db=db, retmax='0', retmode='xml', field='TIAB')
    urls.build_info(['db'])
    urls.build_search(['db', 'retmax', 'retmode', 'field'])

    # Get current information about database being used
    self.get_db_info(urls.info)

    # Initialize count variables to the correct length
    self.term_counts = np.zeros([self.n_terms])
    self.erp_counts = np.zeros([self.n_erps])

    # Initialize right size matrices to store data
    self.dat_numbers = np.zeros([self.n_erps, self.n_terms], dtype=int)
    self.dat_percent = np.zeros([self.n_erps, self.n_terms])

    # Loop through each ERP term
    for erp_ls in self.erps:

        # Get the index of the current ERP
        erp_ind = self.erps.index(erp_ls)

        # Print out status
        if verbose:
            print('Running counts for: ', self.labels[erp_ind])

        # Get number of results for just the ERP search
        url = urls.search + _mk(self.erps[erp_ind]) + \
            _mk(self.exclusions[erp_ind], 'NOT')
        self.erp_counts[erp_ind] = self._get_count(url)

        # For each ERP, loop through each term
        for term_ls in self.terms:

            # Get the index of the current term
            term_ind = self.terms.index(term_ls)

            # Get number of results for just the term search
            url = urls.search + _mk(self.terms[term_ind])
            self.term_counts[term_ind] = self._get_count(url)

            # Make URL - exact term version, using double quotes, & exclusions
            url = urls.search + _mk(self.erps[erp_ind]) + \
                _mk(self.exclusions[erp_ind], 'NOT') + \
                _mk(self.terms[term_ind], 'AND')

            # Get the count of papers including both the ERP and the term
            count = self._get_count(url)
            self.dat_numbers[erp_ind, term_ind] = count
            self.dat_percent[erp_ind, term_ind] = count / self.erp_counts[erp_ind]

            # The block below is an earlier, disabled approach that parsed the
            # counts directly from the search page.
            """
            # Pull the page, and parse with Beautiful Soup
            page = self.req.get_url(url)
            page_soup = BeautifulSoup(page.content, 'lxml')

            # Get all 'count' tags
            counts = extract(page_soup, 'count', 'all')

            # Initialize empty temp vector to hold counts
            vec = []

            # Loop through counts, extracting into vec
            # There should be n+1 count fields, where n is the number of search terms
            # The number of search terms includes all of them, including 'OR's & 'NOT's
            # Example: term=("N400"OR"N4")AND("language")NOT("cancer"OR"histone")
            # Here there are 5 search terms, and so 6 count tags
            # The 1st count tag is the number of articles meeting the full search term
            # Each subsequent count tag is for each search term, in order
            for count in counts:
                vec.append(int(count.text))

            # Add the total number of papers for erp & term
            self.erp_counts[erp_ind] = vec[1]
            self.term_counts[term_ind] = vec[2]

            # Add the number & percent of overlapping papers
            self.dat_numbers[erp_ind, term_ind] = vec[0]
            self.dat_percent[erp_ind, term_ind] = vec[0] / vec[1]
            """

    # Set Requester object as finished being used
    self.req.close()
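# `_mk` is referenced above but not defined in this snippet. Based on the example
# search string documented in the disabled block ('("N400"OR"N4")AND("language")NOT("cancer"OR"histone")'),
# a plausible reconstruction is sketched below; the name, signature, and exact
# formatting are assumptions, not the project's confirmed implementation.
def _mk(t_lst, cm=''):
    """Hypothetical sketch: join a list of synonyms into a quoted, OR'd search
    segment, optionally prefixed with a boolean joiner such as 'AND' or 'NOT'."""

    # Return an empty string if there are no terms (for example, no exclusions)
    if not t_lst or not t_lst[0]:
        return ''

    # Quote each term, join with OR, and wrap the group in parentheses
    return cm + '("' + '"OR"'.join(t_lst) + '")'

# Example (hypothetical): _mk(['N400', 'N4']) + _mk(['language'], 'AND')
#   -> '("N400"OR"N4")AND("language")'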
def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
    """Search through PubMed for all abstracts referring to a given ERP.

    The scraping does an exact word search for the ERP term given.
    It then loops through all the articles found for that term.
    For each article, pulls title, year and word data.

    Notes
    -----
    - Pulls data using the hierarchical tag structure that organizes the articles.
    - Initially, the procedure was to pull all tags of a certain type.
      For example: extract all 'DateCreated' tags.
      This procedure fails (or badly organizes data) when an article is missing
      a particular tag.
      Now: take advantage of the hierarchy, looping through each article tag.
      From there, pull out the data, if available.
      This way, cases of missing data can be handled.
    """

    # Set date of when data was collected
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    self.get_db_info(urls.info)

    # Loop through all the ERPs
    for ind, lab in enumerate(self.labels):

        # Print out status
        if verbose:
            print('Scraping words for: ', lab)

        # Initialize object to store data for current ERP papers
        cur_erp = ERPData(lab, self.erps[ind])

        # Set up search terms - add exclusions, if there are any
        if self.exclusions[ind][0]:
            term_arg = comb_terms(self.erps[ind], 'or') + comb_terms(self.exclusions[ind], 'not')
        else:
            term_arg = comb_terms(self.erps[ind], 'or')

        # Create the URL for the ERP search term
        url = urls.search + term_arg

        # Get page and parse
        page = self.req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize settings for batched retrieval
            ret_start = 0
            ret_max = 100

            # Pull the result count and history information from the search page
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Update history
            cur_erp.update_history('Start Scrape')

            # Loop through, fetching batches of articles until all results are collected
            while ret_start < count:

                # Fetch the next batch of articles from the history server
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                    '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Loop through each article, extracting relevant information
                for art in articles:

                    # Get ID of current article
                    new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

                    # Extract and add all relevant info from current article to ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

                # Advance to the next batch of results
                ret_start += ret_max

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page
            art_url = urls.fetch + '&id=' + ids_str
            art_page = self.req.get_url(art_url)
            art_page_soup = BeautifulSoup(art_page.content, "xml")

            # Pull out articles
            articles = art_page_soup.findAll('PubmedArticle')

            # Update history
            cur_erp.update_history('Start Scrape')

            # Loop through each article, extracting relevant information
            for art_ind, art in enumerate(articles):

                # Get ID of current article
                new_id = int(ids[art_ind].text)

                # Extract and add all relevant info from current article to ERPData object
                cur_erp = self.extract_add_info(cur_erp, new_id, art)

        # Check consistency of extracted results
        cur_erp.check_results()
        cur_erp.update_history('End Scrape')

        # Save out and clear data
        cur_erp.save_n_clear()

        # Add the object with current ERP data to results list
        self.add_results(cur_erp)

    # Set Requester object as finished being used
    self.req.close()
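# `_ids_to_str` is used above but not defined in this snippet. E-utilities efetch
# accepts a comma-separated list of UIDs in its '&id=' parameter, so a minimal
# sketch might look like the following; the name and exact behavior are assumed
# from context, not confirmed here.
def _ids_to_str(ids):
    """Hypothetical sketch: convert a list of BeautifulSoup <id> tags into a
    comma-separated string of UIDs for an efetch request."""

    # Pull the text out of each tag and join with commas
    return ','.join(tag.text for tag in ids)

# Example (hypothetical): given tags whose texts are '111', '222', '333',
#   _ids_to_str(ids) -> '111,222,333'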
def test_urls_settings_args():
    """Test that URLS() returns properly with settings provided and args defined.

    This triggers the save_settings() and save_args() methods with inputs from __init__.
    """

    assert URLS(db='pubmed', retmax='500', field='id', retmode='xml')
def test_urls():
    """Test that the URLS object returns properly."""

    assert URLS(auto_gen=False)
    assert URLS(auto_gen=True)