def test_extract_add_info():
    """Test the extract_add_info method."""

    words = Words()

    # Check page with all fields defined - check data extraction
    erp_word = Data('test')
    page = requests.get(("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
                         "efetch.fcgi?&db=pubmed&retmode=xml&id=28000963"))
    page_soup = BeautifulSoup(page.content, "xml")
    art = page_soup.findAll('PubmedArticle')[0]

    words.extract_add_info(erp_word, 111111, art)

    assert erp_word.ids[0] == 111111
    assert erp_word.titles[0] == ("A Neurocomputational Model of the N400"
                                  " and the P600 in Language Processing.")
    assert erp_word.words[0][0] == "ten"
    assert erp_word.kws[0][0] == "computational modeling"
    assert erp_word.years[0] == 2017
    assert erp_word.months[0] == 'May'
    assert erp_word.dois[0] == '10.1111/cogs.12461'

    # Check page with all fields missing - check error handling
    page = requests.get('http://www.google.com')
    erp_word = words.extract_add_info(erp_word, 999999, page)

    assert erp_word.ids[1] == 999999
    assert erp_word.titles[1] is None
    assert erp_word.words[1] is None
    assert erp_word.kws[1] is None
    assert erp_word.years[1] is None
    assert erp_word.months[1] is None
    assert erp_word.dois[1] is None
def test_add_results():
    """Test the add_results method."""

    words = Words()
    words.add_results(Data(['test']))

    assert words.results
def test_get_item():
    """Test indexing into a Words object."""

    words = Words()

    # Test error for empty object
    with raises(IndexError):
        words['not a thing']

    words.add_results(Data('test', ['test']))

    # Test error for wrong key
    with raises(IndexError):
        words['wrong']

    # Test properly extracting item
    assert words['test']
def scrape_words(terms_lst, exclusions_lst=[], db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out and clear the collected data after each term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the term given.
    It then loops through all articles found for that term, and for each article
    pulls and saves out data (including title, abstract, authors, etc.).
    Data are pulled using the hierarchical tag structure that organizes the articles,
    looping through each article tag.
    """

    results = []
    meta_dat = dict()

    # Requester object
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions
    if not exclusions_lst:
        exclusions_lst = [[] for i in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update history
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the retmax per call)
                #  This defaults to 100, but is set lower if fewer are needed to reach retmax
                ret_end_it = min(100, int(retmax) - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= int(retmax):
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
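# A minimal usage sketch for scrape_words() (illustrative only): the term lists,
# exclusion lists, and retmax value below are assumptions chosen for demonstration.
# The printed fields (the Data() 'ids' list and the 'date' meta entry) come from
# the code above.
def example_scrape():
    """Run a small scrape for two example terms and print a summary."""

    terms = [['language'], ['memory']]
    exclusions = [['programming'], []]

    results, meta_dat = scrape_words(terms, exclusions_lst=exclusions, db='pubmed',
                                     retmax=5, use_hist=False,
                                     save_n_clear=False, verbose=True)

    # Each entry in results is a Data() object holding the scraped data for one term
    for term, dat in zip(terms, results):
        print(term[0], ':', len(dat.ids), 'articles scraped')

    print('Scrape date:', meta_dat['date'])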
def load_data(add_dat=False, n=1):
    """Helper function to load Data() object for testing."""

    dat = Data('test', ['test'])

    if add_dat:
        for i in range(n):
            dat.add_id(1)
            dat.add_title('title')
            dat.add_journal('science', 'sc')
            dat.add_authors([('A', 'B', 'C', 'D')])
            dat.add_words(['new', 'dat'])
            dat.add_kws(['lots', 'of', 'erps'])
            dat.add_pub_date((2112, 'Jan'))
            dat.add_doi('doi_str')
            dat.increment_n_articles()

    return dat
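# A short sketch of how load_data() might be used in a test (illustrative):
# the attributes checked here (ids, titles) appear in the tests above, while the
# test itself is a hypothetical example, not part of the original test suite.
def test_data_fields_match():
    """Check that a pre-filled Data() object has matching field lengths."""

    dat = load_data(add_dat=True, n=2)

    assert len(dat.ids) == len(dat.titles) == 2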