def test_comb_terms():
    """Test the comb_terms function for 'or' and 'not' joining modes."""
    out = comb_terms(['one', 'two'], 'or')
    assert out == '("one"OR"two")'
    out = comb_terms(['one', 'two'], 'not')
    assert out == 'NOT"one"NOT"two"'
def test_comb_terms():
    """Test the comb_terms function."""

    # Check the 'or' join style, then the 'not' join style
    assert comb_terms(['one', 'two'], 'or') == '("one"OR"two")'
    assert comb_terms(['one', 'two'], 'not') == 'NOT"one"NOT"two"'
def _mk(t_lst, cm=''):
    """Create search term component.

    Parameters
    ----------
    t_lst : list of str
        List of words to connect together.
    cm : str
        Connector word to append to front of search term.

    Returns
    -------
    str
        Search term.
    """

    # An empty list, or a list whose first entry is empty, yields no component
    if not t_lst or not t_lst[0]:
        return ''

    return cm + comb_terms(t_lst, 'or')
def scrape_words(terms_lst, exclusions_lst=None, db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out scraped data and clear it from memory after each term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the term given.
    It then loops through all the articles found about that data.
    For each article, pulls and saves out data (including title, abstract, authors, etc).
    Pulls data using the hierarchical tag structure that organizes the articles.
    This procedure loops through each article tag.
    """

    results = []
    meta_dat = dict()

    # Requester object, used for all URL requests
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions - default to no exclusions per term
    #  Initialized here (not as a default argument) to avoid a shared mutable default
    if not exclusions_lst:
        exclusions_lst = [[] for _ in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Guard: retmax defaults to None, and int(None) raises TypeError.
            #  When no retmax was given, cap the scrape at the total paper count.
            ret_max = int(retmax) if retmax is not None else count

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the ret_max per call)
                #  This defaults to 100, but will set to less if fewer needed to reach retmax
                ret_end_it = min(100, ret_max - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + \
                          '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + \
                          '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= ret_max:
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat
def scrape_words(terms_lst, exclusions_lst=None, db='pubmed', retmax=None,
                 use_hist=False, save_n_clear=True, verbose=False):
    """Search and scrape from pubmed for all abstracts referring to a given term.

    Parameters
    ----------
    terms_lst : list of list of str
        Search terms.
    exclusions_lst : list of list of str, optional
        Exclusion words for search terms.
    db : str, optional (default: 'pubmed')
        Which pubmed database to use.
    retmax : int, optional
        Maximum number of records to return.
    use_hist : bool, optional (default: False)
        Use e-utilities history: storing results on their server, as needed.
    save_n_clear : bool, optional (default: True)
        Whether to save out scraped data and clear it from memory after each term.
    verbose : bool, optional (default: False)
        Whether to print out updates.

    Returns
    -------
    results : list of lisc Data() objects
        Results from the scraping data for each term.
    meta_dat : dict
        Meta data from the scrape.

    Notes
    -----
    The scraping does an exact word search for the term given.
    It then loops through all the articles found about that data.
    For each article, pulls and saves out data (including title, abstract, authors, etc).
    Pulls data using the hierarchical tag structure that organizes the articles.
    This procedure loops through each article tag.
    """

    results = []
    meta_dat = dict()

    # Requester object, used for all URL requests
    req = Requester()

    # Set date of when data was collected
    meta_dat['date'] = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    hist_val = 'y' if use_hist else 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    meta_dat['db_info'] = _get_db_info(req, urls.info)

    # Check exclusions - default to no exclusions per term
    #  Initialized here (not as a default argument) to avoid a shared mutable default
    if not exclusions_lst:
        exclusions_lst = [[] for _ in range(len(terms_lst))]

    # Loop through all the terms
    for ind, terms in enumerate(terms_lst):

        # Print out status
        if verbose:
            print('Scraping words for: ', terms[0])

        # Initialize object to store data for current term papers
        cur_dat = Data(terms[0], terms)

        # Set up search terms - add exclusions, if there are any
        if exclusions_lst[ind]:
            term_arg = comb_terms(terms, 'or') + comb_terms(exclusions_lst[ind], 'not')
        else:
            term_arg = comb_terms(terms, 'or')

        # Create the url for the search term
        url = urls.search + term_arg

        # Update History
        cur_dat.update_history('Start Scrape')

        # Get page and parse
        page = req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize to start at 0
            ret_start_it = 0

            # Get number of papers, and keys to use history
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Guard: retmax defaults to None, and int(None) raises TypeError.
            #  When no retmax was given, cap the scrape at the total paper count.
            ret_max = int(retmax) if retmax is not None else count

            # Loop through pulling paper data, using history
            while ret_start_it < count:

                # Set the number of papers per iteration (the ret_max per call)
                #  This defaults to 100, but will set to less if fewer needed to reach retmax
                ret_end_it = min(100, ret_max - ret_start_it)

                # Get article page, scrape data, update position
                art_url = urls.fetch + '&WebEnv=' + web_env + \
                          '&query_key=' + query_key + \
                          '&retstart=' + str(ret_start_it) + \
                          '&retmax=' + str(ret_end_it)
                cur_dat = _scrape_papers(req, art_url, cur_dat)
                ret_start_it += ret_end_it

                # Stop if number of scraped papers has reached total retmax
                if ret_start_it >= ret_max:
                    break

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page & scrape data
            art_url = urls.fetch + '&id=' + ids_str
            cur_dat = _scrape_papers(req, art_url, cur_dat)

        # Check consistency of extracted results
        cur_dat.check_results()
        cur_dat.update_history('End Scrape')

        # Save out and clear data
        if save_n_clear:
            cur_dat.save_n_clear()
        results.append(cur_dat)

    # Set Requester object as finished being used
    req.close()
    meta_dat['req'] = req

    return results, meta_dat