def extract_add_info(arts, art_id, art):
    """Pull the fields of interest out of an article and store them.

    Parameters
    ----------
    arts : Articles
        Object to store information for the current article.
    art_id : int
        ID of the new article.
    art : bs4.element.Tag
        Extracted article.

    Returns
    -------
    arts : Articles
        Object updated with data from the current article.
    """

    # Each field is extracted from the article tag and added under its label.
    arts.add_data('ids', art_id)

    title = extract(art, 'ArticleTitle', 'str')
    arts.add_data('titles', title)

    authors = process_authors(extract(art, 'AuthorList', 'raw'))
    arts.add_data('authors', authors)

    # Journal entry pairs the full title with its ISO abbreviation
    journal = (extract(art, 'Title', 'str'),
               extract(art, 'ISOAbbreviation', 'str'))
    arts.add_data('journals', journal)

    words = extract(art, 'AbstractText', 'all-str')
    arts.add_data('words', words)

    keywords = extract(art, 'Keyword', 'all-list')
    arts.add_data('keywords', keywords)

    year = process_pub_date(extract(art, 'PubDate', 'raw'))
    arts.add_data('years', year)

    doi = process_ids(extract(art, 'ArticleId', 'all'), 'doi')
    arts.add_data('dois', doi)

    return arts
def get_db_info(req, info_url):
    """Calls EInfo to get info and status of the database to be used for data collection.

    Parameters
    ----------
    req : Requester
        Object to launch requests from.
    info_url : str
        URL to request db information from.

    Returns
    -------
    db_info : dict
        Information about the database from which the data was accessed.
    """

    # Request the EInfo page and parse it with BeautifulSoup
    info_page_soup = BeautifulSoup(req.request_url(info_url).content, 'lxml')

    # Fields of interest to pull out of the EInfo response
    fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

    # Collect each requested field into a dictionary
    db_info = {field: extract(info_page_soup, field, 'str') for field in fields}

    return db_info
def get_count(req, url):
    """Get the count of how many articles listed at the requested URL.

    Parameters
    ----------
    req : Requester
        Object to launch requests from.
    url : str
        URL to request count data from.

    Returns
    -------
    count : int
        Count of the number of articles found.
    """

    # Request the page and parse it with BeautifulSoup
    page_soup = BeautifulSoup(req.request_url(url).content, 'lxml')

    counts = extract(page_soup, 'count', 'all')

    # If no count tag was found, report zero articles
    try:
        return int(counts[0].text)
    except IndexError:
        return 0
def get_articles(req, art_url, arts):
    """Collect information for each article found for a given term.

    Parameters
    ----------
    req : Requester
        Requester object to launch requests from.
    art_url : str
        URL for the article to be collected.
    arts : Articles
        Object to add data to.

    Returns
    -------
    arts : Articles
        Object to store information for the current term.
    """

    # Request the page listing all articles, and parse it as XML
    page_soup = BeautifulSoup(req.request_url(art_url).content, 'xml')

    # Extract and store the relevant information for every article on the page
    for article in page_soup.findAll('PubmedArticle'):

        # The pubmed ID identifies the article being added
        article_id = process_ids(extract(article, 'ArticleId', 'all'), 'pubmed')
        arts = extract_add_info(arts, article_id, article)

    return arts
def get_db_info(req, info_url):
    """Calls EInfo to get info and status of the database to be used for data collection.

    Parameters
    ----------
    req : Requester
        Object to launch requests from.
    info_url : str
        URL to request db information from.

    Returns
    -------
    db_info : dict
        Information about the database from which the data was accessed.

    Examples
    --------
    Get info on the pubmed database:

    >>> from lisc.requester import Requester
    >>> url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=pubmed'
    >>> db_info = get_db_info(Requester(), url)
    """

    # Request the EInfo page and parse it with BeautifulSoup
    page_soup = BeautifulSoup(req.request_url(info_url).content, 'lxml')

    # Fields of interest to pull out of the EInfo response
    info_fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

    # Collect each requested field into a dictionary
    db_info = {field: extract(page_soup, field, 'str') for field in info_fields}

    return db_info