def run(search: Search, api_token: str):
    """
    This method fetches papers from the IEEE database using the provided search parameters.
    After fetching the data from IEEE, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database

    Raises
    ------
    AttributeError
        If the API token is null or empty
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)
    total_papers = result.get('total_records')

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for paper_entry in result.get('articles'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            papers_count += 1

            try:
                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        # fetch the next page of results, starting right after the last collected paper
        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, api_token, papers_count + 1)
def run(search: Search, database: str):
    """
    This method fetches papers from the medRxiv/bioRxiv database using the provided search parameters.
    After fetching the data from medRxiv/bioRxiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        data = _get_data(url)

        total_papers = 0
        if len(data) > 0:
            total_papers = data[0].get('total_papers')

        logging.info(f'{database}: {total_papers} papers to fetch from request {i+1}/{len(urls)}')

        papers_count = 0
        # flatten the per-request DOI lists into a single list
        dois = sum([d.get('dois') for d in data], [])

        for doi in dois:
            if papers_count >= total_papers or search.reached_its_limit(database):
                break

            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)

                paper_title = paper_metadata.get('title')

                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')

                paper = _get_paper(paper_metadata)
                paper.add_database(database)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)
def _database_safe_run(function: callable, search: Search, database_label: str):
    """
    Private method that calls a provided function, catching all exceptions without raising them,
    only logging the failure.

    Parameters
    ----------
    function : callable
        A function that will be called for database fetching
    search : Search
        A search instance
    database_label : str
        A database label
    """
    if not search.reached_its_limit(database_label):
        logging.info(f'Fetching papers from {database_label} database...')
        try:
            function()
        except Exception:  # pragma: no cover
            logging.debug(
                f'Error while fetching papers from {database_label} database', exc_info=True)
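
# Illustrative only: a minimal sketch of how _database_safe_run might wrap the
# per-database run() functions defined in this section, so that one failing
# database does not abort the whole search. The module names used here
# (arxiv_searcher, scopus_searcher) and the lambda wiring are assumptions for
# illustration, not part of this file.
#
#   _database_safe_run(lambda: arxiv_searcher.run(search), search, 'arXiv')
#   _database_safe_run(lambda: scopus_searcher.run(search, scopus_api_token), search, 'Scopus')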
def run(search: Search):
    """
    This method fetches papers from the PubMed database using the provided search parameters.
    After fetching the data from PubMed, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skipping PubMed search, journal publication type not in filters. Currently PubMed only retrieves papers published in journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))

    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for pubmed_id in result.get('eSearchResult').get('IdList').get('Id'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            papers_count += 1

            try:
                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:
                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, papers_count)
def run(search: Search):
    """
    This method fetches papers from the ACM database using the provided search parameters.
    After fetching the data from ACM, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_result(search)

    try:
        total_papers = int(result.xpath(
            '//*[@class="hitsLength"]')[0].text.strip())
    except Exception:  # pragma: no cover
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL + x.attrib['href']
                       for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)

                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')

                # the DOI appears after a different URL path segment depending on the entry type
                paper_doi = None
                if '/abs/' in paper_url:
                    paper_doi = paper_url.split('/abs/')[1]
                elif '/book/' in paper_url:
                    paper_doi = paper_url.split('/book/')[1]
                else:
                    paper_doi = paper_url.split('/doi/')[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)
                if paper is None:
                    continue

                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    This method fetches papers from the Scopus database using the provided search parameters.
    After fetching the data from Scopus, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution,
        usually provided to make the next recursive call on a paginated result
    papers_count : Optional[int]
        Papers count used on recursive calls

    Raises
    ------
    AttributeError
        If the API token is null or empty
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    search_results = _get_search_results(search, api_token, url)

    total_papers = int(search_results.get('opensearch:totalResults', 0))

    logging.info(f'Scopus: {total_papers} papers to fetch')

    for paper_entry in search_results.get('entry', []):

        if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
            break
        papers_count += 1

        try:
            paper_title = paper_entry.get("dc:title")
            logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

            publication = _get_publication(paper_entry, api_token)
            paper = _get_paper(paper_entry, publication)

            if paper is not None:
                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

        except Exception as e:  # pragma: no cover
            logging.debug(e, exc_info=True)

    next_url = None
    for link in search_results['link']:
        if link['@ref'] == 'next':
            next_url = link['@href']
            break

    # If there is a next URL, the API response was paginated and we need to process the next page.
    # We'll make a recursive call for it.
    if papers_count < total_papers and next_url is not None and not search.reached_its_limit(DATABASE_LABEL):
        run(search, api_token, next_url, papers_count)
def run(search: Search):
    """
    This method fetches papers from the arXiv database using the provided search parameters.
    After fetching the data from arXiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_api_result(search)
    total_papers = int(
        result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        entries = result.get('feed', {}).get('entry', [])
        if not isinstance(entries, list):  # if there's only one entry, the result is a dict rather than a list
            entries = [entries]

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            papers_count += 1

            try:
                paper_title = paper_entry.get("title")
                logging.info(
                    f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}')

                published_date = datetime.datetime.strptime(
                    paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # the arXiv API has no date filter, so we apply the "since"/"until" constraints ourselves
                if search.since is not None and published_date < search.since:
                    logging.info('Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info('Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)
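
# Illustrative only: a minimal end-to-end sketch of driving one of the run()
# functions above. The Search constructor arguments shown here (query, since,
# until, limit) are assumptions for illustration and may not match the real
# signature; the run() call itself is the arXiv function defined above.
#
#   import datetime
#
#   search = Search(query='[machine learning] AND [covid]',
#                   since=datetime.date(2020, 1, 1),
#                   until=datetime.date(2021, 1, 1),
#                   limit=50)
#   run(search)  # papers outside the since/until window are skipped by the date checks above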