def run(search: Search, api_token: str):
    """
    This method fetches papers from the IEEE database using the provided search parameters.
    After fetching the data from IEEE, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database

    Raises
    ------
    AttributeError
        If the API token is null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)
    total_papers = result.get('total_records')

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for paper_entry in result.get('articles'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        # fetch the next page of results, starting right after the last collected paper
        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, api_token, papers_count + 1)
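# Hedged usage sketch (added for illustration, not part of the original searcher): one way a caller
# might drive this IEEE searcher. The module path and the token placeholder below are assumptions;
# only Search(query, since, until, limit) and run(search, api_token) come from the surrounding code.
if __name__ == '__main__':  # pragma: no cover
    import datetime
    from findpapers.models.search import Search  # assumed module path
    example_search = Search('"machine learning" AND "fault detection"',
                            datetime.date(2020, 1, 1), datetime.date(2021, 12, 31), 100)
    run(example_search, 'YOUR-IEEE-API-TOKEN')  # raises AttributeError if the token is empty
    print(f'{len(example_search.papers)} papers collected from IEEE')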
def run(search: Search, database: str):
    """
    This method fetches papers from the medRxiv/bioRxiv database using the provided search parameters.
    After fetching the data from medRxiv/bioRxiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        data = _get_data(url)

        total_papers = 0
        if len(data) > 0:
            total_papers = data[0].get('total_papers')

        logging.info(f'{database}: {total_papers} papers to fetch from {i+1}/{len(urls)} paper requests')

        papers_count = 0
        dois = sum([d.get('dois') for d in data], [])  # flatten the per-request DOI lists into one list

        for doi in dois:

            if papers_count >= total_papers or search.reached_its_limit(database):
                break

            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)

                paper_title = paper_metadata.get('title')

                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')

                paper = _get_paper(paper_metadata)
                paper.add_database(database)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)
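# Hedged usage sketch (added for illustration, not part of the original searcher): the same Search
# instance can be passed with either database label. The module path and query are assumptions;
# run(search, database) and Search(query, since, until, limit) are taken from the surrounding code.
if __name__ == '__main__':  # pragma: no cover
    import datetime
    from findpapers.models.search import Search  # assumed module path
    example_search = Search('"covid" AND "vaccine"',
                            datetime.date(2020, 1, 1), datetime.date(2021, 1, 1), 50)
    run(example_search, 'medRxiv')  # or 'bioRxiv'
    print(f'{len(example_search.papers)} papers collected')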
def test_save_and_load(search: Search, paper: Paper):

    temp_dirpath = tempfile.mkdtemp()
    temp_filepath = os.path.join(temp_dirpath, 'output.json')

    search.add_paper(paper)

    findpapers.save(search, temp_filepath)
    loaded_search = findpapers.load(temp_filepath)

    assert loaded_search.query == search.query
    assert loaded_search.since == search.since
    assert loaded_search.until == search.until
    assert loaded_search.limit == search.limit
    assert loaded_search.limit_per_database == search.limit_per_database
    assert loaded_search.processed_at.strftime('%Y-%m-%d %H:%M:%S') == \
        search.processed_at.strftime('%Y-%m-%d %H:%M:%S')
    assert len(loaded_search.papers) == len(search.papers)
def run(search: Search):
    """
    This method fetches papers from the PubMed database using the provided search parameters.
    After fetching the data from PubMed, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skipping PubMed search, journal publication type not in filters. Currently PubMed only retrieves papers published in journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))

    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for pubmed_id in result.get('eSearchResult').get('IdList').get('Id'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:
                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        # fetch the next page of results
        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, papers_count)
def run(search: Search):
    """
    This method fetches papers from the ACM database using the provided search parameters.
    After fetching the data from ACM, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_result(search)

    try:
        total_papers = int(result.xpath('//*[@class="hitsLength"]')[0].text.strip())
    except Exception:  # pragma: no cover
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL + x.attrib['href'] for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)
                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')

                # the paper DOI is embedded in the URL path
                if '/abs/' in paper_url:
                    paper_doi = paper_url.split('/abs/')[1]
                elif '/book/' in paper_url:
                    paper_doi = paper_url.split('/book/')[1]
                else:
                    paper_doi = paper_url.split('/doi/')[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)

                if paper is None:
                    continue

                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        # fetch the next page of results
        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    This method fetches papers from the Scopus database using the provided search parameters.
    After fetching the data from Scopus, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution,
        usually used to make the next recursive call on a paginated result
    papers_count : Optional[int]
        The number of papers already collected, used on recursive calls

    Raises
    ------
    AttributeError
        If the API token is null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    search_results = _get_search_results(search, api_token, url)

    total_papers = int(search_results.get('opensearch:totalResults', 0))

    logging.info(f'Scopus: {total_papers} papers to fetch')

    for paper_entry in search_results.get('entry', []):

        if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
            break

        papers_count += 1

        try:
            paper_title = paper_entry.get("dc:title")
            logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

            publication = _get_publication(paper_entry, api_token)
            paper = _get_paper(paper_entry, publication)

            if paper is not None:
                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

        except Exception as e:  # pragma: no cover
            logging.debug(e, exc_info=True)

    next_url = None
    for link in search_results['link']:
        if link['@ref'] == 'next':
            next_url = link['@href']
            break

    # If there is a next URL, the API response was paginated and we need to process it,
    # so we make a recursive call for it
    if papers_count < total_papers and next_url is not None and not search.reached_its_limit(DATABASE_LABEL):
        run(search, api_token, next_url, papers_count)
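# Hedged usage sketch (added for illustration, not part of the original searcher): callers normally
# supply only the search and the API token; 'url' and 'papers_count' are filled in by the recursive
# pagination calls above. The module path and token placeholder are assumptions.
if __name__ == '__main__':  # pragma: no cover
    import datetime
    from findpapers.models.search import Search  # assumed module path
    example_search = Search('"deep learning" AND "medical imaging"',
                            datetime.date(2019, 1, 1), datetime.date(2020, 1, 1), 200)
    run(example_search, 'YOUR-SCOPUS-API-TOKEN')  # url/papers_count left at their defaults
    print(f'{len(example_search.papers)} papers collected from Scopus')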
def run(search: Search):
    """
    This method fetches papers from the arXiv database using the provided search parameters.
    After fetching the data from arXiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_api_result(search)
    total_papers = int(result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        entries = result.get('feed', {}).get('entry', [])
        if not isinstance(entries, list):  # if there's only one entry, the result is a dict rather than a list
            entries = [entries]

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                paper_title = paper_entry.get("title")
                logging.info(f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}')

                published_date = datetime.datetime.strptime(paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # the arXiv API does not provide a date filter, so we apply it ourselves
                if search.since is not None and published_date < search.since:
                    logging.info('Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info('Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)
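# Hedged usage sketch (added for illustration, not part of the original searcher): since arXiv results
# are filtered by date on the client side (see the 'since'/'until' checks above), the Search date bounds
# directly control which entries are kept. The module path is an assumption; Search(query, since, until,
# limit) matches the constructor used in the test suite.
if __name__ == '__main__':  # pragma: no cover
    import datetime
    from findpapers.models.search import Search  # assumed module path
    example_search = Search('"reinforcement learning"',
                            datetime.date(2021, 1, 1), datetime.date(2021, 6, 30), 30)
    run(example_search)  # entries published outside [since, until] are skipped
    print(f'{len(example_search.papers)} papers collected from arXiv')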
def test_output(search: Search, paper: Paper):

    paper.publication.category = 'Journal'
    paper.categories = {'Facet A': ['Category A', 'Category B']}
    paper.selected = False
    search.add_paper(paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-CONF'
    other_paper.publication.category = 'Conference Proceedings'
    other_paper.title = 'Conference paper title'
    other_paper.doi = 'fake-doi-conference-paper'
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category C'], 'Facet B': ['Category 1']}
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-BOOK'
    other_paper.publication.category = 'Book'
    other_paper.title = 'Book paper title'
    other_paper.doi = 'fake-doi-book-paper'
    other_paper.categories = None
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication = None
    other_paper.title = 'Unpublished paper title'
    other_paper.doi = None
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category A']}
    search.add_paper(other_paper)

    search_path = tempfile.NamedTemporaryFile().name
    outputpath = tempfile.NamedTemporaryFile().name

    persistence_util.save(search, search_path)

    findpapers.generate_bibtex(search_path, outputpath)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    article_header = '@article{drpaul1969awesome'
    inproceedings_header = '@inproceedings{drpaul1969conference'
    book_header = '@book{drpaul1969book'
    unpublished = '@unpublished{drpaul1969unpublished'

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath, only_selected_papers=True)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header not in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath,
                               categories_filter={'Facet A': ['Category A'], 'Facet B': ['Category 1']})
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath,
                               categories_filter={'Facet A': ['Category B', 'Category C']})
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished not in generated_bibtex
def test_search(paper: Paper):

    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30), datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1

    search.add_paper(paper)
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract', paper.authors,
                          paper.publication, paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date, paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(another_paper)
    search.limit_per_database = 2

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'

    assert search.get_publication_key(publication_title, publication_issn, publication_isbn) == f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(publication_title, publication_issn) == f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(publication_title) == f'TITLE-{publication_title.lower()}'