def test_run(search: Search):
    search.limit = 14
    search.limit_per_database = None
    acm_searcher.run(search)
    assert len(search.papers) == 14
def test_run(search: Search):
    search.limit = 20
    search.limit_per_database = None
    search.since = datetime.date(2020, 8, 26)
    search.until = datetime.date(2020, 8, 26)
    arxiv_searcher.run(search)
    assert len(search.papers) == 18
def run(search: Search, api_token: str):
    """
    This method fetches papers from the IEEE database using the provided search parameters.
    After fetching the data from IEEE, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database

    Raises
    ------
    AttributeError
        If the API token is null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)
    total_papers = result.get('total_records')

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for paper_entry in result.get('articles'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, api_token, papers_count + 1)
def run(search: Search, database: str):
    """
    This method fetches papers from the medRxiv/bioRxiv database using the provided search parameters.
    After fetching the data from medRxiv/bioRxiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        data = _get_data(url)

        total_papers = 0
        if len(data) > 0:
            total_papers = data[0].get('total_papers')

        logging.info(f'{database}: {total_papers} papers to fetch from request {i+1}/{len(urls)}')

        papers_count = 0
        dois = sum([d.get('dois') for d in data], [])

        for doi in dois:
            if papers_count >= total_papers or search.reached_its_limit(database):
                break

            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)
                paper_title = paper_metadata.get('title')

                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')

                paper = _get_paper(paper_metadata)
                paper.add_database(database)
                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)
def _filter(search: Search):
    """
    Private method that filters the search results based on the selected publication types.

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None:
        for paper in list(search.papers):
            try:
                if (paper.publication is not None
                        and paper.publication.category.lower() not in search.publication_types) \
                        or (paper.publication is None and 'other' not in search.publication_types):
                    search.remove_paper(paper)
            except Exception:
                pass
def test_save_and_load(search: Search, paper: Paper):
    temp_dirpath = tempfile.mkdtemp()
    temp_filepath = os.path.join(temp_dirpath, 'output.json')

    search.add_paper(paper)
    findpapers.save(search, temp_filepath)
    loaded_search = findpapers.load(temp_filepath)

    assert loaded_search.query == search.query
    assert loaded_search.since == search.since
    assert loaded_search.until == search.until
    assert loaded_search.limit == search.limit
    assert loaded_search.limit_per_database == search.limit_per_database
    assert loaded_search.processed_at.strftime('%Y-%m-%d %H:%M:%S') == \
        search.processed_at.strftime('%Y-%m-%d %H:%M:%S')
    assert len(loaded_search.papers) == len(search.papers)
def test_run(search: Search):
    search.limit = 26
    ieee_searcher.run(search, 'fake-api-token')
    assert len(search.papers) == 26

    with pytest.raises(AttributeError):
        ieee_searcher.run(search, '')

    with pytest.raises(AttributeError):
        ieee_searcher.run(search, None)
def load(search_path: str):
    """
    Method used to load a search result from a JSON representation.

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    """

    with open(search_path, 'r') as jsonfile:
        return Search.from_dict(json.load(jsonfile))
def test_get_search_urls(search: Search):
    search.query = '([term a] AND [term b]) OR ([term c] OR [term d])'
    urls = rxiv_searcher._get_search_urls(search, 'medRxiv')
    assert len(urls) == 2

    with pytest.raises(ValueError):  # wildcards not supported
        search.query = '([term a] AND [term ?]) OR ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # AND NOT not supported
        search.query = '([term a] AND NOT [term b]) OR ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # max 1-level parentheses groups
        search.query = '(([term a] OR [term b]) OR ([term c] OR [term d])) OR [term e]'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # only OR allowed between groups
        search.query = '([term a] AND [term b]) AND ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # mixed connectors not supported
        search.query = '([term a] AND [term b] OR [term c])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')
def save(search: Search, outputpath: str):
    """
    Method used to save a search result in a JSON representation.

    Parameters
    ----------
    search : Search
        A Search instance
    outputpath : str
        A valid file path used to save the search results
    """

    with open(outputpath, 'w') as jsonfile:
        json.dump(Search.to_dict(search), jsonfile, indent=2, sort_keys=True)
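A minimal round-trip sketch for the two persistence helpers above, assuming the package-level aliases used in the tests (findpapers.save / findpapers.load); the query, dates, and file path are illustrative:

import datetime
import os
import tempfile
import findpapers

# build a small Search and persist it, then rebuild it from the JSON file
search = Search('"this" AND "that"', datetime.date(2020, 1, 1), datetime.date(2020, 12, 31), 10)
outputpath = os.path.join(tempfile.mkdtemp(), 'search_result.json')

findpapers.save(search, outputpath)    # writes the JSON representation
restored = findpapers.load(outputpath)  # rebuilds a Search instance from the file
assert restored.query == search.query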
def _database_safe_run(function: callable, search: Search, database_label: str):
    """
    Private method that calls a provided function, catching any exception without raising it,
    only logging an error message.

    Parameters
    ----------
    function : callable
        A function that will be called for database fetching
    search : Search
        A search instance
    database_label : str
        A database label
    """

    if not search.reached_its_limit(database_label):
        logging.info(f'Fetching papers from {database_label} database...')
        try:
            function()
        except Exception:  # pragma: no cover
            logging.debug(f'Error while fetching papers from {database_label} database', exc_info=True)
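A short usage sketch, mirroring how the search() orchestration further below wraps each searcher call so that one failing database does not abort the whole run:

# wrap a searcher call; a failure only logs an error instead of propagating
_database_safe_run(lambda: arxiv_searcher.run(search),
                   search, arxiv_searcher.DATABASE_LABEL)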
def run(search: Search):
    """
    This method fetches papers from the PubMed database using the provided search parameters.
    After fetching the data from PubMed, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skipping PubMed search, journal publication type not in filters. Nowadays PubMed only retrieves papers published in journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))

    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for pubmed_id in result.get('eSearchResult').get('IdList').get('Id'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:
                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, papers_count)
def test_output(search: Search, paper: Paper):
    paper.publication.category = 'Journal'
    paper.categories = {'Facet A': ['Category A', 'Category B']}
    paper.selected = False
    search.add_paper(paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-CONF'
    other_paper.publication.category = 'Conference Proceedings'
    other_paper.title = 'Conference paper title'
    other_paper.doi = 'fake-doi-conference-paper'
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category C'], 'Facet B': ['Category 1']}
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-BOOK'
    other_paper.publication.category = 'Book'
    other_paper.title = 'Book paper title'
    other_paper.doi = 'fake-doi-book-paper'
    other_paper.categories = None
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication = None
    other_paper.title = 'Unpublished paper title'
    other_paper.doi = None
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category A']}
    search.add_paper(other_paper)

    search_path = tempfile.NamedTemporaryFile().name
    outputpath = tempfile.NamedTemporaryFile().name

    persistence_util.save(search, search_path)

    findpapers.generate_bibtex(search_path, outputpath)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    article_header = '@article{drpaul1969awesome'
    inproceedings_header = '@inproceedings{drpaul1969conference'
    book_header = '@book{drpaul1969book'
    unpublished = '@unpublished{drpaul1969unpublished'

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath, only_selected_papers=True)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header not in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath, categories_filter={
        'Facet A': ['Category A'],
        'Facet B': ['Category 1']
    })
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path, outputpath,
                               categories_filter={'Facet A': ['Category B', 'Category C']})
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished not in generated_bibtex
def test_run(search: Search):
    search.limit = 51
    pubmed_searcher.run(search)
    assert len(search.papers) == 51
def run(search: Search):
    """
    This method fetches papers from the ACM database using the provided search parameters.
    After fetching the data from ACM, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_result(search)

    try:
        total_papers = int(result.xpath('//*[@class="hitsLength"]')[0].text.strip())
    except Exception:  # pragma: no cover
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL + x.attrib['href'] for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)
                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')

                # the DOI is embedded in the paper URL, after a path segment
                # that varies with the paper type
                if '/abs/' in paper_url:
                    paper_doi = paper_url.split('/abs/')[1]
                elif '/book/' in paper_url:
                    paper_doi = paper_url.split('/book/')[1]
                else:
                    paper_doi = paper_url.split('/doi/')[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)
                if paper is None:
                    continue

                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
def search():
    return Search('"this" AND ("that thing" OR "something") AND NOT "anything"',
                  datetime.date(1969, 1, 30), datetime.date(2020, 12, 31), 100, 100)
def search(outputpath: str, query: Optional[str] = None, since: Optional[datetime.date] = None,
           until: Optional[datetime.date] = None, limit: Optional[int] = None,
           limit_per_database: Optional[int] = None, databases: Optional[List[str]] = None,
           publication_types: Optional[List[str]] = None, scopus_api_token: Optional[str] = None,
           ieee_api_token: Optional[str] = None, proxy: Optional[str] = None,
           verbose: Optional[bool] = False):
    """
    When you have a query and need to get papers using it, this is the method you'll need to call.
    This method will find papers from some databases based on the provided query.

    Parameters
    ----------
    outputpath : str
        A valid file path where the search result file will be placed
    query : str, optional
        A query string that will be used to perform the papers search.
        If not provided, the query will be loaded from the environment variable FINDPAPERS_QUERY.

        All the query terms need to be enclosed in quotes and can be associated using boolean
        operators, and grouped using parentheses.
        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single character
        or * to replace any number of characters.
        E.g.: "son?" -> will match song, sons, ...
        E.g.: "son*" -> will match song, sons, sonar, songwriting, ...

        Note: All boolean operators need to be uppercased. The boolean operator "NOT" must be
        preceded by an "AND" operator.
    since : Optional[datetime.date], optional
        A lower bound (inclusive) date that will be used to filter the search results, by default None
    until : Optional[datetime.date], optional
        An upper bound (inclusive) date that will be used to filter the search results, by default None
    limit : Optional[int], optional
        The max number of papers to collect, by default None
    limit_per_database : Optional[int], optional
        The max number of papers to collect per database, by default None
    databases : List[str], optional
        List of databases where the search should be performed; if not specified, all databases
        will be used, by default None
    publication_types : List[str], optional
        List of publication types to filter when searching; if not specified, all publication
        types will be collected (this parameter is case insensitive). The available publication
        types are: journal, conference proceedings, book, other. By default None
    scopus_api_token : Optional[str], optional
        An API token used to fetch data from the Scopus database. If you don't have one,
        go to https://dev.elsevier.com and get it, by default None
    ieee_api_token : Optional[str], optional
        An API token used to fetch data from the IEEE database. If you don't have one,
        go to https://developer.ieee.org and get it, by default None
    proxy : Optional[str], optional
        Proxy URL that can be used during requests. This can also be defined by the environment
        variable FINDPAPERS_PROXY, by default None
    verbose : Optional[bool], optional
        If you want verbose logging, by default False
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    logging.info('Let\'s find some papers, this process may take a while...')

    if databases is not None:
        databases = [x.lower() for x in databases]

    if publication_types is not None:
        publication_types = [x.lower().strip() for x in publication_types]
        for publication_type in publication_types:
            if publication_type not in ['journal', 'conference proceedings', 'book', 'other']:
                raise ValueError(f'Invalid publication type: {publication_type}')

    if query is None:
        query = os.getenv('FINDPAPERS_QUERY')

    if query is not None:
        query = _sanitize_query(query)

    if query is None or not _is_query_ok(query):
        raise ValueError('Invalid query format')

    common_util.check_write_access(outputpath)

    if ieee_api_token is None:
        ieee_api_token = os.getenv('FINDPAPERS_IEEE_API_TOKEN')

    if scopus_api_token is None:
        scopus_api_token = os.getenv('FINDPAPERS_SCOPUS_API_TOKEN')

    search = Search(query, since, until, limit, limit_per_database,
                    databases=databases, publication_types=publication_types)

    if databases is None or arxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: arxiv_searcher.run(search),
                           search, arxiv_searcher.DATABASE_LABEL)

    if databases is None or pubmed_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: pubmed_searcher.run(search),
                           search, pubmed_searcher.DATABASE_LABEL)

    if databases is None or acm_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: acm_searcher.run(search),
                           search, acm_searcher.DATABASE_LABEL)

    if ieee_api_token is not None:
        if databases is None or ieee_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: ieee_searcher.run(search, ieee_api_token),
                               search, ieee_searcher.DATABASE_LABEL)
    else:
        logging.info('IEEE API token not found, skipping search on this database')

    if scopus_api_token is not None:
        if databases is None or scopus_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: scopus_searcher.run(search, scopus_api_token),
                               search, scopus_searcher.DATABASE_LABEL)
    else:
        logging.info('Scopus API token not found, skipping search on this database')

    if databases is None or medrxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: medrxiv_searcher.run(search),
                           search, medrxiv_searcher.DATABASE_LABEL)

    if databases is None or biorxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: biorxiv_searcher.run(search),
                           search, biorxiv_searcher.DATABASE_LABEL)

    logging.info('Enriching results...')
    _enrich(search, scopus_api_token)

    logging.info('Filtering results...')
    _filter(search)

    logging.info('Finding and merging duplications...')
    search.merge_duplications()

    logging.info('Flagging potentially predatory publications...')
    _flag_potentially_predatory_publications(search)

    logging.info(f'It\'s finally over! {len(search.papers)} papers retrieved. Good luck with your research :)')

    persistence_util.save(search, outputpath)
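An end-to-end usage sketch of search(), assuming it is exposed at the package level like save/load/generate_bibtex elsewhere in this repo; the query, dates, limits, and output path are illustrative:

import datetime
import findpapers

# the query follows the format described in the docstring above;
# database names are matched case-insensitively against each DATABASE_LABEL
findpapers.search(
    '/tmp/search_result.json',
    query='"machine learning" AND ("survey" OR "review") AND NOT "medical"',
    since=datetime.date(2019, 1, 1),
    until=datetime.date(2020, 12, 31),
    limit=100,
    limit_per_database=25,
    databases=['arXiv', 'PubMed'],
    publication_types=['journal', 'conference proceedings'],
)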
def test_search(paper: Paper):
    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30), datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1

    search.add_paper(paper)
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract', paper.authors,
                          paper.publication, paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date, paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(another_paper)

    search.limit_per_database = 2
    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'

    assert search.get_publication_key(publication_title, publication_issn, publication_isbn) == \
        f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(publication_title, publication_issn) == \
        f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(publication_title) == f'TITLE-{publication_title.lower()}'
def run(search: Search):
    """
    This method fetches papers from the arXiv database using the provided search parameters.
    After fetching the data from arXiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_api_result(search)
    total_papers = int(result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        entries = result.get('feed', {}).get('entry', [])
        if not isinstance(entries, list):  # if there's only one entry, the result is a dict, not a list
            entries = [entries]

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            papers_count += 1

            try:
                paper_title = paper_entry.get("title")
                logging.info(f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}')

                published_date = datetime.datetime.strptime(paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # nowadays the arXiv API doesn't provide a date filter, so we need to apply it ourselves
                if search.since is not None and published_date < search.since:
                    logging.info('Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info('Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    This method fetches papers from the Scopus database using the provided search parameters.
    After fetching the data from Scopus, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution; this is usually used to make
        the next recursive call when processing a paginated result
    papers_count : Optional[int]
        The number of papers already collected, used on recursive calls

    Raises
    ------
    AttributeError
        If the API token is null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    search_results = _get_search_results(search, api_token, url)

    total_papers = int(search_results.get('opensearch:totalResults', 0))

    logging.info(f'Scopus: {total_papers} papers to fetch')

    for paper_entry in search_results.get('entry', []):

        if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
            break

        papers_count += 1

        try:
            paper_title = paper_entry.get("dc:title")
            logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

            publication = _get_publication(paper_entry, api_token)
            paper = _get_paper(paper_entry, publication)

            if paper is not None:
                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

        except Exception as e:  # pragma: no cover
            logging.debug(e, exc_info=True)

    next_url = None
    for link in search_results['link']:
        if link['@ref'] == 'next':
            next_url = link['@href']
            break

    # If there is a next URL, the API response was paginated and we need to process
    # the next page, so we make a recursive call for it
    if papers_count < total_papers and next_url is not None and not search.reached_its_limit(DATABASE_LABEL):
        run(search, api_token, next_url, papers_count)