def __init__(self, reset_cache=False):
    """Set up Crossref clients, search keywords and the Unpaywall cache.

    Parameters
    ----------
    reset_cache : bool
        When True, delete any existing on-disk Unpaywall cache so a
        fresh one is built.
    """
    self.journals = Journals()
    self.works = Works()
    # Restrict Crossref queries to records with a license and full text.
    self.filter_kwargs = dict(has_license='true', has_full_text='true')
    self.keywords = 'business financial merger entrepreneur banking insurance commerce trade economics'
    UnpywallCredentials('*****@*****.**')
    cache_file = path.join(DATA_DIR, 'unpaywall_cache')
    # Drop the stale cache on request so it is rebuilt from scratch.
    if reset_cache and path.exists(cache_file):
        remove(cache_file)
    self.unpywall_cache = UnpywallCache(cache_file)
    Unpywall.init_cache(self.unpywall_cache)
def test_init_cache(self):
    """init_cache should default to a cwd cache file, reject non-cache
    objects, and accept a proper UnpywallCache instance."""
    Unpywall.init_cache()
    default_path = os.path.join(os.getcwd(), 'unpaywall_cache')
    assert Unpywall.cache.name == default_path
    os.remove(default_path)
    expected_msg = 'Cache is not of type {0}'.format(UnpywallCache)
    with pytest.raises(AttributeError, match=expected_msg):
        assert Unpywall.init_cache('Not a UnpywallCache object.')
    Unpywall.init_cache(test_cache)
    assert os.path.exists(Unpywall.cache.name)
def download_pdf_file(url: str, filename: str, filepath: str = '.', progress: bool = False) -> None:
    """Download a PDF from a direct URL and write it to disk.

    Parameters
    ----------
    url : str
        Download url for the article.
    filename : str
        The filename for the PDF.
    filepath : str
        The directory in which to store the downloaded PDF (created if
        missing).
    progress : bool
        Whether the progress of the download should be printed out or not.
    """
    try:
        headers = {"User-Agent": "python"}
        # BUG FIX: `stream` expects a bool; the original passed the url
        # string (truthy, so it worked by accident, but misleading).
        r = requests.get(url, stream=True, headers=headers)
        if r.status_code == 200:
            file_size = int(r.headers.get('content-length', 0))
            block_size = 1024
            path = os.path.join(filepath, filename)
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            with open(path, 'wb') as file:
                chunk_size = 0
                for chunk in r.iter_content(block_size):
                    if progress and file_size > 0:
                        chunk_size += len(chunk)
                        Unpywall._progress(chunk_size / file_size)
                    file.write(chunk)
        else:
            logger.warning(
                "Not able to download file, Http Response: {}".format(
                    r.status_code))
    except ConnectionError:
        logger.warning('Connection error received, will retry after 10 secs')
        sleep(10)
        # BUG FIX: propagate `progress` on retry (the original dropped it).
        # NOTE(review): retries are unbounded on repeated connection
        # errors — confirm this is intentional.
        Unpywall.download_pdf_file(url, filename, filepath, progress)
    except Exception:
        logger.warning('Rethrowing error')
        raise
def test_doi(self, Unpywall, capfd):
    """doi() should print progress, return a DataFrame for a valid DOI,
    and warn + return None for an invalid DOI with errors='ignore'."""
    frame = Unpywall.doi(dois=['10.1038/nature12373'],
                         format='raw',
                         progress=True,
                         errors='ignore')
    out, _ = capfd.readouterr()
    assert len(out) > 0
    assert isinstance(frame, pd.DataFrame)
    with pytest.warns(UserWarning):
        df_empty = Unpywall.doi(dois=['a bad doi'], errors='ignore')
    assert df_empty is None
def test_validate_dois(self):
    """_validate_dois should reject None, non-list input and oversized
    batches, and pass a valid DOI list through unchanged."""
    good = ['10.1038/nature12373', '10.1103/physreve.88.012814']
    with pytest.raises(ValueError, match='No DOI specified'):
        Unpywall._validate_dois(None)
    with pytest.raises(ValueError,
                       match='The input format must be of type list'):
        Unpywall._validate_dois('a bad doi')
    with pytest.raises(ValueError,
                       match=('Unpaywall only allows to 100,000 calls'
                              ' per day')):
        Unpywall._validate_dois(['doi'] * (Unpywall.api_limit + 1))
    assert Unpywall._validate_dois(good) == good
def test_download_pdf_file(self, Unpywall, capfd):
    """download_pdf_file should create the target file and print
    progress output; clean up afterwards."""
    target_name = 'test.pdf'
    target_dir = './test_dir'
    Unpywall.download_pdf_file('10.1038/nature12373',
                               filename=target_name,
                               filepath=target_dir,
                               progress=True)
    out, _ = capfd.readouterr()
    assert len(out) > 0
    full_path = os.path.join(target_dir, target_name)
    assert os.path.exists(full_path)
    os.remove(full_path)
    os.rmdir(target_dir)
def get_oa_urls(self, doi_list):
    """Resolve an open-access document URL for each DOI in *doi_list*.

    Each lookup that fails with an HTTPError is retried up to three
    times after a 20-second pause.

    Parameters
    ----------
    doi_list : list
        DOIs to resolve.

    Returns
    -------
    list
        The resolved document URLs.
    """
    logger.info('Retreiving doc urls for DOIs now (cached/uncached)')
    oa_urls = []
    # The original iterated `enumerate(doi_list)` but never used the
    # index, and it shadowed the retry counter below.
    for doi in tqdm(doi_list, total=len(doi_list)):
        try:
            oa_urls.append(Unpywall.get_doc_link(doi))
        except HTTPError:
            logger.warning(
                '\nError received for DOI: {}, will retry 3 times in 20 secs'
                .format(doi))
            sleep(20)
            for attempt in range(3):
                try:
                    logger.info('Retry :{}'.format(attempt + 1))
                    oa_urls.append(Unpywall.get_doc_link(doi))
                    break
                except HTTPError:
                    # BUG FIX: the original passed the exception object as
                    # a stray positional logging argument with no
                    # placeholder in the message, which makes the logging
                    # module itself raise a formatting error.
                    # exc_info=True already records the traceback.
                    logger.error('Retry failed', exc_info=True)
            # NOTE(review): if all three retries fail, nothing is appended
            # for this DOI, so oa_urls can end up shorter than doi_list —
            # confirm callers do not rely on index alignment.
    return oa_urls
def test_get_df(self, Unpywall):
    """_get_df should reject an unknown format and build a DataFrame for
    both the 'raw' and 'extended' formats."""
    data = Unpywall.get_json(doi='10.1016/j.tmaid.2020.101663',
                             errors='raise')
    with pytest.raises(ValueError,
                       match=('The argument format only accepts the'
                              ' values "raw" and "extended"')):
        Unpywall._get_df(data=data,
                         format='not a valid format',
                         errors='raise')
    for fmt in ('raw', 'extended'):
        frame = Unpywall._get_df(data=data, format=fmt, errors='ignore')
        assert isinstance(frame, pd.DataFrame)
def retry_from_another_src(self, faulty_files_list, doi_list):
    """Retry failed downloads via an alternative source (SCIRP).

    For every faulty file, recover its DOI from the index encoded in the
    file name; when the open-access PDF link points at SCIRP, collect
    the (file, paper id) pair and delegate to download_frm_another_src.
    """
    src_dict = {'scirp': []}
    for faulty_file in faulty_files_list:
        stem = ntpath.basename(faulty_file)
        # Names look like 'Sample_<n>...' with a 1-based index; the last
        # 8 characters are stripped before parsing <n> — presumably a
        # fixed-length suffix/extension, TODO confirm naming convention.
        doi = doi_list[int(stem.replace("Sample_", "")[:-8]) - 1]
        doc_url = Unpywall.get_pdf_link(doi)
        if doc_url is None or 'scirp' not in doc_url.lower():
            continue
        try:
            scirp_id = doc_url[doc_url.index('paperID=') + 8:]
        except (IndexError, ValueError):
            # No 'paperID=' marker in the URL — skip this file.
            continue
        if scirp_id != "":
            src_dict['scirp'].append((faulty_file, scirp_id))
    return download_frm_another_src(src_dict)
def test_get_json(self, Unpywall):
    """get_json should raise/warn on a bad DOI depending on `errors`,
    return dicts for valid lookups, and validate the is_oa argument."""
    with pytest.raises(HTTPError):
        Unpywall.get_json(doi='a bad doi', errors='raise')
    with pytest.warns(UserWarning):
        Unpywall.get_json(doi='a bad doi', errors='ignore')
    by_doi = Unpywall.get_json(doi='10.1016/j.tmaid.2020.101663',
                               errors='raise')
    assert isinstance(by_doi, dict)
    by_query = Unpywall.get_json(query='test', is_oa=True, errors='raise')
    assert isinstance(by_query, dict)
    with pytest.raises(ValueError,
                       match=('The argument is_oa only accepts the'
                              ' values "True" and "False"')):
        Unpywall.get_json(query='test', is_oa='test', errors='raise')
def download_doi_pdf(works, doi_list, download_dir):
    """Download a PDF for every DOI into *download_dir*.

    Tries Unpaywall first; falls back to Crossref 'link' entries whose
    intended application is text-mining and whose content type is PDF.
    Every resolved link is recorded in NarrativeDataset.download_links,
    keyed by the 'Sample_<n>' file stem. Existing files are not
    re-downloaded.

    Parameters
    ----------
    works : object
        Crossref Works client providing `.doi(doi)` lookups.
    doi_list : list
        DOIs to download.
    download_dir : str
        Target directory for the PDFs.

    Returns
    -------
    bool
        Always True (errors are logged per-DOI, not raised).
    """
    logger.info(
        "Trying to download the required data now for {} DOIs".format(
            len(doi_list)))
    for i, doi in enumerate(doi_list):
        name_pattern = 'Sample_{}.pdf'.format(str(i + 1))
        download_link = Unpywall.get_pdf_link(doi)
        try:
            if not download_link:
                # Fall back to Crossref full-text links for this DOI.
                for item in works.doi(doi)['link']:
                    application = item['intended-application']
                    # Renamed from `type`, which shadowed the builtin.
                    content_type = item['content-type']
                    if (application is not None
                            and application == 'text-mining'
                            and content_type == 'application/pdf'):
                        download_link = item['URL']
                        break
            NarrativeDataset.download_links[name_pattern[:-4]] = download_link
            if not path.exists(path.join(download_dir, name_pattern)):
                if download_link and filter_url(download_link):
                    logger.debug('Downloading ' + name_pattern + " : " + doi
                                 + ' from url: ' + download_link)
                    download_pdf_file(download_link, name_pattern,
                                      download_dir, progress=True)
                    # Be polite to the remote host between downloads.
                    sleep(5)
        except Exception:
            # BUG FIX: the original passed the exception object as a stray
            # positional logging argument (the message has no placeholder
            # for it), which makes the logging module raise a formatting
            # error; exc_info=True already records the traceback.
            logger.error(
                "Error while downloading the article ({}, {})".format(
                    str(i + 1), doi),
                exc_info=True)
            # Record whatever link (possibly None) was resolved so far.
            NarrativeDataset.download_links[name_pattern[:-4]] = download_link
    return True
def Unpywall(self):
    # Pytest fixture: point the Unpywall cache at the shared test cache,
    # then yield the Unpywall class itself for the tests to call.
    Unpywall.init_cache(test_cache)
    yield Unpywall
def test_progress(self, Unpywall, capfd):
    """_progress should write something to stdout."""
    Unpywall._progress(0.5)
    out, _ = capfd.readouterr()
    assert len(out) > 0
def test_get_all_links(self, Unpywall):
    """get_all_links should return a list for a known DOI."""
    links = Unpywall.get_all_links('10.1016/j.tmaid.2020.101663')
    assert isinstance(links, list)
def test_get_doc_link(self, Unpywall):
    """get_doc_link should return a string URL for a known DOI."""
    link = Unpywall.get_doc_link('10.1016/j.tmaid.2020.101663')
    assert isinstance(link, str)
def test_get_pdf_link(self, Unpywall):
    """get_pdf_link should return a string URL for a known DOI."""
    link = Unpywall.get_pdf_link('10.1038/nature12373')
    assert isinstance(link, str)
def test_query(self, Unpywall):
    """query() should return a DataFrame for a simple OA search."""
    result = Unpywall.query(query='test', is_oa=True, errors='ignore')
    assert isinstance(result, pd.DataFrame)
def test_download_pdf_handle(self, Unpywall):
    """download_pdf_handle should return an in-memory BytesIO object."""
    handle = Unpywall.download_pdf_handle('10.1038/nature12373')
    assert isinstance(handle, BytesIO)