def __init__(self, reset_cache=False):
     """
     Create the helper objects, search settings and Unpaywall cache.

     Parameters
     ----------
     reset_cache : bool
         When true, an already existing cache file below ``DATA_DIR`` is
         removed before the cache object is created.
     """
     self.journals = Journals()
     self.works = Works()
     self.filter_kwargs = {'has_license': 'true', 'has_full_text': 'true'}
     self.keywords = ('business financial merger entrepreneur banking '
                      'insurance commerce trade economics')

     # NOTE(review): presumably registers the contact e-mail used for
     # Unpaywall requests -- confirm against the unpywall documentation.
     UnpywallCredentials('*****@*****.**')

     cache_file = path.join(DATA_DIR, 'unpaywall_cache')
     if reset_cache:
         # Only touch the file system when a reset was actually requested.
         if path.exists(cache_file):
             remove(cache_file)

     self.unpywall_cache = UnpywallCache(cache_file)
     Unpywall.init_cache(self.unpywall_cache)
예제 #2
0
    def test_init_cache(self):
        """
        Check ``Unpywall.init_cache`` with no argument, with an invalid
        argument and with the shared ``test_cache`` object.
        """
        # Without an argument the cache is expected to be named
        # 'unpaywall_cache' inside the current working directory.
        Unpywall.init_cache()
        default_cache = os.path.join(os.getcwd(), 'unpaywall_cache')
        assert Unpywall.cache.name == default_cache
        os.remove(default_cache)

        # Anything that is not an UnpywallCache must be rejected.
        expected_msg = 'Cache is not of type {0}'.format(UnpywallCache)
        with pytest.raises(AttributeError, match=expected_msg):
            assert Unpywall.init_cache('Not a UnpywallCache object.')

        # A real cache object is accepted and its file exists on disk.
        Unpywall.init_cache(test_cache)
        assert os.path.exists(Unpywall.cache.name)
예제 #3
0
def download_pdf_file(url: str,
                      filename: str,
                      filepath: str = '.',
                      progress: bool = False) -> None:
    """
    Download a PDF from a direct download URL and store it on disk.

    Parameters
    ----------
    url : str
        Download url for the article.
    filename : str
        The filename for the PDF.
    filepath : str
        The path to store the downloaded PDF. It is created when missing.
    progress : bool
        Whether the progress of the download should be printed out or not.

    Notes
    -----
    A response with a status code other than 200 is only logged as a
    warning; nothing is written in that case.

    On ``ConnectionError`` the download is retried after 10 seconds with
    the same arguments. The retry is unbounded, so a permanently
    unreachable host keeps retrying. Any other exception is logged and
    re-raised.
    """
    try:
        headers = {"User-Agent": "python"}
        # stream=True defers the body download until iter_content() is used.
        r = requests.get(url, stream=True, headers=headers)
        try:
            if r.status_code == 200:
                file_size = int(r.headers.get('content-length', 0))
                block_size = 1024

                path = os.path.join(filepath, filename)
                os.makedirs(filepath, exist_ok=True)

                with open(path, 'wb') as file:
                    downloaded = 0
                    for chunk in r.iter_content(block_size):
                        # Progress can only be computed when the server
                        # announced a content length.
                        if progress and file_size > 0:
                            downloaded += len(chunk)
                            Unpywall._progress(downloaded / file_size)
                        file.write(chunk)
            else:
                logger.warning(
                    "Not able to download file, Http Response: {}".format(
                        r.status_code))
        finally:
            # A streamed response keeps its connection until it is closed.
            r.close()
    except ConnectionError:
        logger.warning('Connection error received, will retry after 10 secs')
        sleep(10)
        # Retry this URL-based downloader (not the DOI-based
        # Unpywall.download_pdf_file) and keep the caller's progress flag.
        download_pdf_file(url, filename, filepath, progress)
    except Exception:
        logger.warning('Rethrowing error')
        raise
예제 #4
0
    def test_doi(self, Unpywall, capfd):
        """
        Check ``Unpywall.doi`` for a valid DOI (progress output and
        DataFrame result) and for an invalid DOI with ``errors='ignore'``.
        """
        valid_dois = ['10.1038/nature12373']
        df = Unpywall.doi(dois=valid_dois,
                          format='raw',
                          progress=True,
                          errors='ignore')

        # progress=True must have written something to stdout.
        out, _ = capfd.readouterr()
        assert len(out) > 0
        assert isinstance(df, pd.DataFrame)

        # A bad DOI is expected to warn and yield None instead of raising.
        invalid_dois = ['a bad doi']
        with pytest.warns(UserWarning):
            df_empty = Unpywall.doi(dois=invalid_dois, errors='ignore')

            assert df_empty is None
예제 #5
0
    def test_validate_dois(self):
        """
        Check that ``Unpywall._validate_dois`` rejects missing, non-list and
        oversized input and returns a valid list unchanged.
        """
        valid_input = ['10.1038/nature12373', '10.1103/physreve.88.012814']
        not_a_list = 'a bad doi'
        too_many = ['doi'] * (Unpywall.api_limit + 1)

        type_msg = 'The input format must be of type list'
        limit_msg = 'Unpaywall only allows to 100,000 calls' ' per day'

        with pytest.raises(ValueError, match='No DOI specified'):
            assert Unpywall._validate_dois(None)

        with pytest.raises(ValueError, match=type_msg):
            assert Unpywall._validate_dois(not_a_list)

        with pytest.raises(ValueError, match=limit_msg):
            assert Unpywall._validate_dois(too_many)

        assert Unpywall._validate_dois(valid_input) == valid_input
예제 #6
0
    def test_download_pdf_file(self, Unpywall, capfd):
        """
        Download a PDF for a known DOI into a scratch directory, check the
        progress output and the file, then clean both up again.
        """
        target_dir = './test_dir'
        pdf_name = 'test.pdf'

        Unpywall.download_pdf_file('10.1038/nature12373',
                                   filename=pdf_name,
                                   filepath=target_dir,
                                   progress=True)

        # progress=True must have written something to stdout.
        out, _ = capfd.readouterr()
        assert len(out) > 0

        pdf_path = os.path.join(target_dir, pdf_name)
        assert os.path.exists(pdf_path)

        # Remove the file first so the directory is empty for rmdir.
        os.remove(pdf_path)
        os.rmdir(target_dir)
 def get_oa_urls(self, doi_list):
     """
     Collect the document link of every DOI in ``doi_list``.

     Parameters
     ----------
     doi_list : list
         The DOIs to look up via ``Unpywall.get_doc_link``.

     Returns
     -------
     list
         The links in lookup order. A DOI whose lookup still fails after
         all retries contributes no entry, so the result can be shorter
         than ``doi_list``.
     """
     logger.info('Retreiving doc urls for DOIs now (cached/uncached)')
     oa_urls = []
     for doi in tqdm(doi_list, total=len(doi_list)):
         try:
             oa_urls.append(Unpywall.get_doc_link(doi))
         except HTTPError:
             logger.warning(
                 '\nError received for DOI: {}, will retry 3 times in 20 secs'
                 .format(doi))
             # Wait once, then make up to three further attempts.
             sleep(20)
             for attempt in range(3):
                 try:
                     logger.info('Retry :{}'.format(attempt + 1))
                     oa_urls.append(Unpywall.get_doc_link(doi))
                     break
                 except HTTPError as e:
                     # The exception needs its own placeholder; an extra
                     # positional argument without one makes logging fail
                     # to format the record.
                     logger.error('Retry failed: %s', e, exc_info=True)
     return oa_urls
예제 #8
0
    def test_get_df(self, Unpywall):
        """
        Check ``Unpywall._get_df`` with an invalid format and with the
        'raw' and 'extended' formats.
        """
        data = Unpywall.get_json(doi='10.1016/j.tmaid.2020.101663',
                                 errors='raise')

        format_msg = ('The argument format only accepts the'
                      ' values "raw" and "extended"')
        with pytest.raises(ValueError, match=format_msg):
            assert Unpywall._get_df(data=data,
                                    format='not a valid format',
                                    errors='raise')

        # Both supported formats are expected to produce a DataFrame.
        raw_frame = Unpywall._get_df(data=data, format='raw', errors='ignore')
        assert isinstance(raw_frame, pd.DataFrame)

        extended_frame = Unpywall._get_df(data=data,
                                          format='extended',
                                          errors='ignore')
        assert isinstance(extended_frame, pd.DataFrame)
 def retry_from_another_src(self, faulty_files_list, doi_list):
     """
     Group faulty files by alternative source and retry their download.

     For every file the DOI is recovered from the number in the file name,
     its PDF link is looked up, and links containing 'scirp' that carry a
     non-empty ``paperID=`` value are collected as ``(file, paper_id)``.

     Parameters
     ----------
     faulty_files_list : list
         Paths of the files to retry; names are expected to look like
         ``Sample_<n>`` followed by eight further characters.
     doi_list : list
         DOIs indexed so that ``Sample_<n>`` maps to ``doi_list[n - 1]``.

     Returns
     -------
     Whatever ``download_frm_another_src`` returns for the collected
     sources.
     """
     scirp_entries = []
     for faulty_file in faulty_files_list:
         file_name = ntpath.basename(faulty_file)
         # Drop the prefix and the trailing eight characters to get <n>.
         sample_number = int(file_name.replace("Sample_", "")[:-8])
         doi = doi_list[sample_number - 1]

         pdf_url = Unpywall.get_pdf_link(doi)
         if pdf_url is None or 'scirp' not in pdf_url.lower():
             continue

         # Everything after the first 'paperID=' is the paper id; a URL
         # without that marker is skipped.
         _, marker, scirp_id = pdf_url.partition('paperID=')
         if not marker:
             continue
         if scirp_id != "":
             scirp_entries.append((faulty_file, scirp_id))
     return download_frm_another_src({'scirp': scirp_entries})
예제 #10
0
    def test_get_json(self, Unpywall):
        """
        Check ``Unpywall.get_json`` for bad DOIs under both error modes,
        for a valid DOI, for a text query and for an invalid ``is_oa``.
        """
        bad_doi = 'a bad doi'
        good_doi = '10.1016/j.tmaid.2020.101663'

        # errors='raise' propagates the HTTP error, 'ignore' only warns.
        with pytest.raises(HTTPError):
            Unpywall.get_json(doi=bad_doi, errors='raise')

        with pytest.warns(UserWarning):
            Unpywall.get_json(doi=bad_doi, errors='ignore')

        by_doi = Unpywall.get_json(doi=good_doi, errors='raise')
        assert isinstance(by_doi, dict)

        by_query = Unpywall.get_json(query='test', is_oa=True, errors='raise')
        assert isinstance(by_query, dict)

        is_oa_msg = ('The argument is_oa only accepts the'
                     ' values "True" and "False"')
        with pytest.raises(ValueError, match=is_oa_msg):
            assert Unpywall.get_json(query='test',
                                     is_oa='test',
                                     errors='raise')
    def download_doi_pdf(works, doi_list, download_dir):
        """
        Download one PDF per DOI into ``download_dir``.

        The n-th DOI (1-based) is stored as ``Sample_<n>.pdf``. The link
        used is recorded in ``NarrativeDataset.download_links`` under the
        key ``Sample_<n>``, even when it is empty or the download fails.

        Parameters
        ----------
        works :
            Object whose ``doi(doi)`` result has a ``'link'`` list of
            mappings with ``'intended-application'``, ``'content-type'``
            and ``'URL'`` keys. It is only consulted when Unpaywall has no
            PDF link.
            NOTE(review): presumably a Crossref works client -- confirm.
        doi_list : list
            The DOIs to download.
        download_dir : str
            Directory the PDFs are written to.

        Returns
        -------
        bool
            Always ``True``; failures for individual DOIs are only logged.
        """
        logger.info(
            "Trying to download the required data now for {} DOIs".format(
                len(doi_list)))
        for index, doi in enumerate(doi_list, start=1):

            name_pattern = 'Sample_{}.pdf'.format(index)
            sample_key = name_pattern[:-4]  # file name without '.pdf'
            download_link = Unpywall.get_pdf_link(doi)
            try:
                if not download_link:
                    # Fall back to the first text-mining PDF link.
                    for item in works.doi(doi)['link']:
                        application = item['intended-application']
                        content_type = item['content-type']
                        if (application == 'text-mining'
                                and content_type == 'application/pdf'):
                            download_link = item['URL']
                            break
                NarrativeDataset.download_links[sample_key] = download_link
                # Skip files that already exist from an earlier run.
                if not path.exists(path.join(download_dir, name_pattern)):
                    if download_link and filter_url(download_link):
                        logger.debug('Downloading %s : %s from url: %s',
                                     name_pattern, doi, download_link)
                        download_pdf_file(download_link,
                                          name_pattern,
                                          download_dir,
                                          progress=True)
                        sleep(5)
            except Exception as e:
                # Every logged value needs a placeholder; an extra
                # positional argument without one makes logging fail to
                # format the record.
                logger.error(
                    "Error while downloading the article (%s, %s): %s",
                    index,
                    doi,
                    e,
                    exc_info=True)
                NarrativeDataset.download_links[sample_key] = download_link
        return True
예제 #12
0
 def Unpywall(self):
     """
     Yield the module-level ``Unpywall`` object after initialising its
     cache with ``test_cache``.

     NOTE(review): presumably registered as a pytest fixture; the
     decorator is outside the visible source -- confirm.
     """
     # Inside this body the name resolves to the module-level Unpywall,
     # not to this method.
     Unpywall.init_cache(test_cache)
     yield Unpywall
예제 #13
0
 def test_progress(self, Unpywall, capfd):
     """Check that ``Unpywall._progress`` writes something to stdout."""
     Unpywall._progress(0.5)
     out, _ = capfd.readouterr()
     assert len(out) > 0
예제 #14
0
    def test_get_all_links(self, Unpywall):
        """Check that ``Unpywall.get_all_links`` returns a list."""
        links = Unpywall.get_all_links('10.1016/j.tmaid.2020.101663')

        assert isinstance(links, list)
예제 #15
0
    def test_get_doc_link(self, Unpywall):
        """Check that ``Unpywall.get_doc_link`` returns a string."""
        doc_link = Unpywall.get_doc_link('10.1016/j.tmaid.2020.101663')

        assert isinstance(doc_link, str)
예제 #16
0
    def test_get_pdf_link(self, Unpywall):
        """Check that ``Unpywall.get_pdf_link`` returns a string."""
        pdf_link = Unpywall.get_pdf_link('10.1038/nature12373')

        assert isinstance(pdf_link, str)
예제 #17
0
    def test_query(self, Unpywall):
        """Check that ``Unpywall.query`` returns a DataFrame."""
        result = Unpywall.query(query='test', is_oa=True, errors='ignore')

        assert isinstance(result, pd.DataFrame)
예제 #18
0
    def test_download_pdf_handle(self, Unpywall):
        """Check that ``Unpywall.download_pdf_handle`` returns a BytesIO."""
        handle = Unpywall.download_pdf_handle('10.1038/nature12373')

        assert isinstance(handle, BytesIO)