def __init__(self, reset_cache=False):
    self.journals = Journals()
    self.works = Works()
    self.filter_kwargs = dict(has_license='true', has_full_text='true')
    self.keywords = 'business financial merger entrepreneur banking insurance commerce trade economics'
    UnpywallCredentials('*****@*****.**')
    cache_path = path.join(DATA_DIR, 'unpaywall_cache')
    if reset_cache and path.exists(cache_path):
        remove(cache_path)
    self.unpywall_cache = UnpywallCache(cache_path)
    Unpywall.init_cache(self.unpywall_cache)
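The constructor above relies on crossref.restful and unpywall plus two standard-library helpers. A minimal import sketch, assuming the usual module paths; DATA_DIR is a project-specific constant assumed here, and the redacted e-mail placeholder must be replaced with a real contact address:

from os import path, remove

from crossref.restful import Journals, Works
from unpywall import Unpywall
from unpywall.cache import UnpywallCache
from unpywall.utils import UnpywallCredentials

# Assumption: DATA_DIR points at a writable directory for the Unpaywall cache
DATA_DIR = 'data'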
Example #2
def testISSN(issn):
    # works = Works()
    # info = works.doi('10.2514/8.7231')
    journals = Journals()
    info = journals.journal(issn)
    '''
    info.pop('last-status-check-time')
    info.pop('counts')
    info.pop('breakdowns')
    info.pop('flags')
    info.pop('coverage')
    info.pop('coverage-type')
    print(info)
    for x in info:
        print(x)
    '''
    print(info)
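A hypothetical call of the helper above, using PLOS ONE's ISSN as the example value; it simply prints the journal metadata dict returned by the Crossref journals endpoint:

testISSN('1932-6203')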
Example #3
def getInfoFromISSN(issn):
    # Retrieve all information about the ISSN
    journals = Journals()
    info = journals.journal(issn)

    string = ","

    # Retrieve the properties we care about only
    itemTitle = info.get('title')
    itemPub = info.get('publisher')
    itemSubj = info.get('subjects')
    itemISSN = info.get('ISSN')
    itemISSNinfo = info.get('issn-type')

    # Go through non-string entities and format
    string = ""
    # Loop through dict values in list
    for x in itemSubj:
        # Gets all the values from this dict
        # and stores all values in a list
        temp = [*x.values()]
        # Join the items from the list, separated by commas
        temp = ', '.join(map(str, temp))
        # Adds current dict entry (now string of values) to a string
        string = string + temp + ', '
    # Removes trailing ', '
    itemSubj = string[:-2]

    # Join all ISSNs for this item using a ,
    itemISSN = ", ".join(itemISSN)

    string = ""
    # Loop through dict values in list
    for x in itemISSNinfo:
        # Gets all the values from this dict
        # and stores all values in a list
        temp = [*x.values()]
        # Joins the items from the list and seperates with a comma
        temp = ', '.join(map(str, temp))
        # Adds current dict entry (now string of values) to a string
        string = string + temp + ', '
    # Removes trailing ', '
    itemISSNinfo = string[:-2]

    infoList = [itemTitle, itemPub, itemSubj, itemISSN, itemISSNinfo]
    return infoList
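A short usage sketch for getInfoFromISSN; the ISSN is only an example value, and the five returned fields are unpacked in the order the function builds them:

title, publisher, subjects, issns, issn_types = getInfoFromISSN('1932-6203')
print(title)
print(subjects)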
Example #4
def query_to_crossref(document):
    title = document.original_title()
    author = ' '.join([document.first_author.get('surname', ''), document.first_author.get('given_names', '')]).strip()
    pub_date = document.publication_date

    if title is None:
        return None

    result = [i for i in Journals().works(document.journal.scielo_issn).query(title=title, author=author).filter(from_pub_date=pub_date, until_pub_date=pub_date)]

    if len(result) != 1:
        return None

    return result[0].get('DOI', None)
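For reference, the underlying Crossref query can also be issued directly with literal values; the ISSN, title, author, and date below are placeholders, not values taken from any real document:

matches = [item for item in Journals().works('1932-6203')
           .query(title='An example article title', author='Silva')
           .filter(from_pub_date='2015-06-01', until_pub_date='2015-06-01')]
doi = matches[0].get('DOI') if len(matches) == 1 else None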
Example #5
# -*- coding: utf-8 -*-
"""
Created on Fri Nov  1 18:04:47 2019

@author: asdqw
"""

from crossref.restful import Works
from crossref.restful import Journals
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
works = Works()
journals = Journals()
import os


def acquire_text(url, index):

    #    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    #    from selenium.webdriver.support.ui import WebDriverWait
    #    desired_capabilities = DesiredCapabilities.CHROME  # change the page load strategy
    #    desired_capabilities["pageLoadStrategy"] = "eager"  # commenting out these two lines delays the final output, i.e. it waits until the page has fully loaded before printing
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    #element.get_attribute('text')

    driver = webdriver.Chrome()  # create the browser instance
    driver.get(url)  # load the URL and wait for the full page load (except AJAX content)

def retreive_journal_between_dates(self, journal_issn, from_date, end_date):
    journals = Journals()
    return journals.works(journal_issn).filter(from_created_date=from_date,
                                               until_created_date=end_date)
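A hypothetical call of the date-range helper above; the ISSN and dates are placeholder values, and the returned object is a lazy iterator over the works created in that window:

journals_api = Journals()
window = journals_api.works('1932-6203').filter(from_created_date='2021-01-01',
                                                until_created_date='2021-01-31')
for work in window:
    print(work.get('DOI'))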
class NarrativeDataset:
    LICENSE_WHITELIST = [
        'http://creativecommons.org/licenses/by/4.0/',
        'http://creativecommons.org/licenses/by/3.0/'
    ]
    download_links = dict()

    def __init__(self, reset_cache=False):
        self.journals = Journals()
        self.works = Works()
        self.filter_kwargs = dict(has_license='true', has_full_text='true')
        self.keywords = 'business financial merger entrepreneur banking insurance commerce trade economics'
        UnpywallCredentials('*****@*****.**')
        cache_path = path.join(DATA_DIR, 'unpaywall_cache')
        if reset_cache and path.exists(cache_path):
            remove(cache_path)
        self.unpywall_cache = UnpywallCache(cache_path)
        Unpywall.init_cache(self.unpywall_cache)

    def get_dois_from_journal(self, journal_issn):
        doi_list = []
        try:
            if self.journals.journal_exists(journal_issn):
                works = self.journals.works(journal_issn).filter(
                    **self.filter_kwargs).select('DOI', 'license')
                for response_dict in tqdm(works):
                    license_dict = response_dict['license']
                    if self.is_license_whitelist(license_dict[0]['URL']):
                        doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service",
                         e,
                         exc_info=True)
        return doi_list

    def get_dois_from_keywords(self):
        doi_list = []
        try:
            results = self.works.query(self.keywords).filter(
                **self.filter_kwargs).select('DOI', 'license')
            for response_dict in tqdm(results):
                license_dict = response_dict['license']
                if self.is_license_whitelist(license_dict[0]['URL']):
                    doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service",
                         e,
                         exc_info=True)
        return doi_list

    def get_oa_urls(self, doi_list):
        logger.info('Retrieving document URLs for DOIs now (cached/uncached)')
        oa_urls = []
        for i, doi in tqdm(enumerate(doi_list), total=len(doi_list)):
            try:
                oa_urls.append(Unpywall.get_doc_link(doi))
            except HTTPError:
                logger.warning(
                    '\nError received for DOI: {}, will retry 3 times in 20 secs'
                    .format(doi))
                sleep(20)
                for i in range(3):
                    try:
                        logger.info('Retry :{}'.format(i + 1))
                        oa_urls.append(Unpywall.get_doc_link(doi))
                        break
                    except HTTPError as e:
                        logger.error('Retry failed: %s', e, exc_info=True)
        return oa_urls

    def is_license_whitelist(self, license):
        license = str(license).replace('https', 'http')
        return license in self.LICENSE_WHITELIST

    def retry_from_another_src(self, faulty_files_list, doi_list):
        src_dict = {'scirp': []}
        for file in faulty_files_list:
            base_name = ntpath.basename(file)
            # Recover the zero-based doi_list index from the file name: strip the
            # 'Sample_' prefix and the trailing extension characters, then convert
            doi_list_ind = int(base_name.replace("Sample_", "")[:-8]) - 1
            doi = doi_list[doi_list_ind]
            doc_url = Unpywall.get_pdf_link(doi)
            if doc_url is not None and 'scirp' in doc_url.lower():
                try:
                    scirp_id = doc_url[doc_url.index('paperID=') + 8:]
                except (IndexError, ValueError):
                    continue
                if scirp_id != "":
                    src_dict['scirp'].append((file, scirp_id))
        return download_frm_another_src(src_dict)

    @staticmethod
    def download_doi_pdf(works, doi_list, download_dir):
        logger.info(
            "Trying to download the required data now for {} DOIs".format(
                len(doi_list)))
        for i, doi in enumerate(doi_list):

            name_pattern = 'Sample_{}.pdf'.format(str(i + 1))
            download_link = Unpywall.get_pdf_link(doi)
            try:
                if not download_link:
                    result = works.doi(doi)['link']
                    for item in result:
                        application = item['intended-application']
                        type = item['content-type']
                        if application is not None and application == 'text-mining' and type == 'application/pdf':
                            download_link = item['URL']
                            break
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
                if not path.exists(path.join(download_dir, name_pattern)):
                    if download_link and filter_url(download_link):
                        logger.debug('Downloading ' + name_pattern + " : " +
                                     doi + ' from url: ' + download_link)
                        download_pdf_file(download_link,
                                          name_pattern,
                                          download_dir,
                                          progress=True)
                        sleep(5)
            except Exception as e:
                logger.error(
                    "Error while downloading the article ({}, {}): %s".format(
                        str(i + 1), doi),
                    e,
                    exc_info=True)
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
        return True
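A hypothetical end-to-end run of NarrativeDataset; the ISSN and download directory are placeholder values, and the redacted e-mail inside __init__ must be replaced before Unpaywall requests will succeed:

dataset = NarrativeDataset(reset_cache=False)
dois = dataset.get_dois_from_journal('1932-6203')
dois += dataset.get_dois_from_keywords()
oa_urls = dataset.get_oa_urls(dois)
NarrativeDataset.download_doi_pdf(dataset.works, dois, 'downloads')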