Example #1
def _get_paper_metadata(doi: str) -> dict:  # pragma: no cover
    """
    Get a paper's metadata from a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI

    Returns
    -------
    dict
        The ACM paper metadata, or None if there's no metadata available
    """

    form = {
        'dois': doi,
        'targetFile': 'custom-bibtex',
        'format': 'bibTex'
    }

    response = common_util.try_success(lambda: DefaultSession().post(
        f'{BASE_URL}/action/exportCiteProcCitation', data=form).json(), 2)

    if response is not None and response.get('items', None) is not None and len(response.get('items')) > 0:
        return response['items'][0][doi]
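
These helpers all funnel their network calls through common_util.try_success, whose implementation isn't shown in these examples. A minimal sketch of what such a retry helper might look like, inferred from the call sites (a zero-argument callable, a positional attempts count, an optional pre_delay in seconds). This is an assumption, not the library's actual code:

import time
from typing import Any, Callable, Optional

def try_success(func: Callable[[], Any], attempts: int = 1, pre_delay: int = 0) -> Optional[Any]:
    """Call func up to `attempts` times, sleeping `pre_delay` seconds before each try;
    return the first successful result, or None once every attempt has raised."""
    try:
        time.sleep(pre_delay)
        return func()
    except Exception:
        if attempts > 1:
            return try_success(func, attempts - 1, pre_delay)
        return None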
Example #2
def _get_api_result(
        search: Search,
        start_record: Optional[int] = 0) -> dict:  # pragma: no cover
    """
    This method returns results from the arXiv database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, 0 by default

    Returns
    -------
    dict
        A result from the arXiv database
    """

    url = _get_search_url(search, start_record)

    return common_util.try_success(
        lambda: xmltodict.parse(DefaultSession().get(url).content),
        2,
        pre_delay=1)
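
Since the arXiv API answers with XML, xmltodict.parse converts the payload into nested dicts keyed by tag name. A tiny self-contained illustration (the XML below is made up, not a real arXiv response):

import xmltodict

sample = b'<feed><entry><title>A Paper</title></entry></feed>'
parsed = xmltodict.parse(sample)
print(parsed['feed']['entry']['title'])  # -> A Paper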
Example #3
def _get_publication_entry(publication_issn: str,
                           api_token: str) -> dict:  # pragma: no cover
    """
    Get a publication entry by its publication ISSN

    Parameters
    ----------
    publication_issn : str
        A publication ISSN
    api_token : str
        A Scopus API token

    Returns
    -------
    dict (or None)
        A publication entry in dict format, or None if the API doesn't return a valid entry
    """

    url = f'{BASE_URL}/content/serial/title/issn/{publication_issn}?apiKey={api_token}'
    headers = {'Accept': 'application/json'}
    response = common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json().get(
            'serial-metadata-response', None), 2)

    if response is not None and 'entry' in response and len(
            response.get('entry')) > 0:
        return response.get('entry')[0]
Example #4
def _get_search_results(search: Search,
                        api_token: str,
                        url: Optional[str] = None) -> dict:  # pragma: no cover
    """
    This method fetches papers from the Scopus database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution;
        this is usually used to make the next recursive call when paginating results

    Returns
    -------
    dict
        A result from the Scopus database
    """

    # if url is not None, this is probably a recursive call for the next page of a paginated result
    if url is None:
        query = _get_query(search)
        url = f'{BASE_URL}/content/search/scopus?&sort=coverDate&apiKey={api_token}&query={query}'

    headers = {'Accept': 'application/json'}

    return common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json()[
            'search-results'], 2)
Example #5
def _get_paper_page(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    Get a paper page element from a provided URL

    Parameters
    ----------
    url : str
        The paper URL

    Returns
    -------
    html.HtmlElement
        An HTML element representing the paper page at the provided URL
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
Example #6
def _get_result(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns results from the medRxiv/bioRxiv database for the provided search URL

    Parameters
    ----------
    url : str
        A URL to search for results

    Returns
    -------
    html.HtmlElement
        A page from the medRxiv/bioRxiv database
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
Example #7
def _get_paper_entry(pubmed_id: str) -> dict:  # pragma: no cover
    """
    This method returns paper data from the PubMed database using the provided PubMed ID

    Parameters
    ----------
    pubmed_id : str
        A PubMed ID

    Returns
    -------
    dict
        A paper entry from the PubMed database
    """

    url = f'{BASE_URL}/entrez/eutils/efetch.fcgi?db=pubmed&id={pubmed_id}&rettype=abstract'

    return common_util.try_success(lambda: xmltodict.parse(DefaultSession().get(url).content), 2, pre_delay=1)
Example #8
def _get_paper_metadata_by_url(url: str) -> dict:
    """
    Private method that returns the paper metadata for a given URL, based on the HTML meta tags

    Parameters
    ----------
    url : str
        A paper URL

    Returns
    -------
    dict
        A paper metadata dict (or None if the paper metadata cannot be found)
    """

    response = common_util.try_success(
        lambda url=url: requests.get(url, allow_redirects=True), 2, 2)

    if response is not None and 'text/html' in (
            response.headers.get('content-type') or '').lower():

        page = html.fromstring(response.content)
        meta_list = page.xpath('//meta')

        paper_metadata = {}

        for meta in meta_list:
            meta_name = meta.attrib.get('name')
            meta_content = meta.attrib.get('content')
            if meta_name is not None and meta_content is not None:

                if meta_name in paper_metadata:
                    if not isinstance(paper_metadata.get(meta_name), list):
                        paper_metadata[meta_name] = [
                            paper_metadata.get(meta_name)
                        ]
                    paper_metadata.get(meta_name).append(meta_content)
                else:
                    paper_metadata[meta_name] = meta_content

        return paper_metadata
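
The loop above folds repeated meta tags into lists while single tags stay plain strings. A self-contained check of that aggregation logic against inline markup (the tag values are made up):

from lxml import html

page = html.fromstring(
    '<html><head>'
    '<meta name="citation_title" content="Some Paper">'
    '<meta name="citation_author" content="A. Author">'
    '<meta name="citation_author" content="B. Author">'
    '</head></html>')

metadata = {}
for meta in page.xpath('//meta'):
    name = meta.attrib.get('name')
    content = meta.attrib.get('content')
    if name is None or content is None:
        continue
    if name in metadata:
        if not isinstance(metadata[name], list):
            metadata[name] = [metadata[name]]  # promote to a list on the first repeat
        metadata[name].append(content)
    else:
        metadata[name] = content

assert metadata['citation_title'] == 'Some Paper'
assert metadata['citation_author'] == ['A. Author', 'B. Author']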
Example #9
def _get_result(search: Search, start_record: Optional[int] = 0) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns results from the ACM database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, 0 by default

    Returns
    -------
    html.HtmlElement
        A result page from the ACM database
    """

    url = _get_search_url(search, start_record)

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
Example #10
def _get_api_result(search: Search, api_token: str, start_record: Optional[int] = 1) -> dict:  # pragma: no cover
    """
    This method returns results from the IEEE database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database
    start_record : Optional[int]
        Sequence number of the first record to fetch, 1 by default

    Returns
    -------
    dict
        A result from the IEEE database
    """

    url = _get_search_url(search, api_token, start_record)

    return common_util.try_success(lambda: DefaultSession().get(url).json(), 2)
Example #11
def _get_paper_metadata(doi: str, database: str) -> dict:  # pragma: no cover
    """
    Get a paper's metadata from a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI
    database : str
        The database name (medRxiv or bioRxiv)

    Returns
    -------
    dict
        The medRxiv/bioRxiv paper metadata, or None if there's no metadata available
    """

    url = f'{API_BASE_URL}/details/{database.lower()}/{doi}'

    response = common_util.try_success(lambda: DefaultSession().get(url).json(), 2)
    if response is not None and response.get('collection', None) is not None and len(response.get('collection')) > 0:
        return response.get('collection')[0]
Example #12
def _flag_potentially_predatory_publications(search: Search):
    """
    Flag all the potentially predatory publications

    Parameters
    ----------
    search : Search
        A search instance
    """

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) Checking paper: {paper.title}')

        try:

            if paper.publication is not None:
                publication_name = paper.publication.title.lower()
                publisher_name = paper.publication.publisher.lower() if paper.publication.publisher is not None else None
                publisher_host = None
            
                if paper.doi is not None:
                    url = f'http://doi.org/{paper.doi}'
                    response = common_util.try_success(lambda url=url: DefaultSession().get(url), 2)

                    if response is not None:
                        publisher_host = urlparse(response.url).netloc.replace("www.", "")

                if publication_name in publication_util.POTENTIAL_PREDATORY_JOURNALS_NAMES \
                    or publisher_name in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_NAMES \
                    or publisher_host in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_HOSTS:

                    paper.publication.is_potentially_predatory = True

        except Exception:
            # any failure during the check just leaves this paper unflagged
            pass
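
The predatory-publication check resolves the paper's DOI and compares the normalized response host against a set of known hosts. The normalization step in isolation, on a made-up resolved URL:

from urllib.parse import urlparse

resolved_url = 'https://www.example-publisher.com/article/10.1234/abcd'  # hypothetical
host = urlparse(resolved_url).netloc.replace('www.', '')
print(host)  # -> example-publisher.com

Note that str.replace strips every 'www.' occurrence, not just a leading one; a stricter variant would use netloc.removeprefix('www.') on Python 3.9+.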
Example #13
def download(search_path: str,
             output_directory: str,
             only_selected_papers: Optional[bool] = False,
             categories_filter: Optional[dict] = None,
             proxy: Optional[str] = None):
    """
    Call this method after you have finished your search (and probably refined it)
    to download the papers. It will try to download the PDF version of each paper
    to the output directory path.

    We use some heuristics to do this job, but sometimes they won't work properly
    and we won't be able to download a paper. All downloads and failures are logged
    to a download.log file placed in the output directory, so you can check that log
    to find out which papers could not be downloaded and try to get them manually later.

    Note: some papers are behind a paywall and cannot be downloaded by this method.
    However, if your institution provides a proxy that lets you bypass the paywall,
    you can use it here by setting the environment variables FINDPAPERS_HTTP_PROXY
    and FINDPAPERS_HTTPS_PROXY.

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    output_directory : str
        A valid file path of the directory where the downloaded papers will be placed
    only_selected_papers : Optional[bool], False by default
        Whether only the selected papers will be downloaded
    categories_filter : Optional[dict], None by default
        A dict of categories used to filter which papers will be downloaded
    proxy : Optional[str], None by default
        A proxy URL that can be used during requests. This can also be defined
        by the environment variable FINDPAPERS_PROXY
    """

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    search = persistence_util.load(search_path)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    log_filepath = os.path.join(output_directory, 'download.log')

    common_util.check_write_access(log_filepath)

    # mode 'a' creates the file if it doesn't exist yet, so no mode switch is needed
    with open(log_filepath, 'a') as fp:
        now = datetime.datetime.now()
        fp.write(
            f"------- A new download process started at: {datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')} \n"
        )

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) {paper.title}')

        if (only_selected_papers and not paper.selected) or \
        (categories_filter is not None and (paper.categories is None or not paper.has_category_match(categories_filter))):
            continue

        downloaded = False
        output_filename = f'{paper.publication_date.year}-{paper.title}'
        output_filename = re.sub(r'[^\w\d-]', '_',
                                 output_filename)  # sanitize filename
        output_filename += '.pdf'
        output_filepath = os.path.join(output_directory, output_filename)

        if os.path.exists(output_filepath):  # PDF already collected
            logging.info('Paper\'s PDF file has already been collected')
            continue

        if paper.doi is not None:
            paper.urls.add(f'http://doi.org/{paper.doi}')

        for url in paper.urls:  # we'll try to download the PDF file of the paper by its URLs
            try:
                logging.info(f'Fetching data from: {url}')

                response = common_util.try_success(
                    lambda url=url: DefaultSession().get(url), 2)

                if response is None:
                    continue

                if 'text/html' in (response.headers.get('content-type') or '').lower():

                    response_url = urllib.parse.urlsplit(response.url)
                    response_query_string = urllib.parse.parse_qs(
                        urllib.parse.urlparse(response.url).query)
                    response_url_path = response_url.path
                    host_url = f'{response_url.scheme}://{response_url.hostname}'
                    pdf_url = None

                    if response_url_path.endswith('/'):
                        response_url_path = response_url_path[:-1]

                    response_url_path = response_url_path.split('?')[0]

                    if host_url in ['https://dl.acm.org']:

                        doi = paper.doi
                        if doi is None and response_url_path.startswith(
                                '/doi/'
                        ) and '/doi/pdf/' not in response_url_path:
                            doi = response_url_path[4:]
                        elif doi is None:
                            continue

                        pdf_url = f'https://dl.acm.org/doi/pdf/{doi}'

                    elif host_url in ['https://ieeexplore.ieee.org']:

                        if response_url_path.startswith('/document/'):
                            document_id = response_url_path[10:]
                        elif response_query_string.get('arnumber',
                                                       None) is not None:
                            document_id = response_query_string.get(
                                'arnumber')[0]
                        else:
                            continue

                        pdf_url = f'{host_url}/stampPDF/getPDF.jsp?tp=&arnumber={document_id}'

                    elif host_url in [
                            'https://www.sciencedirect.com',
                            'https://linkinghub.elsevier.com'
                    ]:

                        paper_id = response_url_path.split('/')[-1]
                        pdf_url = f'https://www.sciencedirect.com/science/article/pii/{paper_id}/pdfft?isDTMRedir=true&download=true'

                    elif host_url in ['https://pubs.rsc.org']:

                        pdf_url = response.url.replace('/articlelanding/',
                                                       '/articlepdf/')

                    elif host_url in [
                            'https://www.tandfonline.com',
                            'https://www.frontiersin.org'
                    ]:

                        pdf_url = response.url.replace('/full', '/pdf')

                    elif host_url in [
                            'https://pubs.acs.org',
                            'https://journals.sagepub.com',
                            'https://royalsocietypublishing.org'
                    ]:

                        pdf_url = response.url.replace('/doi', '/doi/pdf')

                    elif host_url in ['https://link.springer.com']:

                        pdf_url = response.url.replace(
                            '/article/', '/content/pdf/').replace('%2F',
                                                                  '/') + '.pdf'

                    elif host_url in ['https://www.isca-speech.org']:

                        pdf_url = response.url.replace('/abstracts/',
                                                       '/pdfs/').replace(
                                                           '.html', '.pdf')

                    elif host_url in ['https://onlinelibrary.wiley.com']:

                        pdf_url = response.url.replace('/full/',
                                                       '/pdfdirect/').replace(
                                                           '/abs/',
                                                           '/pdfdirect/')

                    elif host_url in [
                            'https://www.jmir.org', 'https://www.mdpi.com'
                    ]:

                        pdf_url = response.url + '/pdf'

                    elif host_url in ['https://www.pnas.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/pnas/') + '.full.pdf'

                    elif host_url in ['https://www.jneurosci.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/jneuro/') + '.full.pdf'

                    elif host_url in ['https://www.ijcai.org']:

                        paper_id = response.url.split('/')[-1].zfill(4)
                        pdf_url = '/'.join(response.url.split('/')
                                           [:-1]) + '/' + paper_id + '.pdf'

                    elif host_url in [
                            'https://asmp-eurasipjournals.springeropen.com'
                    ]:

                        pdf_url = response.url.replace('/articles/',
                                                       '/track/pdf/')

                    if pdf_url is not None:

                        response = common_util.try_success(
                            lambda url=pdf_url: DefaultSession().get(url), 2)

                if 'application/pdf' in (
                        response.headers.get('content-type') or '').lower():
                    with open(output_filepath, 'wb') as fp:
                        fp.write(response.content)
                    downloaded = True
                    break

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if downloaded:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[DOWNLOADED] {paper.title}\n')
        else:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[FAILED] {paper.title}\n')
                if len(paper.urls) == 0:
                    fp.write('Empty URL list\n')
                else:
                    for url in paper.urls:
                        fp.write(f'{url}\n')
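
A hypothetical invocation of download, assuming the function is re-exported from the package root (the paths, filter values, and proxy below are placeholders):

from findpapers import download  # assumed import path

download(
    search_path='search_results.json',  # JSON produced by a previous search
    output_directory='papers',
    only_selected_papers=True,
    categories_filter={'contribution': ['tool', 'dataset']},  # made-up categories
    proxy='https://user:password@proxyhost:port')  # placeholder proxy URL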
Example #14
def enrich_publication_data(search: Search, api_token: str):
    """
    This method fetches data from the Scopus database to enrich the publication data

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')
    
    total = len(search.publication_by_key)
    for i, (publication_key, publication) in enumerate(search.publication_by_key.items()):

        logging.info(f'({i + 1}/{total}) Enriching publication: {publication.title}')

        if publication.issn is not None:

            try:

                publication_entry = _get_publication_entry(
                    publication.issn, api_token)

                if publication_entry is not None:

                    publication_category = publication_entry.get(
                        'prism:aggregationType', None)
                    if publication_category is not None and publication.category is None:
                        publication.category = publication_category

                    publication_publisher = publication_entry.get(
                        'dc:publisher', None)

                    if publication_publisher is not None:
                        publication.publisher = publication_publisher

                    for subject_area in publication_entry.get('subject-area', []):
                        subject_area_value = subject_area.get('$', '').strip()
                        if len(subject_area_value) > 0:
                            publication.subject_areas.add(subject_area_value)

                    publication_cite_score = common_util.try_success(lambda x=publication_entry: float(
                        x.get('citeScoreYearInfoList').get('citeScoreCurrentMetric')))

                    if publication_cite_score is not None:
                        publication.cite_score = publication_cite_score

                    if 'SJRList' in publication_entry and len(publication_entry.get('SJRList').get('SJR')) > 0:
                        publication_sjr = common_util.try_success(lambda x=publication_entry: float(
                            x.get('SJRList').get('SJR')[0].get('$')))

                        if publication_sjr is not None:
                            publication.sjr = publication_sjr

                    if 'SNIPList' in publication_entry and len(publication_entry.get('SNIPList').get('SNIP')) > 0:
                        publication_snip = common_util.try_success(lambda x=publication_entry: float(
                            x.get('SNIPList').get('SNIP')[0].get('$')))

                        if publication_snip is not None:
                            publication.snip = publication_snip

            except Exception:  # pragma: no cover
                pass
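
Throughout these snippets the lambdas capture values via default arguments (lambda url=url: ..., lambda x=publication_entry: ...). That idiom sidesteps Python's late-binding closures; a minimal illustration of the difference:

# Late binding: every closure sees the final value of i.
funcs = [lambda: i for i in range(3)]
print([f() for f in funcs])  # -> [2, 2, 2]

# A default argument is evaluated at definition time, so the value is frozen.
funcs = [lambda i=i: i for i in range(3)]
print([f() for f in funcs])  # -> [0, 1, 2]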
Example #15
def test_try_success(func: Callable, result: Any):

    assert util.try_success(func, 2, 1) == result
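
The test above plainly receives func and result from a parametrization that isn't shown. A plausible pytest setup, assuming util is the module under test; the cases are illustrative, not the project's actual ones:

import pytest

@pytest.mark.parametrize('func, result', [
    (lambda: 'ok', 'ok'),   # succeeds on the first call
    (lambda: 1 / 0, None),  # always raises, so try_success gives up and returns None
])
def test_try_success(func, result):
    assert util.try_success(func, 2, 1) == result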