def mergeResultData(result1, result2):
    """
    Merges bibtex and extra_data dictionaries for a SearchResult and/or a Paper

    :param result1:
    :param result2:
    :return:
    """
    # if there's no year we should update the ID after getting the year
    to_update_id = not result1.bib.get('year') or 'ID' not in result1.bib

    for field in BIB_FIELDS_TRANSFER:
        if len(str(result2.bib.get(field, ''))) > len(
                str(result1.bib.get(field, ''))):
            result1.bib[field] = str(result2.bib[field])

    for field in ['ID', 'ENTRYTYPE']:
        if field in result2.bib:
            result1.bib[field] = str(result2.bib[field])

    if 'ID' not in result2.bib and to_update_id:
        if 'ID' in result1.bib:
            del result1.bib['ID']
        fixBibData(result1.bib, 1)

    for field in result2.extra_data:
        if field not in result1.extra_data:
            result1.extra_data[field] = result2.extra_data[field]

    if 'urls' in result2.extra_data:
        for url in result2.extra_data['urls']:
            addUrlIfNew(result1, url['url'], url['type'], url['source'])

    refreshDOIfromURLs(result1)
    return result1
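

def _demoMergeResultData():
    # Hypothetical usage sketch, not part of the original module: assumes the
    # SearchResult(index, bib, source, extra_data) constructor used elsewhere
    # in this file, and that 'title' is in BIB_FIELDS_TRANSFER; the titles and
    # DOI below are illustrative placeholders.
    local = SearchResult(0, {'title': 'A Study', 'year': '2017',
                             'ENTRYTYPE': 'article'}, 'scholar', {})
    remote = SearchResult(1, {'title': 'A Study of Merging Metadata',
                              'doi': '10.1234/example'}, 'crossref', {})
    merged = mergeResultData(local, remote)
    # the longer title from result2 wins; result1 is modified in place
    assert merged.bib['title'] == 'A Study of Merging Metadata'
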
    def search(self,
               query,
               min_year=None,
               max_year=None,
               max_results=MAX_RESULTS):
        # TODO implement max_year
        if min_year:
            # monkey-patch scholarly's search URL template to filter results
            # by minimum publication year
            scholarly.scholarly._PUBSEARCH = ('/scholar?as_ylo=' +
                                              str(min_year) + '&q={0}')

        pubs = scholarly.search_pubs_query(query)
        results = []
        index = 0
        for result in tqdm(pubs, desc="Getting results", total=max_results):
            bib = fixBibData(result.bib, index)

            extra_data = {}

            for field in ['citedby', 'url_scholarbib']:
                if hasattr(result, field):
                    extra_data[field] = getattr(result, field)

            if hasattr(result, 'id_scholarcitedby'):
                extra_data['scholarid'] = result.id_scholarcitedby

            for field in ['url', 'eprint']:
                if hasattr(result, field):
                    bib[field] = getattr(result, field)
                    addUrlIfNewWithType(result, getattr(result, field),
                                        'scholar')

            doi = getDOIfromURL(bib.get('url', ''))
            if not doi:
                doi = getDOIfromURL(bib.get('eprint', ''))

            if doi:
                bib['doi'] = doi

            result = SearchResult(index, bib, result.source, extra_data)
            results.append(result)
            index += 1

            if len(results) == max_results:
                break

            if len(results) % 10 == 0:
                # pause periodically so we don't hit Google Scholar too fast
                self.randomSleep()
        return results
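
    def _demoSearch(self):
        # Hypothetical usage sketch, not part of the original class: runs a
        # Google Scholar query through search() above; the query string and
        # limits are illustrative placeholders.
        results = self.search('neural machine translation',
                              min_year=2018,
                              max_results=20)
        for r in results:
            print(r.bib.get('title'), '|', r.bib.get('doi', '<no doi>'))
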
def readRIS(filename):
    """Reads a RIS file and returns a list of bibtex-style entry dicts."""
    with open(filename, 'r') as f:
        entries = readris(f)

    res = []

    for entry in entries:
        entry['author'] = authorListFromListOfAuthors(entry.get('authors', []))
        if 'authors' in entry:
            del entry['authors']

        new_type = reverse_type_mapping.get(entry.get('type_of_reference'),
                                            'article')

        entry['ENTRYTYPE'] = new_type
        entry = fixBibData(entry, 0)
        res.append(entry)

    return res
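

def _demoReadRIS():
    # Hypothetical usage sketch: 'references.ris' is a placeholder filename
    # for any RIS export (e.g. from Zotero or EndNote).
    for entry in readRIS('references.ris'):
        print(entry['ENTRYTYPE'], entry.get('title', ''))
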
def enrichMetadata(paper: Paper, identity):
    """
    Tries to retrieve metadata from Crossref and an abstract from
    SemanticScholar for a given paper, falling back on PubMed, arXiv,
    Unpaywall, and finally Google Scholar bib data.

    :param paper: Paper instance, enriched in place
    :param identity: identity passed through to the individual scrapers
    """
    paper.title = basicTitleCleaning(paper.title)
    original_title = paper.title

    if paper.pmid and not paper.extra_data.get("done_pubmed"):
        pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # if we don't have a DOI, we need to find it on Crossref
    if not paper.doi and not paper.extra_data.get('done_crossref', False):
        crossref_scraper.matchPaperFromResults(paper, identity)

        if paper.doi:
            new_bib = getBibtextFromDOI(paper.doi)
            paper = mergeResultData(
                paper, SearchResult(1, new_bib[0], 'crossref',
                                    paper.extra_data))
        paper.extra_data['done_crossref'] = True

    # if we have a DOI and we haven't got the abstract yet
    if paper.doi and not paper.extra_data.get('done_semanticscholar'):
        semanticscholarmetadata.getMetadata(paper)
        paper.extra_data['done_semanticscholar'] = True

    # try PubMed if we still don't have a PMID
    if not paper.pmid and not paper.extra_data.get('done_pubmed'):
        # if (not paper.doi or not paper.has_full_abstract) and not paper.pmid and not paper.extra_data.get('done_pubmed'):
        if pubmed_scraper.matchPaperFromResults(paper,
                                                identity,
                                                ok_title_distance=0.4):
            pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # still no SemanticScholar ID? try matching the paper on SemanticScholar
    if not paper.extra_data.get('ss_id') and not paper.extra_data.get(
            'done_semanticscholar'):
        semanticscholarmetadata.matchPaperFromResults(paper, identity)
        paper.extra_data['done_semanticscholar'] = True

    # # time to try Scopus, see if it's behind a paywall
    # if not paper.doi and not paper.extra_data.get('done_scopus'):
    #     semanticscholarmetadata.getMetadata(paper)
    #     paper.extra_data['done_semanticscholar'] = True

    # if we don't have an abstract maybe it's on arXiv
    if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'):
        # if not paper.extra_data.get('done_arxiv'):
        arxiv_scraper.matchPaperFromResults(paper,
                                            identity,
                                            ok_title_distance=0.35)
        paper.extra_data['done_arxiv'] = True

    # try to get open access links if DOI present and missing PDF link
    if not paper.has_pdf_link and paper.doi and not paper.extra_data.get(
            'done_unpaywall'):
        unpaywall_scraper.getMetadata(paper, identity)
        paper.extra_data['done_unpaywall'] = True

    # if all else has failed but we have a link to Google Scholar bib data, get that
    if not paper.year and paper.extra_data.get('url_scholarbib'):
        scholar_scraper.getBibtex(paper)

    if paper.title != original_title:
        print('Original: %s\nNew: %s' % (original_title, paper.title))
    paper.bib = fixBibData(paper.bib, 1)
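

def _demoEnrichMetadata(paper, identity):
    # Hypothetical usage sketch: enriches a Paper in place, then reports which
    # sources were consulted via the done_* flags that enrichMetadata() sets
    # in extra_data. 'paper' and 'identity' come from the surrounding app.
    enrichMetadata(paper, identity)
    consulted = sorted(k for k in paper.extra_data if k.startswith('done_'))
    print('Sources tried:', consulted, '| DOI:', paper.doi)
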
    def getMetadata(self, paper, get_citing_papers=False):
        """Fetches SemanticScholar metadata for a paper via its DOI or ss_id,
        optionally also collecting the papers that cite it."""
        if not paper.doi and not paper.extra_data.get('ss_id'):
            raise ValueError('paper has no DOI or SSID')

        unique_id = paper.extra_data.get('ss_id') or paper.doi

        url = 'https://api.semanticscholar.org/v1/paper/' + unique_id

        r = self.request(url)
        d = r.json()

        if 'error' in d:
            print("SemanticScholar error:", d['error'])
            return

        for field in ['abstract', 'year', 'venue']:
            if d.get(field):
                paper.bib[field] = str(d[field])

        if d.get('arxivId'):
            paper.arxivid = d['arxivId']

        for topic in d['topics']:
            # we really don't need to store the url, it's just
            # https://www.semanticscholar.org/topic/{topicId}
            topic.pop('url', None)

        authors = self.loadSSAuthors(d['authors'])
        paper.bib['author'] = authorListFromDict(authors)

        paper.extra_data['ss_topics'] = d['topics']
        paper.extra_data['ss_authors'] = d['authors']
        paper.extra_data['ss_id'] = d['paperId']

        if get_citing_papers:
            citing_papers = []
            for index, citation in enumerate(d['citations']):
                ss_authors = semanticscholarmetadata.loadSSAuthors(
                    citation['authors'])
                authors = authorListFromDict(ss_authors)

                bib = {
                    'title': citation['title'],
                    'author': authors,
                    'year': citation['year'],
                    'doi': citation.get('doi'),
                }
                bib = fixBibData(bib, index)

                extra_data = {
                    'ss_id': citation['paperId'],
                    'ss_influential': citation['isInfluential'],
                    'ss_authors': ss_authors
                }
                if citation.get('arxivId'):
                    extra_data['arxivid'] = citation.get('arxivId')

                new_paper = Paper(bib, extra_data)
                citing_papers.append(new_paper)
            return paper, citing_papers
        return paper
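
    def _demoGetMetadata(self, paper):
        # Hypothetical usage sketch: fetches SemanticScholar metadata together
        # with the papers citing it; requires paper.doi or
        # paper.extra_data['ss_id'] to be set. Note that getMetadata() returns
        # None when the API reports an error.
        paper, citing = self.getMetadata(paper, get_citing_papers=True)
        print('ss_id:', paper.extra_data.get('ss_id'),
              '| cited by', len(citing), 'papers')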