Example #1
import re


def loadRefsFromHTML(filename):
    with open(filename) as f:
        html = f.read()

    html = html[html.find('<body>') + 6:]
    # html = re.sub('.+<body>', '', html, flags=re.DOTALL)
    # split on the blank-paragraph run that separates reference entries
    # (no capturing group, so the separators are not returned as entries)
    entries = re.split('<p>\n<p>\n<p>', html)
    res = []

    for entry in entries:
        lines = entry.split('\n')
        new_bib = {}

        for line in lines:
            match = re.search('<b>Reference Type: </b> (.+?)<p>', line)
            if match:
                if match.group(1) in type_mapping:
                    new_bib['ENTRYTYPE'] = type_mapping[match.group(1)]
                else:
                    new_bib['ENTRYTYPE'] = 'article'

            for bib_map in mapping:
                match = re.search('<b>' + bib_map[0] + ':</b> (.+?)<p>', line)
                if match:
                    new_bib[bib_map[1]] = match.group(1)

        for match in re.finditer('<A HREF="(.+?)">', entry):
            if isPDFURL(match.group(1)):
                new_bib['eprint'] = match.group(1)
            else:
                new_bib['url'] = match.group(1)

        res.append(new_bib)

    return res
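
# loadRefsFromHTML relies on three module-level helpers that are not shown in
# this snippet: type_mapping, mapping and isPDFURL.  The definitions below are
# a minimal sketch of what they are assumed to look like; the field names are
# illustrative, not the original tables.

# maps the "Reference Type" label in the HTML export to a BibTeX entry type
type_mapping = {
    'Journal Article': 'article',
    'Book': 'book',
    'Conference Paper': 'inproceedings',
}

# (HTML label, BibTeX field) pairs searched for on every line of an entry
mapping = [
    ('Author', 'author'),
    ('Year', 'year'),
    ('Title', 'title'),
    ('Journal', 'journal'),
]


def isPDFURL(url):
    # crude heuristic: treat a link as a direct PDF if its path ends in .pdf
    return url.lower().split('?')[0].endswith('.pdf')
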
Example #2
import os
import json
import datetime
import urllib.parse
from multiprocessing.pool import ThreadPool

import pandas as pd


def bulkDownload(papers,
                 root_dir,
                 report_path,
                 do_not_download_just_list=False):
    root_dir = os.path.abspath(root_dir)

    if not os.path.exists(root_dir):
        os.makedirs(root_dir)

    download_tasks = []

    for paper in papers:
        # if not paper.year:
        #     print("missing year", paper)

        filename = os.path.join(root_dir, generateFilename(paper)) + '.pdf'
        paper.pdf_filename = filename

        task_record = {
            'id': paper.id,
            'doi': paper.doi,
            'filename': filename,
            'abstract': paper.abstract
        }
        url = None
        url_source = None

        for url_rec in paper.extra_data.get('urls', []):
            if url_rec['type'] == 'pdf':
                url = url_rec['url']
                url_source = url_rec['source']
                break

        if not url:
            if paper.bib.get('eprint'):
                url = paper.bib['eprint']
                url_source = 'search'
            elif paper.bib.get('url') and isPDFURL(paper.bib['url']):
                url = paper.bib['url']
                url_source = 'search'

        if url:
            task_record['url'] = url
            task_record['url_source'] = url_source
            download_tasks.append(task_record)
        else:
            print(paper.extra_data)
            print(paper.bib)
            print()

    df = pd.DataFrame(download_tasks)
    df.to_csv('download_tasks.csv')

    if do_not_download_just_list:
        return

    results = ThreadPool(8).imap_unordered(fetch_url, download_tasks)

    df = pd.DataFrame(results)
    df.to_csv(report_path)
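
# bulkDownload assumes two helpers that are not part of this snippet:
# generateFilename, which builds a file name from a paper's metadata, and
# fetch_url, which downloads one task record's url to its filename.  The
# sketches below only illustrate the assumed contracts, not the original code.
import re

import requests


def generateFilename(paper):
    # e.g. "Smith_2019_A_study_of_things", built from author, year and title
    first_author = paper.bib.get('author', 'unknown').split(' and ')[0]
    first_author = re.sub(r'\W+', '_', first_author)
    title = re.sub(r'\W+', '_', paper.bib.get('title', 'untitled'))[:60]
    return '%s_%s_%s' % (first_author, paper.bib.get('year', ''), title)


def fetch_url(task):
    # download one task record and return it augmented with the outcome,
    # so the final report CSV has one row per attempted download
    try:
        r = requests.get(task['url'], timeout=30)
        r.raise_for_status()
        with open(task['filename'], 'wb') as f:
            f.write(r.content)
        task['status'] = 'ok'
    except Exception as e:
        task['status'] = 'error: %s' % e
    return task
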
    def getMetadata(self, paper, identity):
        if not paper.doi:
            raise ValueError("Paper has no DOI")

        url = 'https://api.unpaywall.org/v2/%s?email=%s' % (paper.doi,
                                                            identity)

        r = self.request(url)

        data = r.json()
        if data.get('error'):
            return

        top_url = data.get('best_oa_location')
        if not top_url:
            return

        if top_url.get('url_for_pdf'):
            addUrlIfNew(paper, top_url['url_for_pdf'], 'pdf', 'unpaywall')
        if top_url.get('url_for_landing_page'):
            addUrlIfNew(paper, top_url['url_for_landing_page'], 'main',
                        'unpaywall')
        if top_url.get('url'):
            url = top_url['url']
            if isPDFURL(url):
                url_type = 'pdf'
            else:
                url_type = 'main'

            addUrlIfNew(paper, url, url_type, 'unpaywall')

        paper.extra_data['done_unpaywall'] = True
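
# getMetadata and the search methods below record discovered links through
# addUrlIfNew, which is not shown in this snippet.  The assumed behaviour (a
# sketch only) is to append a {'url', 'type', 'source'} record to the object's
# extra_data['urls'] unless that URL is already known:


def addUrlIfNew(paper, url, url_type, source):
    urls = paper.extra_data.setdefault('urls', [])
    if not any(rec['url'] == url for rec in urls):
        urls.append({'url': url, 'type': url_type, 'source': source})
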
    def search(self,
               title,
               identity,
               max_results=5,
               min_year=None,
               max_year=None):
        url = 'https://www.semanticscholar.org/api/1/search'

        yearFilter = None

        if min_year or max_year:
            yearFilter = {}
            if not max_year:
                now = datetime.datetime.now()
                max_year = now.year

            if min_year:
                yearFilter['min'] = int(min_year)
            if max_year:
                yearFilter['max'] = int(max_year)

        results_left = max_results
        page_num = 1

        return_results = []

        while results_left > 0:
            data = {
                "queryString": title,
                "page": page_num,
                "pageSize": 10,
                "sort": "relevance",
                "authors": [],
                "coAuthors": [],
                "venues": [],
                "yearFilter": yearFilter,
                "requireViewablePdf": False,
                "publicationTypes": [],
                "externalContentTypes": []
            }

            r = self.request(url, data=data, post=True)

            results_dict = r.json()

            total_results = results_dict.get('totalResults')
            if total_results and total_results < max_results:
                # fewer results are available than requested: shrink the target
                results_left -= max_results - total_results
                max_results = total_results

            results = results_dict.get('results', [])

            if not results:
                # no more results from the API: stop paginating
                break

            # only take as many results as are still needed
            results = results[:results_left]
            results_left -= len(results)

            for index, res in enumerate(results):

                res_title = res['title']['text']

                authors_processed = []
                for author_list in res['authors']:
                    for author_dict in author_list:
                        if 'name' in author_dict:
                            authors_processed.append(author_dict)

                authors = self.loadSSAuthors(authors_processed)

                bib = {
                    'title': res_title,
                    'abstract': res['paperAbstract']['text'],
                    'year': res['year']['text'],
                    'url': 'https://www.semanticscholar.org/paper/{}/{}'.format(
                        res['slug'], res['id']),
                    'author': authorListFromDict(authors),
                }

                if res.get('doiInfo'):
                    bib['doi'] = res['doiInfo'].get('doi')

                extra_data = {'ss_id': res['id'], 'x_authors': authors}

                new_res = SearchResult(index, bib, 'semanticscholar',
                                       extra_data)

                for link in res.get('links', []):
                    if isPDFURL(link['url']):
                        bib['eprint'] = link['url']
                        addUrlIfNew(new_res, link['url'], 'pdf',
                                    'semanticscholar')

                venue = res['venue'].get('text')
                extra_data['venue'] = venue
                return_results.append(new_res)

            # move on to the next page of results
            page_num += 1

        return return_results
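
# Both search methods build the BibTeX "author" field via authorListFromDict,
# which is not shown here.  This sketch assumes author dicts carrying
# 'given'/'family' keys (as the Crossref branch produces) and joins them in
# the usual "Family, Given and Family, Given" BibTeX form:


def authorListFromDict(authors):
    names = []
    for author in authors:
        family = author.get('family', '').strip()
        given = author.get('given', '').strip()
        if family and given:
            names.append('%s, %s' % (family, given))
        else:
            names.append(family or given)
    return ' and '.join(names)
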
    def search(self, title, identity, year=None, max_results=1):
        """
        Searches Crossref and returns a number of results

        :param title: article title
        :param identity: email address to provide to Crossref
        :param year: publication year
        :param max_results:
        :return: list of Crossref JSON data results
        """
        title = urllib.parse.quote(title, safe='')
        headers = {'User-Agent': 'ReviewBuilder(mailto:%s)' % identity}
        # changed because of https://status.crossref.org/incidents/4y45gj63jsp4
        url = 'https://api.crossref.org/works?rows={}&query.bibliographic={}'.format(
            max_results, title)
        if year:
            url += '&query.published=' + str(year)

        r = self.request(url, headers)

        d = r.json()
        if d['status'] != 'ok':
            raise ValueError('Error in request: ' +
                             d.get('status', 'NO STATUS') + ' ' +
                             str(d.get('message', 'NO MESSAGE')))

        results = []
        for index, item in enumerate(d['message']['items']):
            # print(item.get('type'))
            new_bib = {
                'doi': item['DOI'],
                'title': basicTitleCleaning(removeListWrapper(item['title']))
            }

            if 'container-title' in item:
                # reference-entry, book

                if item.get('type') in ['journal-article', 'reference-entry']:
                    new_bib['journal'] = removeListWrapper(
                        item['container-title'])
                    new_bib['ENTRYTYPE'] = 'article'
                elif item.get('type') in ['book-chapter']:
                    new_bib['ENTRYTYPE'] = 'inbook'
                    new_bib['booktitle'] = removeListWrapper(
                        item['container-title'])
                elif item.get('type') in ['proceedings-article']:
                    new_bib['ENTRYTYPE'] = 'inproceedings'
                    new_bib['booktitle'] = removeListWrapper(
                        item['container-title'])

            if item.get('type') in ['book']:
                new_bib['ENTRYTYPE'] = 'book'

            if item.get('type') not in [
                    'journal-article', 'reference-entry', 'book',
                    'book-chapter', 'proceedings-article'
            ]:
                print(json.dumps(item, indent=3))

            for field in [
                ('publisher-location', 'address'),
                ('publisher', 'publisher'),
                ('issue', 'issue'),
                ('volume', 'volume'),
                ('page', 'pages'),
            ]:
                if field[0] in item:
                    new_bib[field[1]] = str(item[field[0]])

            if 'URL' in item:
                new_bib['url'] = item['URL']

            if "issued" in item:
                date_parts = item['issued']['date-parts'][0]
                new_bib['year'] = str(date_parts[0])
                if len(date_parts) > 1:
                    new_bib['month'] = str(date_parts[1])
                if len(date_parts) > 2:
                    new_bib['day'] = str(date_parts[2])

            authors = []
            for author in item.get('author', []):
                authors.append({
                    'given': author.get('given', ''),
                    'family': author.get('family', '')
                })

            if item.get('author'):
                new_bib['author'] = authorListFromDict(authors)

            new_extra = {
                'x_authors': authors,
                'language': item.get('language')
            }

            new_res = SearchResult(index, new_bib, 'crossref', new_extra)

            addUrlIfNew(new_res, item['URL'], 'main', 'crossref')

            if 'link' in item:
                for link in item['link']:
                    if isPDFURL(link['URL']):
                        addUrlIfNew(new_res, link['URL'], 'pdf', 'crossref')

            results.append(new_res)

        return results
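
# The Crossref branch uses two small helpers not shown above.  Crossref
# returns fields such as 'title' and 'container-title' as single-element
# lists, which removeListWrapper is assumed to unwrap; basicTitleCleaning is
# assumed to do light whitespace normalisation.  Sketches only:


def removeListWrapper(value):
    if isinstance(value, list):
        return value[0] if value else ''
    return value


def basicTitleCleaning(title):
    # collapse internal whitespace and trim; the real helper may do more
    return ' '.join(title.split())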