import datetime
import json
import os
import re
import urllib.parse
from multiprocessing.pool import ThreadPool

import pandas as pd


def loadRefsFromHTML(filename):
    """Parse an exported HTML reference list into a list of BibTeX-style dicts."""
    with open(filename) as f:
        html = f.read()

    # Keep only the document body
    html = html[html.find('<body>') + 6:]
    # html = re.sub('.+<body>', '', html, flags=re.DOTALL)

    entries = re.split(r'(<p>\n<p>\n<p>)', html)

    res = []
    for entry in entries:
        lines = entry.split('\n')
        new_bib = {}
        for line in lines:
            # Map the exporter's "Reference Type" onto a BibTeX entry type
            match = re.search(r'<b>Reference Type: </b> (.+?)<p>', line)
            if match:
                if match.group(1) in type_mapping:
                    new_bib['ENTRYTYPE'] = type_mapping[match.group(1)]
                else:
                    new_bib['ENTRYTYPE'] = 'article'

            # Copy each mapped field (e.g. Author, Title, Year) into the bib dict
            for bib_map in mapping:
                match = re.search('<b>' + bib_map[0] + r':</b> (.+?)<p>', line)
                if match:
                    new_bib[bib_map[1]] = match.group(1)

        # Collect links: direct PDF links become 'eprint', anything else becomes 'url'
        for match in re.finditer('<A HREF="(.+?)">', entry):
            if isPDFURL(match.group(1)):
                new_bib['eprint'] = match.group(1)
            else:
                new_bib['url'] = match.group(1)

        res.append(new_bib)
    return res
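# isPDFURL() is referenced throughout this module but is not defined in this
# excerpt. Below is a minimal sketch of the heuristic it is assumed to
# implement; the repository's actual check may be more thorough (e.g. looking
# at Content-Type headers rather than the URL alone).
def isPDFURL(url):
    """Rough guess at whether a URL points directly at a PDF file."""
    path = urllib.parse.urlparse(url).path.lower()
    return path.endswith('.pdf') or '/pdf/' in url.lower()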
def bulkDownload(papers, root_dir, report_path, do_not_download_just_list=False):
    """Build one download task per paper, save the task list, then fetch the PDFs in parallel."""
    root_dir = os.path.abspath(root_dir)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)

    download_tasks = []
    for paper in papers:
        # if not paper.year:
        #     print("missing year", paper)
        filename = os.path.join(root_dir, generateFilename(paper)) + '.pdf'
        paper.pdf_filename = filename

        task_record = {
            'id': paper.id,
            'doi': paper.doi,
            'filename': filename,
            'abstract': paper.abstract
        }

        # Prefer a PDF URL collected from the metadata providers
        url = None
        url_source = None
        for url_rec in paper.extra_data.get('urls', []):
            if url_rec['type'] == 'pdf':
                url = url_rec['url']
                url_source = url_rec['source']
                break

        # Fall back to URLs found during the original search
        if not url:
            if paper.bib.get('eprint'):
                url = paper.bib['eprint']
                url_source = 'search'
            elif paper.bib.get('url') and isPDFURL(paper.bib['url']):
                url = paper.bib['url']
                url_source = 'search'

        if url:
            task_record['url'] = url
            task_record['url_source'] = url_source
            download_tasks.append(task_record)
        else:
            print(paper.extra_data)
            print(paper.bib)
            print()

    df = pd.DataFrame(download_tasks)
    df.to_csv('download_tasks.csv')

    if do_not_download_just_list:
        return

    results = ThreadPool(8).imap_unordered(fetch_url, download_tasks)
    df = pd.DataFrame(results)
    df.to_csv(report_path)
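# fetch_url(), the worker handed to the ThreadPool above, is not part of this
# excerpt. A minimal sketch follows, under the assumption that each task dict
# carries the 'url' and 'filename' keys built in bulkDownload() and that the
# worker returns a flat dict suitable for the pandas report. The repository's
# real downloader may use a different HTTP client and richer error handling.
import requests  # assumption: plain requests-based download


def fetch_url(task):
    """Download task['url'] to task['filename'] and report success or failure."""
    record = dict(task)
    try:
        r = requests.get(task['url'], timeout=30)
        r.raise_for_status()
        with open(task['filename'], 'wb') as f:
            f.write(r.content)
        record['status'] = 'ok'
    except Exception as e:
        record['status'] = 'error'
        record['error'] = str(e)
    return record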
def getMetadata(self, paper, identity):
    """Query Unpaywall by DOI and record any open-access URLs it reports."""
    if not paper.doi:
        raise ValueError("Paper has no DOI")

    url = 'https://api.unpaywall.org/v2/%s?email=%s' % (paper.doi, identity)
    r = self.request(url)
    data = r.json()

    if data.get('error') == 'true':
        return

    top_url = data.get('best_oa_location')
    if not top_url:
        return

    if top_url.get('url_for_pdf'):
        addUrlIfNew(paper, top_url['url_for_pdf'], 'pdf', 'unpaywall')
    if top_url.get('url_for_landing_page'):
        addUrlIfNew(paper, top_url['url_for_landing_page'], 'main', 'unpaywall')
    if top_url.get('url'):
        url = top_url['url']
        if isPDFURL(url):
            url_type = 'pdf'
        else:
            url_type = 'main'
        addUrlIfNew(paper, url, url_type, 'unpaywall')

    paper.extra_data['done_unpaywall'] = True
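# addUrlIfNew() is called by all of the metadata and search providers here but
# is not defined in this excerpt. A minimal sketch, assuming it appends a
# {'url', 'type', 'source'} record to paper.extra_data['urls'] (the shape that
# bulkDownload() reads) while skipping URLs that are already stored:
def addUrlIfNew(paper, url, url_type, source):
    """Record a URL for a paper unless an identical URL is already stored."""
    urls = paper.extra_data.setdefault('urls', [])
    if not any(rec['url'] == url for rec in urls):
        urls.append({'url': url, 'type': url_type, 'source': source})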
def search(self, title, identity, max_results=5, min_year=None, max_year=None):
    """Search Semantic Scholar for a title and return a list of SearchResult objects."""
    url = 'https://www.semanticscholar.org/api/1/search'

    yearFilter = None
    if min_year or max_year:
        yearFilter = {}
        if not max_year:
            now = datetime.datetime.now()
            max_year = now.year
        if min_year:
            yearFilter['min'] = int(min_year)
        if max_year:
            yearFilter['max'] = int(max_year)

    results_left = max_results
    page_num = 1
    return_results = []

    while results_left > 0:
        data = {
            "queryString": title,
            "page": page_num,
            "pageSize": 10,
            "sort": "relevance",
            "authors": [],
            "coAuthors": [],
            "venues": [],
            "yearFilter": yearFilter,
            "requireViewablePdf": False,
            "publicationTypes": [],
            "externalContentTypes": []
        }
        r = self.request(url, data=data, post=True)
        results_dict = r.json()

        # Never ask for more results than the API says exist
        if results_dict.get('totalResults') and max_results != results_dict['totalResults']:
            max_results = min(max_results, results_dict['totalResults'])
            results_left = min(results_left, max_results)

        results = results_dict.get('results', [])
        if not results:
            break

        # Only keep as many results as we still need
        results = results[:results_left]
        results_left -= len(results)

        for index, res in enumerate(results):
            res_title = res['title']['text']

            authors_processed = []
            for author_list in res['authors']:
                for author_dict in author_list:
                    if 'name' in author_dict:
                        authors_processed.append(author_dict)
            authors = self.loadSSAuthors(authors_processed)

            bib = {
                'title': res_title,
                'abstract': res['paperAbstract']['text'],
                'year': res['year']['text'],
                'url': 'https://www.semanticscholar.org/paper/{}/{}'.format(
                    res['slug'], res['id']),
                'author': authorListFromDict(authors),
            }
            if res.get('doiInfo'):
                bib['doi'] = res['doiInfo'].get('doi')

            extra_data = {'ss_id': res['id'], 'x_authors': authors}
            new_res = SearchResult(index, bib, 'semanticscholar', extra_data)

            for link in res.get('links', []):
                if isPDFURL(link['url']):
                    bib['eprint'] = link['url']
                    addUrlIfNew(new_res, link['url'], 'pdf', 'semanticscholar')

            venue = res['venue'].get('text')
            extra_data['venue'] = venue
            return_results.append(new_res)

        page_num += 1

    return return_results
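# authorListFromDict() is used by both search() methods but is not shown here.
# A minimal sketch, assuming the author dicts carry 'given'/'family' keys (the
# shape built in the Crossref search below) and that the output is a
# BibTeX-style "Family, Given and Family, Given" string:
def authorListFromDict(authors):
    """Join author dicts into a single BibTeX-style author string."""
    names = []
    for author in authors:
        family = author.get('family', '').strip()
        given = author.get('given', '').strip()
        names.append(', '.join(part for part in (family, given) if part))
    return ' and '.join(names)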
def search(self, title, identity, year=None, max_results=1):
    """
    Searches Crossref and returns a number of results

    :param title: article title
    :param identity: email address to provide to Crossref
    :param year: publication year
    :param max_results: maximum number of results to request
    :return: list of SearchResult objects built from Crossref JSON data
    """
    title = urllib.parse.quote(title, safe='')
    headers = {'User-Agent': 'ReviewBuilder(mailto:%s)' % identity}

    # changed because of https://status.crossref.org/incidents/4y45gj63jsp4
    url = 'https://api.crossref.org/works?rows={}&query.bibliographic={}'.format(
        max_results, title)
    if year:
        url += '&query.published=' + str(year)

    r = self.request(url, headers)
    d = r.json()
    if d['status'] != 'ok':
        raise ValueError('Error in request:' + d.get('status', 'NO STATUS') +
                         str(d.get('message', 'NO MESSAGE')))

    results = []
    for index, item in enumerate(d['message']['items']):
        # print(item.get('type'))
        new_bib = {
            'doi': item['DOI'],
            'title': basicTitleCleaning(removeListWrapper(item['title']))
        }

        # Map the Crossref record type onto a BibTeX entry type
        if 'container-title' in item:
            # reference-entry, book
            if item.get('type') in ['journal-article', 'reference-entry']:
                new_bib['journal'] = removeListWrapper(item['container-title'])
                new_bib['ENTRYTYPE'] = 'article'
            elif item.get('type') in ['book-chapter']:
                new_bib['ENTRYTYPE'] = 'inbook'
                new_bib['booktitle'] = removeListWrapper(item['container-title'])
            elif item.get('type') in ['proceedings-article']:
                new_bib['ENTRYTYPE'] = 'inproceedings'
                new_bib['booktitle'] = removeListWrapper(item['container-title'])

        if item.get('type') in ['book']:
            new_bib['ENTRYTYPE'] = 'book'

        if item.get('type') not in [
                'journal-article', 'reference-entry', 'book', 'book-chapter',
                'proceedings-article'
        ]:
            print(json.dumps(item, indent=3))

        # Straightforward field-to-field copies
        for field in [
            ('publisher-location', 'address'),
            ('publisher', 'publisher'),
            ('issue', 'issue'),
            ('volume', 'volume'),
            ('page', 'pages'),
        ]:
            if field[0] in item:
                new_bib[field[1]] = str(item[field[0]])

        if 'URL' in item:
            new_bib['url'] = item['URL']

        if 'issued' in item:
            date_parts = item['issued']['date-parts'][0]
            new_bib['year'] = str(date_parts[0])
            if len(date_parts) > 1:
                new_bib['month'] = str(date_parts[1])
            if len(date_parts) > 2:
                new_bib['day'] = str(date_parts[2])

        authors = []
        for author in item.get('author', []):
            authors.append({
                'given': author.get('given', ''),
                'family': author.get('family', '')
            })
        if item.get('author'):
            new_bib['author'] = authorListFromDict(authors)

        new_extra = {
            'x_authors': authors,
            'language': item.get('language')
        }
        new_res = SearchResult(index, new_bib, 'crossref', new_extra)

        if 'URL' in item:
            addUrlIfNew(new_res, item['URL'], 'main', 'crossref')
        if 'link' in item:
            for link in item['link']:
                if isPDFURL(link['URL']):
                    addUrlIfNew(new_res, link['URL'], 'pdf', 'crossref')

        results.append(new_res)
    return results
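# removeListWrapper() and basicTitleCleaning() are helpers from elsewhere in
# the repository. Below is a minimal sketch of removeListWrapper only, under
# the assumption that it simply unwraps the one-element lists Crossref uses
# for fields such as 'title' and 'container-title'; the real helper may do more.
def removeListWrapper(value):
    """Unwrap a single-element list returned by the Crossref API."""
    if isinstance(value, list):
        return value[0] if value else ''
    return value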