from crossref.restful import Works


def search_doi(journal_title, start_date, end_date, print_issn, online_issn, count):
    """
    A defined number of DOIs are put into a list.
    If no DOIs are found using the online ISSN, the print ISSN is tried instead.
    :param journal_title: The title of the journal
    :param start_date: The start date in the format yyyy-mm-dd
    :param end_date: The end date in the format yyyy-mm-dd
    :param print_issn: The International Standard Serial Number for the print journal
    :param online_issn: The International Standard Serial Number for the online journal
    :param count: How many DOIs the method should try to find
    :return: A list of DOIs; this can be empty, but should be no larger than count
    """
    works = Works()
    received_doi = []
    if print_issn == '' and online_issn == '':
        return 'Both ISSNs are empty'
    if online_issn != '':  # online ISSN exists
        for i in works.query(journal_title).filter(
                issn=online_issn,
                from_pub_date=start_date,
                until_pub_date=end_date).sample(count).select('DOI'):
            received_doi.append(i['DOI'])
    if print_issn != '':  # print ISSN exists
        if not received_doi:
            for j in works.query(journal_title).filter(
                    issn=print_issn,
                    from_pub_date=start_date,
                    until_pub_date=end_date).sample(count).select('DOI'):
                received_doi.append(j['DOI'])
    return received_doi
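# A minimal usage sketch for search_doi above; the journal title, ISSN, and
# date range are hypothetical placeholder values, not taken from the source.
dois = search_doi(journal_title='Example Journal of Examples',
                  start_date='2019-01-01', end_date='2019-12-31',
                  print_issn='', online_issn='1234-5678', count=5)
print(dois)  # a list of up to 5 DOIs, possibly empty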
def query_crossref(self, author=None, pub=None):
    from crossref.restful import Works
    works = Works()
    if author is not None and pub is not None:
        return works.query(title=pub, author=author)
    elif author is not None:
        return works.query(author=author)
    elif pub is not None:
        return works.query(title=pub)
    # implicitly returns None when neither author nor pub is given
from crossref.restful import Works


def cross(L):
    works = Works()
    print("no")
    for e in works.query("cancer"):
        print("yes")
        print(e["DOI"])
        L.append(e["DOI"])
def search(request):
    keyword = request.GET.get('search')
    if keyword is None:
        return render(request, 'search.html', {})
    else:
        find_keyword = Sea.objects.filter(keyword__iexact=keyword)
        l = len(find_keyword)
        if l != 0:
            print('Record found in the Sea database')
            return render(request, 'search.html', {
                'results': find_keyword,
                'keyword': keyword
            })
        else:
            print('No record found in the Sea database')
            works = Works()
            w1 = works.query(title=keyword)
            for index, item in enumerate(w1):
                if index > 499:
                    break
                result = searchDo.parse_result(keyword, item)
                # save to the database
                sea_record = Sea(**result)
                sea_record.save()
                print(str(index) + ' record is ok')
            find_keyword = Sea.objects.filter(keyword__iexact=keyword)
            return render(request, 'search.html', {
                'results': find_keyword,
                'keyword': keyword
            })
def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    work = Works(etiquette=my_etiquette)
    logger.debug(six.u('crossref fulltext search:\n') + six.u(txt))
    # get the most likely match of the first results
    # results = []
    # for i, r in enumerate(work.query(txt).sort('score')):
    #     results.append(r)
    #     if i > 50:
    #         break
    query = work.query(txt, **kw).sort('score')
    query_result = query.do_http_request(
        'get', query.url, custom_header=str(query.etiquette)).text
    results = json.loads(query_result)['message']['items']
    if len(results) > 1:
        maxscore = 0
        result = results[0]
        for res in results:
            score = _crossref_score(txt, res)
            if score > maxscore:
                maxscore = score
                result = res
        logger.info('score: ' + str(maxscore))
    elif len(results) == 0:
        raise ValueError('crossref fulltext: no results')
    else:
        result = results[0]
    # convert to bibtex
    return crossref_to_bibtex(result).strip()
import json

import requests
from crossref.restful import Works


def get_doi(title, author):
    works = Works()
    work = works.query(bibliographic=title, author=author).url
    response = requests.get(work)
    json_response = json.loads(response.text)
    try:
        document = json_response["message"]["items"][0]
        # print(document["DOI"], document["title"], document["URL"])
        return document.get("URL")
    except IndexError:
        return None
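# A minimal usage sketch for get_doi above; the title/author pair is a
# hypothetical example query, not taken from the source.
url = get_doi("A hypothetical study of examples", "Doe")
print(url)  # resolver URL of the first Crossref match, or None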
def data(request):
    if request.method == 'POST':
        form = SimpleForm(request.POST)
        if form.is_valid():
            # query = input('Enter the query to be searched: ')
            query = form.cleaned_data.get("enterUrl")
            parameter_values_list = [1, 10, '9ipXPomYaSrHLAIuONZfzUGk3t57RcBD']
            response = requests.get(edited_search_coreAPI(query, parameter_values_list))
            # response = requests.get(edited_search_coreAPI(form.enterUrl, parameter_values_list))
            content = response.json()
            works = Works()
            w1 = works.query(container_title='zika', author='johannes',
                             publisher_name='Wiley-Blackwell')
            for item in w1:
                print(item['title'])
            print(content)
            print(type(content))
            context = {
                'form': form,
                'content': content
            }
            messages.success(request, f'Your Url has been generated')
            return redirect("query", data=str(content))
            # return render(request, 'users/query.html', {'content': [content]})
            # return render(json.dumps(content, sort_keys=True, indent=4), 'users/query.html', content_type="application/json")
            # return HttpResponse(json.dumps(content, sort_keys=True, indent=4), content_type="application/json")
            # print(lists[0])
            # form.save()
        else:
            messages.error(request, f'Wrong Url')
            return render(request, 'users/query.html', {'form': form})
    else:
        form = SimpleForm()
    return render(request, 'users/data.html', {'form': form})
def search():
    """
    Uses Crossref API to search documents.
    """
    queries = {}
    for key in ['author', 'words', 'doi']:
        val = request.form[key]
        if len(val) > 0:
            queries[key] = request.form[key]
        else:
            queries[key] = None

    # Init API and query
    works = Works()
    articles_q = []
    if queries['doi']:
        articles_q = [works.doi(doi=queries['doi'])]
    else:
        articles_q = works.query(bibliographic=queries['words'],
                                 author=queries['author']).sample(20)

    # Check if article is in database already and wiki exists
    articles = []
    for article in articles_q:
        articles.append(article)
        doi = article['DOI']
        search_result = mongo.db.paperwiki.find_one({"DOI": doi})
        if search_result:
            if 'content' in search_result.keys():
                article['actionurl'] = "see_wiki?id=" + doi
                article['wiki_exists'] = True
            else:
                article['actionurl'] = "create_wiki?id=" + doi
                article['wiki_exists'] = False
        else:
            insert_id = mongo.db.paperwiki.insert_one(article)
            article['actionurl'] = "create_wiki?id=" + doi
            article['wiki_exists'] = False

    context = {"docs": articles}
    resp = render_template("home.html", docs=articles)
    return resp
def crossrefAPI_query(keyword: str) -> Dict:
    '''This function takes a keyword str and sends an according GET request to
    the CrossRef API. A normalized version of the first (most 'relevant')
    result is returned.'''
    article_dict = False
    works = Works()
    # If there is a timeout, try again (5 times)
    for _ in range(5):
        try:
            result = works.query(keyword).sort("relevance")
            for entry in result:
                # Take first result
                article_dict = entry
                break
            break  # query succeeded, stop retrying
        except Exception:
            pass
    else:
        return
    if article_dict:
        # article_dict = normalize_crossref_dict(article_dict)
        # if contains_minimal_information(article_dict):
        article_dict = add_retrieval_information(article_dict, 'Crossref',
                                                 'unstructured_ID', keyword)
        return article_dict
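# A minimal usage sketch for crossrefAPI_query above; the keyword is a
# hypothetical example, and Works and add_retrieval_information are assumed
# to be imported/defined elsewhere in the module.
record = crossrefAPI_query('nanozyme catalysis review')
if record:
    print(record.get('DOI'), record.get('title'))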
def find_meta(title, doi):
    """ find metadata with title or doi
    Keyword Arguments:
    title -- the article title to search for
    doi   -- the article DOI
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except Exception:
            continue
        if (SequenceMatcher(a=title, b=t).ratio() > 0.9 or
                SequenceMatcher(a=title, b=sub_title).ratio() > 0.9):
            return item
        if i > 18:
            ylog.debug('[x]%s' % title)
            # ylog.debug(item['title'])
            return None
#!/usr/bin/python3
from crossref.restful import Works

# note: paper_title is defined but unused; the query below searches by author
paper_title = "The Emotional Voices Database: Towards Controlling the Emotion Dimension in Voice Generation Systems"

works = Works()
w1 = works.query(author="birkan kolcu").sample(5)
for item in w1:
    print(item['title'])
    text = title + text.get_text()
    title = text
    os.mkdir(path + str(index) + 'figure\\')
    for fig in obj_figure:  # figure images
        fig_url = 'https://onlinelibrary.wiley.com' + fig['src']
        r = requests.get(fig_url, stream=True)
        image_name = fig_url[-20:].split('/')
        with open(path + str(index) + 'figure\\' + image_name[0], 'wb') as ff:
            for chunk in r.iter_content(chunk_size=128):
                ff.write(chunk)
    file = open(path + str(index) + title + '.txt', mode='w+', encoding='UTF-8')
    file.write(abstract + content)
    file.close()
    driver.close()


index = 0
for i in works.query(bibliographic='mof',
                     publisher_name='Wiley-Blackwell').filter(
                         from_online_pub_date='2017').sample(10):
    index += 1
    acquire_text(i['URL'], index)
    print(i['URL'])
# URL = 'http://dx.doi.org/10.1002/aoc.4820'
# acquire_text(URL, index)
from pdfminer.pdfdocument import PDFDocument
from crossref.restful import Works
from PyPDF2 import PdfFileReader
import re
from ylib import ylog
import logging
from difflib import SequenceMatcher
import bibtexparser

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("app")

works = Works()
title = """Heterogeneous resistance to vancomycin in Staphylococcus epidermidis, Staphylococcus haemolyticus and Staphylococcus warneri clinical strains: characterisation"""
w1 = works.query(title).sort('relevance').order('desc')
i = 0
target_doi = '10.1109/icdcs.2006.48'
items_result = None
for item in w1:
    i = i + 1
    try:
        t = item.get('title')[0]
        sub_title = item.get('subtitle')[0]
        ylog.debug('crossref item title ')
        ylog.debug(t)
        ylog.debug(sub_title)
    except Exception:
        ylog.debug(item)
        continue
    if SequenceMatcher(a=title, b=t).ratio() > 0.8:
        items_result = item
        break
def meta_data_search(self, search_terms, save_path, publisher_name='Wiley'):
    """
    A method to collect the metadata from crossref based on specific search
    terms passed in. The results are filtered down to a specific publisher.
    It then saves all the URLs as a single list.

    This function writes full_publist to a json file, which contains a list of
    meta-data for the articles identified in the query.

    Parameters:
        search_terms (list of str): List of search terms used to identify
            articles to be added in the corpus
        save_path (str): Absolute or relative filepath leading to the
            directory where the fulltexts will be saved
        publisher_name (str): The name of the publisher that is to be queried
            for articles. This must correspond to a publisher within the
            CrossRef database.

    Returns:
        full_publist (pd.DataFrame): A dataframe that contains a list of
            articles and their DOIs and URLs
    """
    works = Works()
    members = Members()

    # query publisher for articles with given search terms
    chem_subset = works.query(search_terms)
    pub = next(iter(members.query(publisher_name)))
    pub_id = pub['id']
    chem_subset = chem_subset.filter(member=pub_id)
    doc_number = chem_subset.count()

    headers = {
        'CR-Clickthrough-Client-Token': self.clickthrough,
        "User-Agent": self.agent,
        "Connection": 'close'
    }
    url = chem_subset.url + "&select=DOI,link&rows=1000&mailto=" + self.mailto + "&cursor="

    # starting cursor value. It will be updated in the loop with each request we make.
    cursor = '*'
    url_list = []
    doi_list = []
    saved_docs = 0

    # make the first request before entering the while loop
    response = requests.get(url + cursor, headers=headers).json()
    while len(response['message']['items']) > 0:
        # Add the total number of papers from the response to the saved docs list.
        saved_docs += len(response['message']['items'])
        pcnt_comp = 100 * saved_docs / doc_number
        print(f"{pcnt_comp:.3f}% complete")

        # for every entry in the response, loop through each entry.
        for entries in response['message']['items']:
            # Check to see if the response item has a link in it
            keycheck = True
            try:
                entries['link']
            except KeyError:
                keycheck = False
            if keycheck:
                # If the link exists, then append article meta-data
                URL = entries['link'][0]['URL']
                DOI = entries['DOI']
                # Update all http to https
                if URL[:5] != "https" and URL[:4] == 'http':
                    URL = 'https' + URL[4:]
                # Check to see if the URL format is correct. If yes, add it,
                # otherwise you don't add it to the list.
                if URL[8] == 'a':
                    url_list.append(URL)
                    doi_list.append(DOI)

        # Build a dataframe of URLs and DOIs from our requests. Save checkpoint
        full_publist = pd.DataFrame()
        full_publist['URL'] = url_list
        full_publist['DOI'] = doi_list
        full_publist.to_json(save_path + 'wiley_meta_list.json')

        # update the cursor, and make a new request for a new response.
        cursor = response['message']['next-cursor']
        cursor = cursor.replace("+", "%2B")
        response_nojson = requests.get(url + cursor, headers=headers)
        # check the status code from this response
        if response_nojson.status_code == 200:
            response = response_nojson.json()
        else:
            print("Status code is bad! The response code is: " +
                  str(response_nojson.status_code))
            break

    # Build a dataframe from the full list of URLs and DOIs from our requests
    full_publist['URL'] = url_list
    full_publist['DOI'] = doi_list
    # Save the final list after dropping duplicates
    full_publist = full_publist.drop_duplicates()
    full_publist.to_json(save_path + 'wiley_meta_list.json')
    return full_publist
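# A hypothetical usage sketch for meta_data_search above: since it is written
# as a method, a stand-in object supplies the credential attributes it expects.
# The clickthrough token, agent string, and mailto address are invented
# placeholder values, as is the search term.
class _Creds:
    clickthrough = 'PLACEHOLDER-TOKEN'
    agent = 'example-agent/0.1 (mailto:name@example.org)'
    mailto = 'name@example.org'

pubs = meta_data_search(_Creds(), 'nanozyme', './', publisher_name='Wiley')
print(pubs.head())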
try:
    doi = driver.find_element_by_xpath("//*[contains(text(), 'doi')]")
    doi.click()
    print(driver.current_url)
    doiTitle = driver.find_element_by_tag_name("title").get_attribute("textContent")
    doi = ""
    print(doiTitle)
except Exception:
    pass

if not doi:
    works = Works()
    i = 0
    if doiTitle:
        print(doiTitle)
        w1 = works.query(title=doiTitle).filter(type="journal-article").sort('relevance')
        for item in w1:
            i = i + 1
            if item['title'] == doiTitle:
                doi = item['DOI']
                persistent = "available"
                break
            if i > 10:
                break
    else:
        w1 = works.query(title=search_phrase).filter(type="journal-article").sort("relevance")
        for item in w1:
            i = i + 1
            if search_phrase in item['title']:
                doi = item['DOI']
                persistent = "available"
def Getdoiplus(markupdict):
    #### Get DOI of journal article from CrossRef XML queries
    #### Uses crossref.restful Works python module
    #### Inputs are Title and First Author
    #### Output is DOI
    print('Attempting to get DOI from CrossRef')
    works = Works()
    titlestr = markupdict['Title']

    ### Find First Author Surname
    if ' ' in markupdict['FirstAuthor']:
        dummydict = markupdict['FirstAuthor'].split(' ')
        if ',' in markupdict['FirstAuthor']:
            authorstr = dummydict[0].strip()
            authorstr = authorstr.replace(',', '', 1)
        else:
            authorstr = dummydict.pop().strip()
    else:
        authorstr = markupdict['FirstAuthor']
    if '-' in authorstr:
        authorstr = authorstr.split('-').pop()
    # print(markupdict['FirstAuthor'], 'authorstr=', authorstr)

    ### Query does not do exact phrase matching - find word in title that returns fewest results
    titlestr = ''
    leastquerynumber = 999999999
    # print(markupdict['Title'].split(' '))
    # dummy = markupdict['Title'].replace('\u\xa0', '')
    for word in markupdict['Title'].split(' '):
        if len(word) > 5:
            # print(word, works.query(title=word, author=authorstr).count())
            if (works.query(title=word, author=authorstr).count() < leastquerynumber
                    and works.query(title=word, author=authorstr).count() > 0):
                titlestr = word
                # print('titlestr is', word)
    if titlestr == '':
        titlestr = markupdict['Title']
    # print(markupdict['Title'], 'titlestr=', titlestr)

    DOIstr = '10.1016/s0022-3115(98)00906-4'
    print('number of titles is:', works.query(title='Uranium dioxide', author='Bae').count())
    # print('number of titles is:', works.query(title=titlestr, author='Bae').count())
    # print('doi no. titles:', works.query(DOI=DOIstr))
    doi = ''

    ### Perform query and find exact or partial title matches
    print('Querying Title=%s Author=%s number of titles is: %d' % (
        titlestr, authorstr, works.query(title=titlestr, author=authorstr).count()))
    for item in works.query(title=titlestr, author=authorstr):
        # for item in works.query(title='Uranium dioxide', author='Bae'):
        # for item in works.query(DOI=DOIstr):
        # print(item['title'][0])
        if markupdict['Title'].lower() == item['title'][0].lower():
            print('**** exact match ****')
            print('[1]', item['title'][0])
            print('[2]', markupdict['Title'])
            doi = item['DOI']
        elif abs(len(item['title'][0].split(' ')) - len(markupdict['Title'].split(' '))) == 0:
            ### Word by word matching
            # dummytitle1 = item['title'][0].replace('(', "", 1).replace(')', "", 1).replace('/', "", 1).replace('.', "", 1).replace(',', "", 1)
            n_match = 0
            for i in range(len(item['title'][0].split(' '))):
                if item['title'][0].split(' ')[i].strip().lower() == \
                        markupdict['Title'].split(' ')[i].strip().lower():
                    n_match = n_match + 1
            # if abs(len(item['title'][0].split(' ')) - n_match) < 3:
            if (n_match / (len(item['title'][0].split(' ')) * 1.0) > 0.7
                    and abs(len(item['title'][0].split(' ')) - n_match) < 3):
                doi = item['DOI']
                print('**** partial match ****', n_match, ' of',
                      len(item['title'][0].split(' ')), 'matches. Proportion:',
                      n_match / (len(item['title'][0].split(' ')) * 1.0))
                print('[1]', item['title'][0])
                print('[2]', markupdict['Title'])
                # print('matches=', n_match, 'total=', len(item['title'][0].split(' ')))
                # print(item['DOI'])
    # for item in works.sample(2):
    #     print(item['title'])
    #     print(item['DOI'])
    if doi == '':
        print('**** No Match found ****\n')
    else:
        pass
        # print(doi)
    print('*************\n')
    # print(item.keys())
    return (doi)
def getDoiWithCrossRef(entry, my_etiquette):
    """
    Get the doi of a bibtex entry thanks to crossref.

    Parameters
    ----------
    entry : BibDatabase
        The bibtex record with missing doi.
    my_etiquette : tuple
        A record that contains all required fields to create an Etiquette object.

    Returns
    -------
    doi : string
        the doi code.
    """
    # tries counter for each entry
    count = 0
    # store if a match has been found
    match = False
    # if provided, create the Etiquette object
    if my_etiquette:
        etiquette = Etiquette(*my_etiquette)
        print(etiquette)
    else:
        etiquette = None
    # create crossref api instance for request
    works = Works(etiquette=etiquette)
    # convert entry to unicode for searching
    entry_unicode = bp.customization.convert_to_unicode(entry.copy())
    # Check for mandatory fields
    try:
        # extract basic fields
        author1 = entry_unicode['author'].split(',')[0].strip()
        title = entry_unicode['title'].strip()
        year = entry_unicode['year'].strip()
    except Exception:
        warnings.warn("author, title and year fields are missing in entry {}".format(entry_unicode))
        doi = None
        return doi

    w1 = works.query(author=author1, bibliographic=title).filter(
        until_pub_date=year, from_pub_date=year,
        type='journal-article').sort('score').order('desc')
    # parse the crossref record to find the "best" match
    for item in w1:
        count += 1
        # fuzzy compare
        ratio = SM(None, title, item['title'][0]).ratio()
        if ratio > TOL_MATCH:
            match = True
            break
        # limit the number of queries
        if count > COUNT:
            print(' Reach maximal number of tries ({}) for this record {}'.format(COUNT, entry_unicode))
            break
    if match:
        doi = item['DOI']
    else:
        print(" MISSING : {}, {}".format(entry_unicode['author'], entry_unicode['title']))
        doi = None
    return doi
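# A minimal usage sketch for getDoiWithCrossRef above; the entry contents and
# etiquette tuple are hypothetical placeholders, and bp (bibtexparser), SM,
# TOL_MATCH and COUNT are assumed to be defined at module level.
entry = {'author': 'Doe, Jane', 'title': 'A hypothetical study of examples', 'year': '2015'}
my_etiquette = ('my-project', '0.1', 'https://example.org', 'name@example.org')
print(getDoiWithCrossRef(entry, my_etiquette))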
class OutputDOIs(SqliteDataResource):
    """
    This resource is a cached set of output IDs matched to DOIs using regexes,
    URLs, Crossref searches, and Refindit searches. This resource takes several
    hours to update, depending on throttling from Crossref and number of
    threads available for multiprocessing.
    """

    def __init__(self, context):
        super().__init__(context, DataResource.data_dir / 'output_dois.db')
        etiquette = Etiquette('SYNTH transform', '0.1',
                              'https://github.com/NaturalHistoryMuseum/synth_transform',
                              '*****@*****.**')
        self.works = Works(etiquette=etiquette)
        self._handled = set()
        self._added = set()
        self._errors = {}
        self._methods = {}

    @property
    def keys(self):
        return [tuple(json.loads(k)) for k in self.data.keys()]

    def mapped_items(self, new_id_map):
        """
        Transform the stored keys (tuples of (synth round, output ID)) into new
        IDs using a map generated during the rebuild process. Resource must be
        open.
        :param new_id_map: a dict with tuple keys and new ID values
        """
        if self.data is None:
            raise Exception('Resource is not open.')
        mapped = {}
        for k, v in self.data.items():
            try:
                new_key = new_id_map[tuple(json.loads(k))]
                mapped[new_key] = v
            except KeyError:
                continue
        return mapped

    def _search_output(self, conn, output, synth_round):
        """
        Search for a single output using title and author. Searches the
        Crossref API first, then ReFindIt if that doesn't return a suitable
        result. Compares the output title with each result using fuzzywuzzy
        and considers them a match if the two strings are at least 80% similar.
        :param conn: SqliteDataResource with an open SQLiteDict, e.g. 'self'
                     within 'with self:'
        :param output: the Output instance we're attempting to find a DOI for
        :param synth_round: the round this output was recorded in
        """
        output_key = json.dumps((synth_round, output.Output_ID))
        self._handled.add(output_key)
        try:
            authors = find_names(clean_string(output.Authors) or '')
            title = output.Title.rstrip('.')
            q = self.works.query(author=authors,
                                 bibliographic=title).sort('relevance').order('desc')
            for ri, result in enumerate(q):
                result_title = result.get('title', [None])[0]
                if result_title is None:
                    continue
                similarity = fuzz.partial_ratio(result_title, title.lower())
                if similarity >= 80:
                    self._added.add(output.Output_ID)
                    conn.add(output_key, result['DOI'].upper())
                    self._methods[output_key] = 'crossref'
                    return
                if ri >= 3 - 1:
                    # only consider the top 3 Crossref results
                    return
            # refindit also searches a few other databases, so try that if
            # crossref doesn't find it
            refindit_url = 'https://refinder.org/find?search=advanced&limit=5&title=' \
                           f'{title}&author={"&author=".join(authors)}'
            refindit_response = requests.get(refindit_url)
            if refindit_response.ok:
                for ri, result in enumerate(refindit_response.json()):
                    result_title = result.get('title')
                    if result_title is None:
                        continue
                    similarity = fuzz.partial_ratio(result_title, title.lower())
                    if similarity >= 80:
                        self._added.add(output.Output_ID)
                        conn.add(output_key, result['DOI'].upper())
                        self._methods[output_key] = 'refindit'
                        return
        except Exception as e:
            self._errors[(synth_round, output.Output_ID)] = e

    def update(self, context, target, *synth_sources):
        """
        Attempt to find a DOI for each output in the NHMOutput tables.
""" with self: super(OutputDOIs, self).update(context, target, *synth_sources) self._handled = set() self._errors = {} self._methods = {} for db_ix, synth_db in enumerate(synth_sources): db_ix += 1 self._added = set() def _extract_doi(conn, output, col): output_key = json.dumps((db_ix, output.Output_ID)) self._handled.add(output_key) for x in DOIExtractor.dois(getattr(output, col), fix=True): doi, fn = x doi_metadata = self.works.doi(doi) if doi_metadata: doi_title = doi_metadata.get('title', '') doi_title = clean_string(doi_title[0]).lower() output_title = output.Title if output_title is not None: output_title = clean_string(output_title.lower()) match = fuzz.partial_ratio(doi_title, output_title) if match > 50: self._added.add(output.Output_ID) conn.add(output_key, doi.upper()) self._methods[output_key] = fn break def _search_columns(col, *filters): outputs = synth_db.query(NHMOutput).filter( NHMOutput.Output_ID.notin_(self._added), *filters) thread_workers = context.config.resource_opt( 'dois.threads', 20) with self, ThreadPoolExecutor( thread_workers) as thread_executor: thread_map(lambda x: _extract_doi(self, x, col), outputs.all(), desc=col, unit=' records', leave=False, position=1) _search_columns('URL', NHMOutput.URL.isnot(None)) _search_columns( 'Volume', or_(NHMOutput.Volume.ilike('%doi%'), NHMOutput.Volume.ilike('%10.%/%'))) _search_columns( 'Pages', or_(NHMOutput.Pages.ilike('%doi%'), NHMOutput.Pages.ilike('%10.%/%'))) # now for searching based on metadata title_and_author = synth_db.query(NHMOutput).filter( NHMOutput.Output_ID.notin_(self._added), NHMOutput.Title.isnot(None), NHMOutput.Authors.isnot(None)) workers = context.config.resource_opt('dois.threads', 20) with self, ThreadPoolExecutor(workers) as executor: thread_map(lambda x: self._search_output(self, x, db_ix), title_and_author.all(), desc='Crossref', unit=' records', leave=False, position=1) methods = {} for k, v in self._methods.items(): methods[v] = methods.get(v, []) + [k] for k, v in methods.items(): click.echo(f'{k}: {len(v)}')
def crossrefAPI_improved_query(parsed_ref_dict: Dict) -> Dict:
    '''
    This function takes a parsed reference dict as returned by the parsers
    from reference_parser. It uses the information given in the dict to create
    a cleaned up string for a Crossref keyword query, goes through the first
    200 entries to check if the returned result overlaps with the parsed
    information, and returns the result.
    '''
    article_dict = False
    works = Works()
    for _ in range(5):
        try:
            # Create clean query string
            if 'volume' not in parsed_ref_dict.keys():
                return None
            # If everything is given
            if 'authors' in parsed_ref_dict.keys():
                if 'issue' in parsed_ref_dict.keys():
                    formatted_bib_str = '{}, {}, {}, ({}), ({}), {}'.format(
                        parsed_ref_dict['authors'], parsed_ref_dict['journal'],
                        parsed_ref_dict['volume'], parsed_ref_dict['issue'],
                        parsed_ref_dict['year'], parsed_ref_dict['pages'])
                # Everything but the issue is given
                else:
                    formatted_bib_str = '{}, {}, {}, ({}), {}'.format(
                        parsed_ref_dict['authors'], parsed_ref_dict['journal'],
                        parsed_ref_dict['volume'], parsed_ref_dict['year'],
                        parsed_ref_dict['pages'])
            # Everything but the author is given
            elif 'issue' in parsed_ref_dict.keys():
                formatted_bib_str = '{}, {}, ({}), ({}), {}'.format(
                    parsed_ref_dict['journal'], parsed_ref_dict['volume'],
                    parsed_ref_dict['issue'], parsed_ref_dict['year'],
                    parsed_ref_dict['pages'])
            # Neither author nor issue is given (branch added so that
            # formatted_bib_str is always defined before the query)
            else:
                formatted_bib_str = '{}, {}, ({}), {}'.format(
                    parsed_ref_dict['journal'], parsed_ref_dict['volume'],
                    parsed_ref_dict['year'], parsed_ref_dict['pages'])

            result = works.query(formatted_bib_str).sort("relevance")

            # Browse first 200 entries to check if one of the results fits
            a = 0
            try:
                for entry in result:
                    a += 1
                    if a == 200:
                        break
                    entry = add_retrieval_information(
                        entry, 'Crossref', 'Crossref_extended_query',
                        str(parsed_ref_dict))
                    normalized_dict = normalize_crossref_dict(entry)
                    # print(normalized_dict)
                    if normalized_dict:
                        if is_same_publication(parsed_ref_dict, normalized_dict):
                            article_dict = entry
                            break
            except JSONDecodeError:
                pass
            break
        except Exception:
            pass
    if article_dict:
        return article_dict
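# A hypothetical input for crossrefAPI_improved_query above, mirroring the
# keys the function checks for; all values are invented placeholders, and
# Works, add_retrieval_information, normalize_crossref_dict and
# is_same_publication are assumed to be defined elsewhere in the module.
ref = {'authors': 'Doe, J.', 'journal': 'Journal of Examples',
       'volume': '12', 'issue': '3', 'year': '2010', 'pages': '100-110'}
print(crossrefAPI_improved_query(ref))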
class NarrativeDataset:
    LICENSE_WHITELIST = [
        'http://creativecommons.org/licenses/by/4.0/',
        'http://creativecommons.org/licenses/by/3.0/'
    ]
    download_links = dict()

    def __init__(self, reset_cache=False):
        self.journals = Journals()
        self.works = Works()
        self.filter_kwargs = dict(has_license='true', has_full_text='true')
        self.keywords = 'business financial merger entrepreneur banking insurance commerce trade economics'
        UnpywallCredentials('*****@*****.**')
        cache_path = path.join(DATA_DIR, 'unpaywall_cache')
        if reset_cache and path.exists(cache_path):
            remove(cache_path)
        self.unpywall_cache = UnpywallCache(cache_path)
        Unpywall.init_cache(self.unpywall_cache)

    def get_dois_from_journal(self, journal_issn):
        doi_list = []
        try:
            if self.journals.journal_exists(journal_issn):
                works = self.journals.works(journal_issn).filter(
                    **self.filter_kwargs).select('DOI', 'license')
                for response_dict in tqdm(works):
                    license_dict = response_dict['license']
                    if self.is_license_whitelist(license_dict[0]['URL']):
                        doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service", e, exc_info=True)
        return doi_list

    def get_dois_from_keywords(self):
        doi_list = []
        try:
            results = self.works.query(self.keywords).filter(
                **self.filter_kwargs).select('DOI', 'license')
            for response_dict in tqdm(results):
                license_dict = response_dict['license']
                if self.is_license_whitelist(license_dict[0]['URL']):
                    doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service", e, exc_info=True)
        return doi_list

    def get_oa_urls(self, doi_list):
        logger.info('Retrieving doc urls for DOIs now (cached/uncached)')
        oa_urls = []
        for i, doi in tqdm(enumerate(doi_list), total=len(doi_list)):
            try:
                oa_urls.append(Unpywall.get_doc_link(doi))
            except HTTPError:
                logger.warning('\nError received for DOI: {}, will retry 3 times in 20 secs'.format(doi))
                sleep(20)
                for i in range(3):
                    try:
                        logger.info('Retry: {}'.format(i + 1))
                        oa_urls.append(Unpywall.get_doc_link(doi))
                        break
                    except HTTPError as e:
                        logger.error('Retry failed', e, exc_info=True)
        return oa_urls

    def is_license_whitelist(self, license):
        license = str(license).replace('https', 'http')
        return license in self.LICENSE_WHITELIST

    def retry_from_another_src(self, faulty_files_list, doi_list):
        src_dict = {'scirp': []}
        for file in faulty_files_list:
            base_name = ntpath.basename(file)
            doi_list_ind = int(base_name.replace("Sample_", "")[:-8]) - 1
            doi = doi_list[doi_list_ind]
            doc_url = Unpywall.get_pdf_link(doi)
            if doc_url is not None and 'scirp' in doc_url.lower():
                try:
                    scirp_id = doc_url[doc_url.index('paperID=') + 8:]
                except (IndexError, ValueError):
                    continue
                if scirp_id != "":
                    src_dict['scirp'].append((file, scirp_id))
        return download_frm_another_src(src_dict)

    @staticmethod
    def download_doi_pdf(works, doi_list, download_dir):
        logger.info("Trying to download the required data now for {} DOIs".format(len(doi_list)))
        for i, doi in enumerate(doi_list):
            name_pattern = 'Sample_{}.pdf'.format(str(i + 1))
            download_link = Unpywall.get_pdf_link(doi)
            try:
                if not download_link:
                    result = works.doi(doi)['link']
                    for item in result:
                        application = item['intended-application']
                        type = item['content-type']
                        if (application is not None
                                and application == 'text-mining'
                                and type == 'application/pdf'):
                            download_link = item['URL']
                            break
                NarrativeDataset.download_links[name_pattern[:-4]] = download_link
                if not path.exists(path.join(download_dir, name_pattern)):
                    if download_link and filter_url(download_link):
                        logger.debug('Downloading ' + name_pattern + " : " + doi +
                                     ' from url: ' + download_link)
                        download_pdf_file(download_link, name_pattern,
                                          download_dir, progress=True)
                        sleep(5)
            except Exception as e:
                logger.error("Error while downloading the article ({}, {})".format(
                    str(i + 1), doi), e, exc_info=True)
                NarrativeDataset.download_links[name_pattern[:-4]] = download_link
        return True
import sys
import time

from crossref.restful import Works

works = Works()


def crossref_ref_by_count(doi):
    work = works.doi(doi)
    if work is None:
        return -1
    else:
        return work['is-referenced-by-count']


# print(crossref_ref_by_count("10.1108/07363760110410263"))
# print(works.query("10.1108/07363760110410263"))
# print(works.query("10.1108/07363760110410263" + "&[email protected]"))

if __name__ == "__main__":
    works.query("unable to specify mailto in works.query call&mailto=" + sys.argv[3])
    f = open(sys.argv[2], "w")
    f.write("doi,citedby_crossref")
    f.close()
    f = open(sys.argv[2], "a")
    for doi in sys.argv[1].split(',\n'):
        time.sleep(0.1)
        citedby = crossref_ref_by_count(doi)
        if citedby == -1:
            # Couldn't find doi, don't add it
            continue
        else:
            # Append doi to csv
            f.write('\n' + doi + ',' + str(citedby))
from crossref.restful import Works
import signal, time

works = Works()


class Timeout(Exception):
    pass


def raiseTimeout(sig, frame):
    raise Timeout


signal.signal(signal.SIGALRM, raiseTimeout)
signal.alarm(2)

array = []
try:
    for e in works.query("cancer"):
        array.append(e["DOI"])
except Timeout:
    print("took too long")

print(array)
''' ##<输入模块 key_words = 'nanozyme' #搜索关键词 path = 'c:\\Users\\asdqw\\Desktop\\Get_Literature\\' + key_words + '\\' #存储目录 date = '2000' #起始年份 num = 100 #文献量 ##输入结束> try: os.mkdir(path) except: pass index = 0 #文献序号 metalist = works.query( bibliographic=key_words, publisher_name='Wiley-Blackwell').filter(from_online_pub_date=date) count = works.query(bibliographic=key_words, publisher_name='Wiley-Blackwell').filter( from_online_pub_date=date).count() print('总文献数:' + str(count)) print('完成Corssref元数据检索') for i in metalist: #.sample(num): index += 1 print('目前进度: ' + str(index) + '/' + str(count + 1)) try: acquire_text(i, index, path) except: print('下载失败') file = open(path + 'Failed' + str(index) + '.txt', mode='w+',
# Author: Guillaume Bouvier -- [email protected]
# https://research.pasteur.fr/en/member/guillaume-bouvier/
# 2021-01-21 09:52:56 (UTC+0100)

from crossref.restful import Works

if __name__ == '__main__':
    import argparse
    # argparse.ArgumentParser(prog=None, usage=None, description=None, epilog=None, parents=[], formatter_class=argparse.HelpFormatter, prefix_chars='-', fromfile_prefix_chars=None, argument_default=None, conflict_handler='error', add_help=True, allow_abbrev=True, exit_on_error=True)
    parser = argparse.ArgumentParser(description='')
    # parser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
    parser.add_argument(
        '-s', '--search',
        type=str,
        required=True,
        help="Query bibliographic information, useful for citation look up. "
             "Includes titles, authors, ISSNs and publication years")
    args = parser.parse_args()
    works = Works()
    w = works.query(bibliographic=args.search)
    for item in w:
        title = item['title'][0]
        doi = item['DOI']
        print(f"title: {title}")
        print(f"doi: {doi}")
        print()
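# A hypothetical invocation of the script above from a shell (the file name
# crossref_search.py is a placeholder, not from the source):
#   python crossref_search.py --search "emotional voices database"
# Each matching work is printed as a title/DOI pair.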
def find_meta(self, identifier):
    """ find metadata with title or DOI
    Keyword Arguments:
    identifier --
    """
    try:
        # verify=False is dangerous but sci-hub.io
        # requires intermediate certificates to verify
        # and requests doesn't know how to download them.
        # as a hacky fix, you can add them to your store
        # and verifying would work. will fix this later.
        url = self.base_url + identifier['article_link']
        self.sess.headers = {'user-agent': self.get_random_user_agent()}
        res = self.sess.get(url, verify=False, allow_redirects=False)
        re_bracket = re.compile(r"\[(.*?)\]\s")
        title = re.sub(re_bracket, "", identifier['name'])
        ylog.debug('*' * 80)
        ylog.debug("title: %s" % title)
        ylog.debug(res.status_code)
        # self.out.ix[title]['status_code'] = res.status_code
        ylog.debug("headers: %s" % res.headers['Content-Type'])
        ylog.debug('location: %s' % res.headers.get("Location"))
        # self.out.ix[title]['location'] = res.headers.get("Location")
        search_title = True
        if not res.headers.get("Location"):
            content = res.content
            if len(content) > 2:
                import cchardet
                charset = cchardet.detect(content)
                text = content.decode(charset['encoding'])
                soup = BeautifulSoup(text, "lxml")
                script = soup.script.get_text()
                doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                try:
                    doi_match = re.compile(doi_regexp).findall(script)[0]
                    ylog.info("DOI: %s" % doi_match)
                    search_title = False
                    # use crossref API to get metadata
                    works = Works()
                    w1 = works.query(doi_match).sort('relevance').order('desc')
                    i = 0
                    for item in w1:
                        # TODO: verify title
                        # self.out.ix[title]['DOI'] = item['DOI']
                        return {'meta': item['DOI'], 'url': url}
                except IndexError:
                    ylog.debug('failed to find regexp')
        elif search_title:
            works = Works()
            w1 = works.query(title).sort('relevance').order('desc')
            i = 0
            for item in w1:
                i = i + 1
                try:
                    # ylog.debug('crossref item title ')
                    t = item.get('title')[0]
                    # ylog.debug(t)
                    sub_title = item.get('subtitle')[0]
                    # ylog.debug(sub_title)
                    # ylog.debug("ratio: %s" %
                    #            (SequenceMatcher(a=title, b=t).ratio()))
                except TypeError:
                    sub_title = ''
                if (SequenceMatcher(a=title, b=t).ratio() > 0.9
                        or SequenceMatcher(a=title, b=sub_title).ratio() > 0.9
                        or t.startswith(title)):
                    ylog.debug("DOI %s" % item['DOI'])
                    # self.out.ix[title]['DOI'] = item['DOI']
                    return {'meta': item['DOI'], 'url': url}
                if i > 18:
                    # ylog.debug('[x]%s' % title)
                    # ylog.debug(item['title'])
                    return None
    except requests.exceptions.ConnectionError:
        logger.info('{} cannot access, changing'.format(self.available_base_url_list[0]))
        self._change_base_url()
    except requests.exceptions.RequestException as e:
        return {
            'err': 'Failed to fetch pdf with identifier %s (resolved url %s) '
                   'due to request exception.' % (identifier, url)
        }
def get_doi(entry, config):
    has_doi = bib_parser.has_doi(entry)
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    max_levenshtein_distance = config.get_max_levenshtein_distance()
    update_URL = config.get_update_URL()
    works = Works(etiquette=my_etiquette)
    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)
            if is_crossref_work(doi):
                crossref_info = works.doi(doi)
                if crossref_is_similar(crossref_info, entry, max_levenshtein_distance):
                    entry = set_doi(entry, doi, update_URL)
                    has_doi = True
    if not has_doi:
        # we try to find the doi for the title
        entry_title = bib_parser.get_title(entry)
        entry_title = cleaner.clean_braces(entry_title)
        author = bib_parser.get_author(entry)
        first_author = splitname(author[0], strict_mode=False)
        first_author_last_name = first_author["last"][0]
        query_parameters = {
            "author": first_author_last_name,
            "bibliographic": entry_title
        }
        works_query = works.query(**query_parameters)
        works_query = works_query.sort("score").order("desc").select(["title", "DOI"])
        i_i_item = 0
        max_items = min(works_query.count(), 10)
        works_results = iter(works_query)
        while i_i_item < max_items and not has_doi:
            i_item = next(works_results)
            if crossref_is_similar(i_item, entry, max_levenshtein_distance):
                doi = cr_parser.get_doi(i_item)
                entry = set_doi(entry, doi, update_URL)
                has_doi = True
            i_i_item += 1
    else:
        # We check to see if the doi is correct
        doi = bib_parser.get_doi(entry)
        doi = cleaner.clean_doi(doi)
        if is_crossref_work(doi):
            crossref_info = works.doi(doi)
            if crossref_is_similar(crossref_info, entry, max_levenshtein_distance):
                entry = set_doi(entry, doi, update_URL)
            else:
                entry.pop("doi", None)
                if "doi" in bib_parser.get_url(entry):
                    entry.pop("url", None)
                has_doi = False
        else:
            entry = set_doi(entry, doi, update_URL)
    return entry, has_doi