def papers_by_query_api(request):
    """Django view: return the first Google Scholar hit for GET param 'phrase'.

    Responds with JSON of the form {'papers': [{...}]} containing a single
    entry (placeholder empty fields when Scholar returned nothing), or an
    HTTP 400 for non-GET requests and for a missing/empty phrase.
    """
    if request.method != 'GET':
        return HttpResponseBadRequest()
    phrase = request.GET.get('phrase', '')
    if not phrase:
        return HttpResponseBadRequest()
    query = SearchScholarQuery()
    query.set_phrase(phrase)
    querier = ScholarQuerier()
    querier.send_query(query)
    hits = querier.articles
    if hits:
        top = hits[0]
        paper = {
            'title': top['title'],
            'id': top['cluster_id'],
            'url': top['url'],
            'excerpt': top['excerpt'],
        }
    else:
        # No result: keep the response shape stable for the client.
        paper = {'title': '', 'id': 0, 'url': '', 'excerpt': ''}
    return JsonResponse({'papers': [paper]})
def pdf_url(self, phrase): """Fetch a paper by *phrase* (usually the title) from scholar.google.com Tries to download a valid PDF. Returns the pdf url if one is found. Return None in case of errors. TODO: Actually check if the file can be downloaded... if error, continue with next candidate """ # Run initial query query = SearchScholarQuery() query.set_phrase(phrase) # --phrase "<phrase>" query.set_num_page_results(1) # -c 1 self.querier.send_query(query) if len(self.querier.articles) == 0: self.status.warning('No search results. Blocked maybe?') # TODO: Open result page in a browser (to answer the captcha) self.on_blocked() return None # Absolutely nothing returned; Abort self.timeout = TIMEOUT # Initial PDF url art = self.querier.articles[0] pdf_url = strip_url(art.attrs['url_pdf'][0]) # Some status self.status.result('Title', art.attrs['title'][0]) self.status.result('Year', art.attrs['year'][0]) self.status.result('PDF', pdf_url) # Check PDF url if pdf_url is None or is_blacklisted(pdf_url): self.status.result('URL', art.attrs['url'][0]) # Article found, but no PDF. Resort to searching by cluster. if art.attrs['cluster_id'][0] is not None: cluster = ClusterScholarQuery(cluster=art.attrs['cluster_id'][0]) self.querier.send_query(cluster) # Walk through results for cart in self.querier.articles: curl = strip_url(cart.attrs['url_pdf'][0]) if curl is not None and not is_blacklisted(curl): # Valid PDF found! pdf_url = curl # More status #self.status.result('Cluster', art.attrs['cluster_id'][0]) self.status.result('Title', cart.attrs['title'][0]) self.status.result('Year', cart.attrs['year'][0]) self.status.result('PDF', pdf_url) # We have a result, abort search break # pdf_url can stil be None if is_book(pdf_url) or is_book(art.attrs['url'][0]): self.status.warning('Might be a book') return pdf_url
def getRelatedPublications(author): print author settings = ScholarSettings() #adjust scholar settings querier = ScholarQuerier() #Instance of ScholarQuerier() conducts a search on Google Scholar querier.apply_settings(settings) #applies settings as provided by the instance of ScholarSettings() query = SearchScholarQuery() query.set_author(author) querier.send_query(query) print querier.articles
def getPublications(author): print author querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() query.set_author(author) querier.send_query(query) #scholar.csv(querier) scholar.txt(querier, with_globals=False)
def getRelatedPublications(author): print author settings = ScholarSettings() #adjust scholar settings querier = ScholarQuerier( ) #Instance of ScholarQuerier() conducts a search on Google Scholar querier.apply_settings( settings ) #applies settings as provided by the instance of ScholarSettings() query = SearchScholarQuery() query.set_author(author) querier.send_query(query) print querier.articles
def setCitationByTitle(paper_title, author="Si Chen"):
    """Build a SearchScholarQuery for *paper_title*, limited to one result.

    Generalized: the author filter was hard-coded to "Si Chen"; it is now
    an overridable keyword parameter with the same default, so existing
    callers keep their behavior.

    Returns the configured query object; it is not sent here.
    """
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(paper_title)
    query.set_num_page_results(1)
    return query
def search(bot, update, args):
    """Telegram handler: search Google Scholar for the words in *args*.

    Echoes the search phrase, reports the result count, then sends one
    numbered message per hit.

    Bug fixed: the running result number was never incremented, so every
    hit was labelled "1."; enumerate() now numbers them 1, 2, 3, ...
    """
    search_command = ' '.join(args)
    bot.send_message(chat_id=update.message.chat_id,
                     text="You searched for: " + search_command)
    querier = ScholarQuerier()
    query = SearchScholarQuery()
    query.set_words(args)
    querier.send_query(query)
    articles = querier.articles
    bot.send_message(chat_id=update.message.chat_id,
                     text="Number of results: " + str(len(articles)))
    for number, article in enumerate(articles, start=1):
        bot.send_message(chat_id=update.message.chat_id,
                         text=str(number) + ". " + article.attrs['title'][0])
def getPublications_Title(title): querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() publications = [] query.set_words(title) querier.send_query(query) related_list = scholar.json(querier) if related_list: print "No of related publications found : ", print len(related_list) for item in related_list: #print item.keys() #item["relatedTitle"] = title[0] publications.append(item) #time.sleep(random.randrange(10, 40, 2)); #time.sleep(60); return publications
def process(document):
    """Tag *document* with a citation-count tag from its top Scholar hit.

    Returns the citation count, or None when there is no hit or the best
    title match is below min_title_match_ratio.
    """
    querier = ScholarQuerier()
    query = SearchScholarQuery()
    query.set_phrase(document.title)
    querier.send_query(query)
    hits = querier.articles
    if not hits:
        return None
    top = hits[0]
    match = difflib.SequenceMatcher(None, document.title, top['title']).ratio()
    if match < min_title_match_ratio:
        return None
    citation_tag = ncitations_to_tag(top['num_citations'])
    new_tags = update_tags(document.tags, [(tag_pattern, citation_tag)])
    document.update(tags=new_tags)
    return top['num_citations']
def query_scholar_for_papers(author, searchstring):
    """Return the BibTeX citation of the top Scholar hit for
    *author*/*searchstring*, or an apologetic message when nothing came back."""
    settings = ScholarSettings()
    settings.set_citation_format(settings.CITFORM_BIBTEX)
    settings.set_per_page_results(5)
    querier = ScholarQuerier()
    querier.apply_settings(settings)
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(searchstring)
    querier.send_query(query)
    if querier.articles:
        return querier.articles[0].as_citation() + '\n'
    return 'Ooopsie. No results. Maybe we ran over the request limit?'
def blocked(): print "Test if blocked...." #time.sleep(random.randrange(10, 40, 2)); time.sleep(60) publications = [] querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() query.set_author("Ryan Baker") querier.send_query(query) related_list = scholar.json(querier) if related_list: print "Block Test : No of related publications found : ", print len(related_list) for item in related_list: publications.append(item) if len(publications) == 0: return True else: return False
def blocked(): print "Test if blocked...." #time.sleep(random.randrange(10, 40, 2)); time.sleep(60); publications = [] querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() query.set_author("Ryan Baker") querier.send_query(query) related_list = scholar.json(querier) if related_list: print "Block Test : No of related publications found : ", print len(related_list) for item in related_list: publications.append(item) if len(publications) == 0: return True else: return False
def process(document):
    """Tag *document* with its Scholar citation count; on the first call
    (module-level save_cookie flag) primes and persists a Scholar cookie.

    Returns the citation count, or None when there is no hit or the title
    match is below min_title_match_ratio.
    """
    querier = ScholarQuerier()
    query = SearchScholarQuery()
    global save_cookie
    if save_cookie:
        # Warm-up query so there is a cookie jar worth persisting.
        query.set_phrase("quantum theory")
        querier.send_query(query)
        querier.save_cookies()
        save_cookie = False
    query.set_phrase(document.title)
    querier.send_query(query)
    hits = querier.articles
    if not hits:
        return None
    top = hits[0]
    ratio = difflib.SequenceMatcher(None, document.title, top['title']).ratio()
    if ratio < min_title_match_ratio:
        return None
    citation_tag = ncitations_to_tag(top['num_citations'])
    tags = update_tags(document.tags, [(tag_pattern, citation_tag)])
    tags.append(str(top['num_citations']))
    document.update(tags=tags)
    return top['num_citations']
def papers_by_query_api(request):
    """JSON API endpoint: first Scholar hit for the GET parameter 'phrase'.

    Non-GET requests and empty phrases get an HTTP 400; otherwise the
    response is {'papers': [entry]} where entry is the top hit or a
    placeholder with empty fields.
    """
    if request.method != 'GET':
        return HttpResponseBadRequest()
    phrase = request.GET.get('phrase', '')
    if not phrase:
        return HttpResponseBadRequest()
    scholar_query = SearchScholarQuery()
    scholar_query.set_phrase(phrase)
    scholar_querier = ScholarQuerier()
    scholar_querier.send_query(scholar_query)
    articles = scholar_querier.articles
    if not articles:
        entry = {'title': '', 'id': 0, 'url': '', 'excerpt': ''}
    else:
        first = articles[0]
        entry = {
            'title': first['title'],
            'id': first['cluster_id'],
            'url': first['url'],
            'excerpt': first['excerpt'],
        }
    return JsonResponse({'papers': [entry]})
def process_document(document_id, skip_documents=False): document = mendeley.document_details(document_id) if skip_documents and has_citation_tag(document["tags"], tag_pattern): return False try: query = SearchScholarQuery() query.set_phrase(document["title"]) scholar.send_query(query) scholar_articles = scholar.articles if len(scholar_articles) == 0: print ("No scholar articles found for " + document["title"]) return True except urllib2.HTTPError as e: print e.msg print e.reason sys.exit(-1) if "year" in document: year = document["year"] else: year = -1 print ("%s\t\t%s\tScholar: %s" % (scholar_articles[0]["num_citations"], year, scholar_articles[0]["title"])) print ("\t\t\tMendeley: %s" % document["title"]) title_match_ratio = difflib.SequenceMatcher(None, document["title"], scholar_articles[0]["title"]).ratio() if title_match_ratio < min_title_match_ratio: print ("Paper titles differ too much, skipping (match ratio: %f)." % title_match_ratio) time.sleep(randint(min_sleep_time_sec, max_sleep_time_sec)) if not (title_match_ratio < min_title_match_ratio): old_tags = document["tags"] citation_tag = ncitations_to_tag(scholar_articles[0]["num_citations"]) new_tags = update_tags(old_tags, [(tag_pattern, citation_tag)]) doc_updated = mendeley.update_document(docid, document={"tags": new_tags}) # print doc_updated return True
def getPublications(authors): print authors querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() publications = [] for author in authors: if len(author) > 0: print "Using Author : ", print author query.set_author(author) querier.send_query(query) related_list = scholar.json(querier) if related_list: print "No of related publications found : ", print len(related_list) for item in related_list: #print item.keys() #item["relatedAuthor"] = author publications.append(item) #time.sleep(random.randrange(10, 40, 2)); time.sleep(20) return publications
def getPublications(authors): print authors querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() publications = [] for author in authors: if len(author) > 0: print "Using Author : ", print author query.set_author(author) querier.send_query(query) related_list = scholar.json(querier) if related_list: print "No of related publications found : ", print len(related_list) for item in related_list: #print item.keys() #item["relatedAuthor"] = author publications.append(item) #time.sleep(random.randrange(10, 40, 2)); time.sleep(20); return publications
def get_url(querier, phrase):
    """Return the URL of the top Scholar hit whose title matches *phrase*.

    The query is restricted to the title only (scope=True) and a single
    result page. Returns None when the query fails or yields no article.

    Fixed: a bare ``except:`` (which also swallowed SystemExit and
    KeyboardInterrupt) is narrowed to ``except Exception``.
    """
    # Setup query
    query = SearchScholarQuery()
    query.set_phrase(phrase)
    query.set_scope(True)  # title-only search
    query.set_num_page_results(1)
    try:
        querier.send_query(query)
        for art in querier.articles:
            url, _, _ = art.attrs['url']
            # Kept for its original side effect: an article without a
            # 'url_pdf' attribute raises here and makes this call
            # return None.
            url_pdf, _, _ = art.attrs['url_pdf']
            return url
    except Exception:
        return None
    return None
DST = Path(__file__).absolute().parents[1] / 'publications.bib' IGNORE = """ vo2014cytotoxicity takeilnatriureticpeptideisolatedfromeelbrain matchintemporal brodbeck2018transformation """.split() ACRONYMS = ['EEG', 'MEG', 'MRI'] querier = ScholarQuerier() settings = ScholarSettings() settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) querier.apply_settings(settings) query = SearchScholarQuery() query.set_phrase("eelbrain") query.set_timeframe(2012, None) query.set_include_patents(False) bib = parse_file(DST, 'bibtex') start = 0 while True: querier.send_query(query) if len(querier.articles) == 0: break # extract articles for article in querier.articles: querier.get_citation_data(article) # convert to pybtex entry
def get_results_for(title, author):
    # Fetch one BibTeX citation for (*title*, *author*) from Google Scholar
    # and interactively map it onto one of this module's reference-type
    # builder functions, prompting the user for missing fields.
    # NOTE(review): reconstructed from whitespace-mangled source; the
    # newline inside the (Y)/(N) prompt string is assumed to be "\n".
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(title)
    query.set_num_page_results(1)
    query.set_scope(True)
    settings = ScholarSettings()
    settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    querier = ScholarQuerier()
    querier.apply_settings(settings)
    querier.send_query(query)
    for art in querier.articles:
        print art.as_citation();
        bibtex_split = art.as_citation().split("\n")
        # Header line "@type{key," -> entry type and citation key.
        reftype = bibtex_split[0][1:-1].split("{")[0].lower();
        refid = bibtex_split[0][1:-1].split("{")[1].lower();
        bibtex_split.remove(bibtex_split[0])
        thismodule = sys.modules[__name__]
        while(True):
            try:
                # Positional parameter names of the builder function named
                # after the BibTeX type (skipping its leading id argument).
                features_of_type = getattr(thismodule, reftype).func_code.co_varnames[
                    1: getattr(thismodule, reftype).func_code.co_argcount ]
                break;
            except AttributeError:
                # No builder with that name in this module; ask the user.
                var = raw_input("Type " + reftype + " not recongised, please enter a known type: ");
                reftype = var;
        while (True):
            # Slots for up to 9 field names/values, ordered like the
            # builder's parameter list.
            arranged_name = []
            arranged_value = []
            for i in range(1, 10):
                arranged_name.append(None)
                arranged_value.append(None)
            for line in bibtex_split:
                if ( line.find("=") > -1 ):
                    stored_name = line.split("=")[0].strip()
                    stored_value = line.split("=")[1].strip();
                    # Strip the surrounding "{...}," decoration.
                    stored_value = stored_value[1:-(len(stored_value)-stored_value.rfind("}"))]
                    if stored_name in features_of_type:
                        arranged_name[features_of_type.index(stored_name)] = stored_name
                        arranged_value[features_of_type.index(stored_name)] = stored_value
            # Contiguous prefix of fields found so far.
            short_arranged_name = arranged_name[ 0 : arranged_name.index(None)];
            short_arranged_value = arranged_value[ 0 : arranged_value.index(None)];
            if len(short_arranged_name) == len(features_of_type):
                # All required fields present: build and format the entry.
                return getattr(thismodule, reftype)(refid, *short_arranged_value).__getprintable__(True)
            else:
                # Ask the user to supply each missing field, then retry.
                for feature in features_of_type:
                    if ( feature not in arranged_name ):
                        var = raw_input(feature + " is not provided by the retrieved bibtex entry. Would you like to enter it now? \n(Y) or (N)");
                        if var == "Y":
                            var = raw_input("Enter value for " + feature + ": ");
                            bibtex_split.append(feature + " = {" + var + "}");
df = pd.read_csv("../data_seperate_sheet/Profile_Publications_Standard.csv") #cites =[] #citation_list = [] # [year of pub] with open('citations_counts_25307.csv', "w") as csv_file: writer = csv.writer(csv_file, delimiter=',') for k,v in df['Article Title'][25307:].iteritems(): # why is df['Article Title'] of type dict?? querier = ScholarQuerier() settings = ScholarSettings() settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) querier.apply_settings(settings) query = SearchScholarQuery() query.set_phrase(v) query.set_scope(True) querier.send_query(query) if querier.articles: cites=querier.articles[0].__getitem__('num_citations') citation_list=querier.articles[0].__getitem__('url_citations') else: cites=0 citation_list= "" # json_results = [] # file_name = 'query_data/cites_for_article_'+str(k+1)+'.json' # for art in querier.articles: # json_results.append( # {key: art.attrs[key][0] for key in art.attrs.keys()})
#input: full path to xlsx file containing article titles from scopus. Assumes that #titles are first column in the sheet 'Qatar_Scopus'. #output: csv file containing paper titles, citations, possibly free PDF link from web. from openpyxl import load_workbook import sys from scholar import ScholarQuerier, ScholarSettings, SearchScholarQuery,onecsv from time import sleep import csv querier = ScholarQuerier() settings = ScholarSettings() querier.apply_settings(settings) query = SearchScholarQuery() query.set_scope(True) alldata=[] counter=1 xlsxfile=sys.argv[1] wb = load_workbook(xlsxfile, use_iterators=True) print wb.get_sheet_names() ws = wb.get_sheet_by_name('Qatar_Scopus') for row in ws.iter_rows(row_offset=1): if row[0].value is not None: temp=[] title=row[0].value.encode("utf-8") query.set_phrase(title) query.set_num_page_results(1) querier.send_query(query) x=onecsv(querier) if (x!=None):
from scholar import ScholarQuerier, ScholarSettings, SearchScholarQuery DST = Path(__file__).absolute().parents[1] / 'publications.bib' IGNORE = """ vo2014cytotoxicity takeilnatriureticpeptideisolatedfromeelbrain matchintemporal brodbeck2018transformation """.split() ACRONYMS = ['EEG', 'MEG', 'MRI'] querier = ScholarQuerier() settings = ScholarSettings() settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) querier.apply_settings(settings) query = SearchScholarQuery() query.set_phrase("eelbrain") query.set_timeframe(2012, None) query.set_include_patents(False) bib = parse_file(DST, 'bibtex') start = 0 while True: querier.send_query(query) if len(querier.articles) == 0: break # extract articles for article in querier.articles: querier.get_citation_data(article) # convert to pybtex entry data = parse_bytes(article.citation_data, 'bibtex')
def literature_search(query_terms, type='full_name'):
    """Perform a Google Scholar query for each row of *query_terms*.

    Parameters:
        query_terms: object with a ``.values`` iterable of rows; row layout
            depends on *type* -- 'full_name' rows look like
            (repo_id, phrase, start_year); any other value expects
            (repo_id, repo_name, phrase, keywords, start_year).
        type: selects the row layout above (shadows the builtin; kept for
            interface compatibility).

    Returns a list of paper records produced by process_arts(); rows with
    no search results are skipped. Sleeps 1-9 s between queries.
    """
    querier = ScholarQuerier()
    settings = ScholarSettings()
    config = ScholarConf()
    settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    querier.apply_settings(settings)
    # NOTE(review): a single query object is mutated and reused across all
    # rows, so settings from one row may carry over to the next.
    query = SearchScholarQuery()
    papers = []
    for item in query_terms.values:
        repo_id = item[0]
        if type !='full_name':
            repo_name = item[1]
            phrase = item[2]
            keywords = item[3]
            start_year = item[4]
            if keywords:
                if ',' not in keywords:
                    # set_words_some presumably expects a comma-separated
                    # list -- TODO confirm against scholar.py.
                    keywords = keywords + ','
                query.set_words_some(keywords)
            query.set_words(repo_name)
            query.set_phrase(phrase)
            phrase_text = repo_name + ', ' + phrase
        else:
            phrase = item[1]
            start_year = item[2]
            query.set_phrase(phrase)  # commontk/CTK, meoyo/AIPS
            phrase_text = phrase
        print('search papers for {} ...'.format(phrase_text))
        query.set_timeframe(start_year)
        querier.send_query(query)
        articles = querier.articles
        if len(articles)==0:
            continue
        results = process_arts(config, item[0], phrase_text, articles)
        papers = papers + results
        # Random 1-9 s delay between queries to avoid being rate-limited.
        time_delay = random.randrange(1,10)
        time.sleep(time_delay)
    return papers