Пример #1
0
def process(document):
    """Tag *document* with the citation count Scholar reports for its title.

    While the module-level ``save_cookie`` flag is truthy, a throw-away
    "quantum theory" query is sent first, the querier's cookies are saved and
    the flag is cleared.

    Returns the ``num_citations`` value of the first Scholar hit, or ``None``
    when there are no hits or the first hit's title is less similar to
    ``document.title`` than ``min_title_match_ratio``.
    """
    global save_cookie

    querier = ScholarQuerier()
    search = SearchScholarQuery()

    if save_cookie:
        # First paper only: issue a dummy query, then persist the cookies.
        search.set_phrase("quantum theory")
        querier.send_query(search)
        querier.save_cookies()
        save_cookie = False

    search.set_phrase(document.title)
    querier.send_query(search)
    hits = querier.articles
    if len(hits) == 0:
        return None

    similarity = difflib.SequenceMatcher(
        None, document.title, hits[0]['title']).ratio()
    if similarity < min_title_match_ratio:
        return None

    # Replace the citation tag, then also append the raw count as its own tag.
    tags = update_tags(
        document.tags,
        [(tag_pattern, ncitations_to_tag(hits[0]['num_citations']))])
    tags.append(str(hits[0]['num_citations']))
    document.update(tags=tags)

    return hits[0]['num_citations']
Пример #2
0
    def pdf_url(self, phrase):
        """Search scholar.google.com for *phrase* and return a PDF url.

        *phrase* is usually the paper title.  Only the first search hit is
        considered.  When its PDF url is missing or blacklisted and the hit
        carries a cluster id, the articles of that cluster are scanned and
        the first url that is neither None nor blacklisted is taken instead.

        Returns the selected PDF url.  None is returned when the search
        yields no articles (after calling ``self.on_blocked()``); the result
        may also still be None, or the original blacklisted url, when the
        cluster search finds no better candidate.

        TODO: Actually check if the file can be downloaded... if error,
        continue with next candidate
        """
        # Run initial query
        query = SearchScholarQuery()
        query.set_phrase(phrase)  # --phrase "<phrase>"
        query.set_num_page_results(1)  # -c 1
        self.querier.send_query(query)

        if len(self.querier.articles) == 0:
            self.status.warning('No search results. Blocked maybe?')
            # TODO: Open result page in a browser (to answer the captcha)
            self.on_blocked()
            return None  # Absolutely nothing returned; Abort

        self.timeout = TIMEOUT

        # Initial PDF url
        art = self.querier.articles[0]
        pdf_url = strip_url(art.attrs['url_pdf'][0])

        # Some status
        self.status.result('Title', art.attrs['title'][0])
        self.status.result('Year', art.attrs['year'][0])
        self.status.result('PDF', pdf_url)

        # Check PDF url
        if pdf_url is None or is_blacklisted(pdf_url):
            self.status.result('URL', art.attrs['url'][0])

            # Article found, but no PDF. Resort to searching by cluster.
            if art.attrs['cluster_id'][0] is not None:
                cluster = ClusterScholarQuery(cluster=art.attrs['cluster_id'][0])
                self.querier.send_query(cluster)

                # Walk through results
                for cart in self.querier.articles:
                    curl = strip_url(cart.attrs['url_pdf'][0])
                    if curl is not None and not is_blacklisted(curl):
                        # Valid PDF found!
                        pdf_url = curl
                        # More status
                        self.status.result('Title', cart.attrs['title'][0])
                        self.status.result('Year', cart.attrs['year'][0])
                        self.status.result('PDF', pdf_url)
                        # We have a result, abort search
                        break

                # pdf_url can still be None

        # NOTE(review): is_book() may receive None here -- confirm that the
        # helper tolerates it.
        if is_book(pdf_url) or is_book(art.attrs['url'][0]):
            self.status.warning('Might be a book')

        return pdf_url
Пример #3
0
def papers_by_query_api(request):
    """Return the top Scholar hit for the ``phrase`` GET parameter as JSON.

    Responds with ``HttpResponseBadRequest`` for non-GET requests and for a
    missing or empty ``phrase``.  Otherwise the response is
    ``{'papers': [entry]}`` where ``entry`` describes the first article, or
    holds empty placeholder values when the search returned nothing.
    """
    if request.method != 'GET':
        return HttpResponseBadRequest()

    phrase = request.GET.get('phrase', '')
    if not phrase:
        return HttpResponseBadRequest()

    search = SearchScholarQuery()
    search.set_phrase(phrase)
    querier = ScholarQuerier()
    querier.send_query(search)
    papers = querier.articles

    if papers:
        entry = {
            'title': papers[0]['title'],
            'id': papers[0]['cluster_id'],
            'url': papers[0]['url'],
            'excerpt': papers[0]['excerpt'],
        }
    else:
        # Placeholder entry so the response shape stays the same.
        entry = {'title': '', 'id': 0, 'url': '', 'excerpt': ''}

    return JsonResponse({'papers': [entry]})
Пример #4
0
def setCitationByTitle(paper_title):
    """Build a single-result Scholar query for *paper_title* by "Si Chen".

    The query is only constructed and returned; nothing is sent here.
    """
    scholar_query = SearchScholarQuery()
    scholar_query.set_author("Si Chen")
    scholar_query.set_phrase(paper_title)
    scholar_query.set_num_page_results(1)
    return scholar_query
def literature_search(query_terms, type='full_name'):
    """Run one Google Scholar query per row of *query_terms*.

    Args:
        query_terms: object whose ``.values`` iterates over rows (presumably
            a pandas DataFrame -- confirm with the caller).  With
            ``type == 'full_name'`` a row is ``(repo_id, phrase,
            start_year)``; otherwise it is ``(repo_id, repo_name, phrase,
            keywords, start_year)``.
        type: row layout selector, see above.

    Returns:
        list: the results of ``process_arts`` for every row that produced
        at least one article, concatenated in row order.
    """
    querier = ScholarQuerier()
    settings = ScholarSettings()
    config = ScholarConf()
    settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    querier.apply_settings(settings)

    papers = []
    for item in query_terms.values:
        repo_id = item[0]

        # Use a fresh query for every row so that words/keywords set for a
        # previous row cannot leak into this search.
        query = SearchScholarQuery()

        if type != 'full_name':
            repo_name = item[1]
            phrase = item[2]
            keywords = item[3]
            start_year = item[4]
            if keywords:
                # A single keyword gets a trailing comma appended.
                if ',' not in keywords:
                    keywords = keywords + ','
                query.set_words_some(keywords)

            query.set_words(repo_name)
            query.set_phrase(phrase)

            phrase_text = repo_name + ', ' + phrase
        else:
            phrase = item[1]
            start_year = item[2]

            query.set_phrase(phrase)  # commontk/CTK, meoyo/AIPS
            phrase_text = phrase

        print('search papers for {} ...'.format(phrase_text))
        query.set_timeframe(start_year)
        querier.send_query(query)
        articles = querier.articles
        if len(articles) == 0:
            continue

        papers.extend(process_arts(config, repo_id, phrase_text, articles))

        # Random 1-9 second pause after each row that returned articles.
        time.sleep(random.randrange(1, 10))

    return papers
Пример #6
0
def query_scholar_for_papers(author, searchstring):
    """Return the BibTeX citation of the first Scholar hit as a string.

    The search is restricted to *author* and the phrase *searchstring*, with
    5 results per page.  When nothing is found, a fixed explanatory message
    is returned instead.
    """
    querier = ScholarQuerier()
    bibtex_settings = ScholarSettings()
    bibtex_settings.set_citation_format(bibtex_settings.CITFORM_BIBTEX)
    bibtex_settings.set_per_page_results(5)
    querier.apply_settings(bibtex_settings)

    search = SearchScholarQuery()
    search.set_author(author)
    search.set_phrase(searchstring)
    querier.send_query(search)

    if len(querier.articles) > 0:
        return querier.articles[0].as_citation() + '\n'
    return 'Ooopsie. No results. Maybe we ran over the request limit?'
def process(document):
    """Update the citation tag of *document* from its first Scholar hit.

    Returns the hit's ``num_citations`` value, or ``None`` when Scholar
    returns no articles or the titles match with a ratio below
    ``min_title_match_ratio``.
    """
    querier = ScholarQuerier()
    search = SearchScholarQuery()
    search.set_phrase(document.title)
    querier.send_query(search)

    found = querier.articles
    if len(found) == 0:
        return None

    matcher = difflib.SequenceMatcher(None, document.title, found[0]['title'])
    if matcher.ratio() < min_title_match_ratio:
        return None

    current_tags = document.tags
    replacement = (tag_pattern, ncitations_to_tag(found[0]['num_citations']))
    document.update(tags=update_tags(current_tags, [replacement]))

    return found[0]['num_citations']
Пример #8
0
def process(document):
    """Look up *document* on Scholar and refresh its citation tag.

    The first returned article is compared against ``document.title``; if
    there is no article, or the similarity ratio is below
    ``min_title_match_ratio``, nothing is changed and ``None`` is returned.
    Otherwise the document's tags are updated and the article's
    ``num_citations`` value is returned.
    """
    scholar_client = ScholarQuerier()
    title_query = SearchScholarQuery()
    title_query.set_phrase(document.title)
    scholar_client.send_query(title_query)

    results = scholar_client.articles
    if len(results) == 0:
        return None

    ratio = difflib.SequenceMatcher(
        None, document.title, results[0]['title']
    ).ratio()
    if ratio < min_title_match_ratio:
        return None

    tags_before = document.tags
    tag_for_count = ncitations_to_tag(results[0]['num_citations'])
    tags_after = update_tags(tags_before, [(tag_pattern, tag_for_count)])
    document.update(tags=tags_after)

    return results[0]['num_citations']
Пример #9
0
def query_scholar_for_papers(author, searchstring):
    """Search Scholar for *searchstring* by *author*; return a citation string.

    Citations are requested in BibTeX format with 5 results per page.  The
    first article's citation (plus a trailing newline) is returned, or a
    fixed "no results" message when the result list is empty.
    """
    querier = ScholarQuerier()
    prefs = ScholarSettings()
    prefs.set_citation_format(prefs.CITFORM_BIBTEX)
    prefs.set_per_page_results(5)
    querier.apply_settings(prefs)

    request = SearchScholarQuery()
    request.set_author(author)
    request.set_phrase(searchstring)

    querier.send_query(request)

    if not len(querier.articles) > 0:
        return 'Ooopsie. No results. Maybe we ran over the request limit?'

    return querier.articles[0].as_citation() + '\n'
Пример #10
0
def papers_by_query_api(request):
    """JSON endpoint: first Scholar article matching the ``phrase`` parameter.

    Only GET requests with a non-empty ``phrase`` are served; anything else
    gets ``HttpResponseBadRequest``.  The payload always contains exactly one
    entry under ``'papers'`` -- the first article, or blank placeholder
    values when the search came back empty.
    """
    if request.method != 'GET':
        return HttpResponseBadRequest()

    phrase = request.GET.get('phrase', '')
    if not phrase:
        return HttpResponseBadRequest()

    scholar_query = SearchScholarQuery()
    scholar_query.set_phrase(phrase)
    client = ScholarQuerier()
    client.send_query(scholar_query)
    papers = client.articles

    if not papers:
        paper = {'title': '', 'id': 0, 'url': '', 'excerpt': ''}
    else:
        paper = {
            'title': papers[0]['title'],
            'id': papers[0]['cluster_id'],
            'url': papers[0]['url'],
            'excerpt': papers[0]['excerpt'],
        }
    return JsonResponse({'papers': [paper]})
def process_document(document_id, skip_documents=False):
    """Refresh the citation tag of one Mendeley document from Google Scholar.

    Args:
        document_id: id passed to ``mendeley.document_details`` and
            ``mendeley.update_document``.
        skip_documents: when true, documents that already carry a tag
            matching ``tag_pattern`` are left untouched.

    Returns:
        False when the document was skipped because of *skip_documents*,
        True otherwise (including when Scholar found nothing or the titles
        differ too much).

    An ``urllib2.HTTPError`` from the Scholar request is printed and the
    process exits with status -1.
    """
    document = mendeley.document_details(document_id)
    if skip_documents and has_citation_tag(document["tags"], tag_pattern):
        return False

    try:
        query = SearchScholarQuery()
        query.set_phrase(document["title"])
        scholar.send_query(query)
        scholar_articles = scholar.articles
    except urllib2.HTTPError as e:
        print(e.msg)
        print(e.reason)
        sys.exit(-1)

    if len(scholar_articles) == 0:
        print("No scholar articles found for " + document["title"])
        return True

    best = scholar_articles[0]
    year = document["year"] if "year" in document else -1

    print("%s\t\t%s\tScholar: %s" % (best["num_citations"], year, best["title"]))
    print("\t\t\tMendeley: %s" % document["title"])

    title_match_ratio = difflib.SequenceMatcher(
        None, document["title"], best["title"]).ratio()
    titles_match = not (title_match_ratio < min_title_match_ratio)
    if not titles_match:
        print("Paper titles differ too much, skipping (match ratio: %f)." % title_match_ratio)

    # Pause after every Scholar request, whether or not the titles matched.
    time.sleep(randint(min_sleep_time_sec, max_sleep_time_sec))

    if titles_match:
        old_tags = document["tags"]
        citation_tag = ncitations_to_tag(best["num_citations"])
        new_tags = update_tags(old_tags, [(tag_pattern, citation_tag)])
        # Use the function's own parameter; the former name ``docid`` is not
        # defined in this function.
        mendeley.update_document(document_id, document={"tags": new_tags})
    return True
Пример #12
0
def get_url(querier, phrase):
    """Return the url of the first Scholar article matching *phrase*.

    Args:
        querier: object providing ``send_query(query)`` and an ``articles``
            iterable whose items expose ``attrs['url']`` as a 3-item
            sequence with the url first.
        phrase: title / phrase to search for (title-only search, one result
            per page).

    Returns:
        The first article's url, or None when there are no articles or the
        lookup raised an exception.
    """
    query = SearchScholarQuery()
    query.set_phrase(phrase)
    query.set_scope(True)  # title search only
    query.set_num_page_results(1)

    try:
        querier.send_query(query)

        # Only the first article is of interest; return as soon as we see it.
        for art in querier.articles:
            url, _, _ = art.attrs['url']
            return url
    except Exception:
        # Best-effort lookup: any failure is reported as "no url".  Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return None
    return None
Пример #13
0
def get_results_for(title, author):
    """Query Scholar for *title* by *author* and build a reference from BibTeX.

    A title-only, single-result query is sent with BibTeX as citation
    format.  The first article's citation is printed and split into lines;
    the first line supplies the reference type and id, the remaining
    ``name = {value}`` lines supply the fields.  The reference type is
    looked up as a callable of the same name in this module, and its
    parameter names (after the first) define which fields are required.
    Missing types and missing fields are requested interactively via
    ``raw_input``.

    Returns the result of ``<type>(refid, *values).__getprintable__(True)``
    for the first article; falls through (returning None) when the query
    yields no articles.

    NOTE(review): relies on Python 2 only features (``print`` statement,
    ``raw_input``, ``func_code``).
    """
    
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(title)
    query.set_num_page_results(1)
    query.set_scope(True)

    settings = ScholarSettings()
    settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
 
    querier = ScholarQuerier()
    querier.apply_settings(settings)
    querier.send_query(query)

    for art in querier.articles:
        
        print art.as_citation();
        
        # First line is presumably of the form "@type{id," -- the slice drops
        # the first and last character, then the text is split at "{".
        bibtex_split = art.as_citation().split("\n")
        reftype = bibtex_split[0][1:-1].split("{")[0].lower(); 
        refid = bibtex_split[0][1:-1].split("{")[1].lower(); 
        bibtex_split.remove(bibtex_split[0])
    
        #print reftype + " " + refid + " " + str(bibtex_split)
    
        thismodule = sys.modules[__name__]

        # Resolve the reference type to a module-level callable; keep asking
        # the user until a name that exists in this module is given.
        while(True):
            
            try:
                # Parameter names of the callable, excluding the first one.
                features_of_type = getattr(thismodule, reftype).func_code.co_varnames[ 1: getattr(thismodule, reftype).func_code.co_argcount ]
                break;
            except AttributeError:
                var = raw_input("Type " + reftype + " not recongised, please enter a known type: ");
                reftype = var;
        
        # Repeat until every required feature has a value; each pass
        # re-parses bibtex_split, which grows when the user supplies values.
        while (True):
            # Nine empty slots, indexed by position in features_of_type.
            arranged_name = []
            arranged_value = []
            for i in range(1, 10):
                arranged_name.append(None)
                arranged_value.append(None)
           
            for line in bibtex_split:
                if ( line.find("=") > -1 ):
                    # Only the text between the first and second "=" is
                    # taken as the value.
                    stored_name = line.split("=")[0].strip()
                    stored_value = line.split("=")[1].strip();
                    # Drop the first character and everything from the last
                    # "}" onwards.
                    stored_value = stored_value[1:-(len(stored_value)-stored_value.rfind("}"))]
                    if stored_name in features_of_type:
                        arranged_name[features_of_type.index(stored_name)] = stored_name
                        arranged_value[features_of_type.index(stored_name)] = stored_value
              
            # Keep the leading run of filled slots (up to the first None).
            # NOTE(review): index(None) raises ValueError if all nine slots
            # are filled -- confirm no type has that many features.
            short_arranged_name = arranged_name[ 0 : arranged_name.index(None)];
            short_arranged_value = arranged_value[ 0 : arranged_value.index(None)];
            
            if len(short_arranged_name) == len(features_of_type):
                return getattr(thismodule, reftype)(refid, *short_arranged_value).__getprintable__(True)
            else:
                # Ask for each missing feature; answering anything other than
                # "Y" leaves it missing, so it is asked again on the next pass.
                for feature in features_of_type:
                    if ( feature not in arranged_name ):
                        var = raw_input(feature + " is not provided by the retrieved bibtex entry. Would you like to enter it now? (Y) or (N)");
                        if var == "Y":
                            var = raw_input("Enter value for " + feature + ": ");
                            bibtex_split.append(feature + " = {" + var + "}");
# Script: look up every title in the 'Qatar_Scopus' sheet of the workbook
# given as first command-line argument and collect
# [title, citations, year, url] rows in ``alldata``.
# NOTE(review): ``querier`` and ``settings`` are created before this point.
querier.apply_settings(settings)
query = SearchScholarQuery()
query.set_scope(True)
alldata = []
counter = 1

xlsxfile = sys.argv[1]
wb = load_workbook(xlsxfile, use_iterators=True)
print(wb.get_sheet_names())
ws = wb.get_sheet_by_name('Qatar_Scopus')

for row in ws.iter_rows(row_offset=1):
    if row[0].value is not None:
        title = row[0].value.encode("utf-8")
        query.set_phrase(title)
        query.set_num_page_results(1)
        querier.send_query(query)

        # onecsv() yields a "|"-separated record or None; fields 1, 2 and 3
        # are read as url, year and citation count.
        x = onecsv(querier)
        if x is not None:
            fields = x.split("|")
            year = fields[2]
            numcit = fields[3]
            weburl = fields[1]
        else:
            year = None
            numcit = None
            weburl = None

        temp = [title, numcit, year, weburl]
        alldata.append(temp)

        # Single formatted string keeps the output identical to the former
        # comma-separated print statement while also parsing on Python 3.
        print("Title:  %s query number:  %s No. citations:  %s" % (title, counter, numcit))
        print("sleeeping for 5 seconds")
        sleep(5)
        counter = counter + 1
Пример #15
0
DST = Path(__file__).absolute().parents[1] / 'publications.bib'
IGNORE = """
vo2014cytotoxicity
takeilnatriureticpeptideisolatedfromeelbrain
matchintemporal
brodbeck2018transformation
""".split()
ACRONYMS = ['EEG', 'MEG', 'MRI']

querier = ScholarQuerier()
settings = ScholarSettings()
settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
querier.apply_settings(settings)
query = SearchScholarQuery()
query.set_phrase("eelbrain")
query.set_timeframe(2012, None)
query.set_include_patents(False)

bib = parse_file(DST, 'bibtex')
start = 0
while True:
    querier.send_query(query)
    if len(querier.articles) == 0:
        break
    # extract articles
    for article in querier.articles:
        querier.get_citation_data(article)
        # convert to pybtex entry
        data = parse_bytes(article.citation_data, 'bibtex')
        assert len(data.entries) == 1
DST = Path(__file__).absolute().parents[1] / 'publications.bib'
IGNORE = """
vo2014cytotoxicity
takeilnatriureticpeptideisolatedfromeelbrain
matchintemporal
brodbeck2018transformation
""".split()
ACRONYMS = ['EEG', 'MEG', 'MRI']

querier = ScholarQuerier()
settings = ScholarSettings()
settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
querier.apply_settings(settings)
query = SearchScholarQuery()
query.set_phrase("eelbrain")
query.set_timeframe(2012, None)
query.set_include_patents(False)


bib = parse_file(DST, 'bibtex')
start = 0
while True:
    querier.send_query(query)
    if len(querier.articles) == 0:
        break
    # extract articles
    for article in querier.articles:
        querier.get_citation_data(article)
        # convert to pybtex entry
        data = parse_bytes(article.citation_data, 'bibtex')
Пример #17
0
df = pd.read_csv("../data_seperate_sheet/Profile_Publications_Standard.csv")

#cites =[]
#citation_list = [] # [year of pub]

with open('citations_counts_25307.csv', "w") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for k,v in df['Article Title'][25307:].iteritems(): 
        # why is df['Article Title'] of type dict??
        
        querier = ScholarQuerier()
        settings = ScholarSettings()
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
        querier.apply_settings(settings)
        query = SearchScholarQuery()
        query.set_phrase(v)
        query.set_scope(True)
        querier.send_query(query)
        if querier.articles:
            cites=querier.articles[0].__getitem__('num_citations')
            citation_list=querier.articles[0].__getitem__('url_citations')
        else:
            cites=0
            citation_list= ""
            
    #    json_results = []
    #    file_name = 'query_data/cites_for_article_'+str(k+1)+'.json'
    #    for art in querier.articles:
    #        json_results.append(
    #            {key: art.attrs[key][0] for key in art.attrs.keys()})
    #    with open(file_name, 'wb') as f: