def SearchScholar(options):
    """Send a Google Scholar query built from *options* and return results.

    options: dict with keys 'cluster_id', 'author', 'allw', 'some', 'none',
    'phrase', 'title_only', 'pub', 'after', 'before', 'no_patents'.

    Returns whatever scholar.get_results_objects() produces for the querier.
    """
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)

    if options['cluster_id']:
        # BUG FIX: the original used attribute access (options.cluster_id)
        # here, which raises AttributeError on a plain dict; every other
        # lookup in this function uses item access.
        query = scholar.ClusterScholarQuery(cluster=options['cluster_id'])
    else:
        query = scholar.SearchScholarQuery()
        if options['author']:
            query.set_author(options['author'])
        if options['allw']:
            query.set_words(options['allw'])
        if options['some']:
            query.set_words_some(options['some'])
        if options['none']:
            query.set_words_none(options['none'])
        if options['phrase']:
            query.set_phrase(options['phrase'])
        if options['title_only']:
            query.set_scope(True)
        if options['pub']:
            query.set_pub(options['pub'])
        if options['after'] or options['before']:
            # BUG FIX: options.after / options.before -> item access.
            query.set_timeframe(options['after'], options['before'])
        if options['no_patents']:
            query.set_include_patents(False)

    query.get_url()  # return value discarded; kept for parity with original
    querier.send_query(query)
    return scholar.get_results_objects(querier)
def get_scholar_data(paper_list):
    """Return scholar data for *paper_list*, using a disk cache when possible.

    Papers already present in the cache (matched by title) are not re-fetched;
    only missing papers trigger Google Scholar requests via get_paper_data().
    Relies on module-level helpers read_cache/save_cache and cache_file.
    """
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
    querier.apply_settings(settings)
    scholar.ScholarConf.LOG_LEVEL = 3

    cache = read_cache(cache_file)
    # BUG FIX: was ``assert (cache != None)``; identity comparison is the
    # correct test against None.
    assert cache is not None

    if cache.get('paper_list') == paper_list:
        # Use cache to reduce the number of google scholar requests
        print('Use cache from file %s' % cache_file)
    else:
        # Update cache, instead of flushing a complete new one
        print('Get data from google scholar')
        cache_paper_title = [p['title'] for p in cache['paper_list']]
        missing_paper = [
            p for p in paper_list if p['title'] not in cache_paper_title
        ]
        missing_scholar_data = [
            get_paper_data(querier, v) for v in missing_paper
        ]
        # update cache
        cache['paper_list'] += missing_paper
        cache['scholar_data'] += missing_scholar_data
        # BUG FIX: the original called save_cache here AND again below,
        # writing the identical cache twice; the unconditional flush below
        # is sufficient.

    save_cache(cache_file, cache)  # Enforce to flush cache
    return cache['scholar_data']
def citation_retriever(title):
    """Poll Google Scholar in an endless loop, printing the top hit's URL.

    NOTE: *title* is currently unused -- the query terms are hard-coded.
    Sleeps a random 15-60 s between requests to avoid being blocked.
    """
    iteration = 0
    # Set up scraper
    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())
    query = scholar.SearchScholarQuery()
    while True:
        print(iteration)
        # Set query parameters (re-applied every round, as in the original)
        query.set_author("Alan Turing")
        query.set_words("computing")
        query.set_num_page_results(1)
        querier.send_query(query)
        # Print the URL of the first article found
        print(querier.articles[0]['url'])
        pause_seconds = random.randint(15, 60)
        time.sleep(pause_seconds)
        iteration += 1
def main():
    """Read 'all_papers.csv' and collect a citation count for each title.

    Each CSV line is '<title>,<number>'.  For every title, queries Google
    Scholar and appends the citation count to the row; rows with a hit are
    collected in title_author_citation_list.
    """
    filename = 'all_papers'
    title_author_list = []
    with open(filename + '.csv', 'r') as fin:
        for line in fin.readlines():
            title_author_list.append(
                [line.split(',')[0], int(line.split(',')[1])])
    print('The number of papers ------ : ', len(title_author_list))
    print('One of paper titles ------ : ', title_author_list[99][0])

    my_querier = scholar.ScholarQuerier()
    my_querier.apply_settings(scholar.ScholarSettings())

    title_author_citation_list = []
    counter = 0
    for title_author in title_author_list:
        print('paper @ {}/{}'.format(counter, len(title_author_list)))
        my_query = scholar.SearchScholarQuery()
        my_query.set_words(title_author[0])
        my_querier.send_query(my_query)
        print(my_querier.articles)
        if len(my_querier.articles) == 0:
            print('NOT found... ', title_author[0])
        else:
            citation = my_querier.articles[0].attrs["num_citations"][0]
            # BUG FIX: list.append() returns None, so the original
            # ``list.append(title_author.append(citation))`` stored None for
            # every hit.  Mutate the row first, then collect it.
            title_author.append(citation)
            title_author_citation_list.append(title_author)
        # BUG FIX: counter was printed every iteration but never advanced.
        counter += 1
        time.sleep(20 + random.randint(0, 20))
def buscadorAvanzado(frase, words, autor, after, before):
    """Advanced Scholar search (phrase / words / author / year range).

    Empty-string arguments are skipped.  Fetches up to 40 results and
    returns them as dicts via getArticlesDict().
    """
    # nombre_directorio= str(id_user)+ "."+ str(id_proyecto)
    gs_querier = scholar.ScholarQuerier()
    gs_settings = scholar.ScholarSettings()  # created but never applied, as in the original
    busqueda = scholar.SearchScholarQuery()
    if frase != "":
        busqueda.set_phrase(frase)
    if words != "":
        busqueda.set_words(words)
    if autor != "":
        busqueda.set_author(autor)
    if after != "" or before != "":
        busqueda.set_timeframe(after, before)
    busqueda.set_num_page_results(40)
    gs_querier.send_query(busqueda)
    scholar.getArticles(gs_querier)
    articulos = getArticlesDict(gs_querier.articles)
    # if articulos is not None:
    #     moveFiles()
    #     indexarArchivos()
    return articulos
def __init__(self):
    """Create a Scholar querier configured to return BibTeX citations."""
    settings = scholar.ScholarSettings()
    settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
    querier = scholar.ScholarQuerier()
    querier.apply_settings(settings)
    self.querier = querier
    self.settings = settings
def parse(self, response):
    """Scrape the names table from the page and look each entry up on Scholar.

    The first and last table rows are layout rows and are skipped.
    """
    body = response.xpath('//*[@class="post-content"]//table//tbody')
    rows = body.xpath('//tr')
    last_index = len(rows) - 1
    for i, row in enumerate(rows):
        if i == 0 or i == last_index:
            # header/footer rows carry no usable data
            print('skipped: unusable html')
            continue
        print(i)
        name = row.xpath('td//div//text()')[1].extract()
        url = row.xpath('td//div//a/@href').extract()
        tags = row.xpath('td//ul//li//div//text()').extract()
        print(name)
        print(url)
        print(tags)
        gs_querier = scholar.ScholarQuerier()
        gs_querier.apply_settings(scholar.ScholarSettings())
        gs_query = scholar.SearchScholarQuery()
        gs_query.set_words(name)
        gs_querier.send_query(gs_query)
        scholar.txt(gs_querier, with_globals=1)
def queryGoogleScholar(andkeywords, orkeywords, pnt, filename, header):
    """Search Scholar for all *andkeywords* plus any *orkeywords* and dump
    the first 10 results as '|'-separated CSV to *filename*.

    pnt is forwarded as the ``geo`` argument of scholar.csv().
    """
    query = scholar.SearchScholarQuery()
    query.set_words(' '.join(andkeywords))
    query.set_words_some(' '.join(orkeywords))
    query.set_num_page_results(10)
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    # BUG FIX: 'settings' was created but never applied (every other query
    # helper in this file calls apply_settings).  Applying unconfigured
    # settings is a no-op in scholar.py, so this is behavior-safe.
    querier.apply_settings(settings)
    querier.send_query(query)
    scholar.csv(querier, header=header, sep='|', filename=filename, geo=pnt)
def google_scholar_query(querystr):
    """Search Scholar for the exact phrase *querystr*.

    Returns all hits concatenated as text, separated (and framed) by a
    '=========' divider line.
    """
    searcher = scholar.ScholarQuerier()
    phrase_query = scholar.SearchScholarQuery()
    phrase_query.set_phrase(querystr)
    searcher.send_query(phrase_query)
    divider = "\n=========\n"
    rendered = [article.as_txt() for article in searcher.articles]
    # leading divider, then one after every entry -- same shape as original
    return divider + "".join(text + divider for text in rendered)
def scrape_citation_count(p):
    """Return p.title's citation count from Google Scholar, or -1 on failure.

    Failures are practically only Captcha pages or connection timeouts,
    which leave querier.articles empty.
    """
    scholar.ScholarConf.COOKIE_JAR_FILE = COOKIE
    query = scholar.SearchScholarQuery()
    query.set_words(p.title)
    querier = scholar.ScholarQuerier()
    querier.send_query(query)
    try:
        # hoisted: the original evaluated this expression twice
        count = querier.articles[0].attrs['num_citations'][0]
    except Exception:
        # BUG FIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit.
        print("Google Scholar captcha :(")
        return -1
    print(count)
    return count
def query():
    """Bottle route: look a DOI up on Scholar and render its citation count."""
    doi = request.query['doi']
    response.content_type = 'text/plain'
    scholar_querier = scholar.ScholarQuerier()
    scholar_querier.apply_settings(scholar.ScholarSettings())
    doi_query = scholar.SearchScholarQuery()
    doi_query.set_num_page_results(1)
    doi_query.set_phrase(doi)
    scholar_querier.send_query(doi_query)
    scholar.txt(scholar_querier, with_globals=False)
    # NOTE(review): articles[0] raises IndexError when Scholar returns no
    # hit for the DOI -- same behavior as the original.
    citation = scholar_querier.articles[0]['num_citations']
    return template('{{citation}}', citation=citation)
def getPaper(papertitle, querier=None):
    """Look *papertitle* up on Google Scholar and return the best match.

    querier: optional scholar.ScholarQuerier to reuse across calls; a fresh
    one is created when omitted.  (BUG FIX: the original default
    ``querier=scholar.ScholarQuerier()`` was evaluated once at definition
    time, silently sharing one querier -- and its cookie/session state --
    across every caller.)

    Returns the first matching paper dict, or None when nothing is found.
    """
    if querier is None:
        querier = scholar.ScholarQuerier()
    papers = scholar.papers_by_title(papertitle, querier)
    if len(papers) > 0:
        print("[DATA COLLECTOR INFO]")
        print("  Found paper:")
        print("   %s ( %s )" % (papers[0]["title"], papers[0]["papernumber"]))
        print("   with %s citations" % papers[0]["num_citations"])
        print("")
        return papers[0]
    else:
        print("[DATA COLLECTOR INFO]")
        print("  Didn't find any papers.")
        print("")
        return None
def search_author(get_links):
    """Search Scholar for *category* and fan the first NUM_LINKS result URLs
    out to process()."""
    # from https://github.com/ckreibich/scholar.py/issues/80
    # NOTE(review): 'category' is deliberately kept unpacked twice below, as
    # in the original -- the third tuple element is discarded; confirm intent.
    se_, index, category, category, buff = get_links
    searcher = scholar.ScholarQuerier()
    searcher.apply_settings(scholar.ScholarSettings())
    category_query = scholar.SearchScholarQuery()
    category_query.set_words(category)
    searcher.send_query(category_query)
    hits = [
        art.attrs['url'][0] for art in searcher.articles
        if art.attrs['url'][0] is not None
    ]
    hits = hits[0:NUM_LINKS]
    for position, hit_url in enumerate(hits):
        process((se_, position, hit_url, category, buff))
def getAllCitingPapersIncremental(papertitle, querier=None):
    """Breadth-first generator over the citation graph rooted at *papertitle*.

    Yields each unique paper dict (de-duplicated by title), annotated with:
      depth                         -- citation distance from the seed paper
      numPapersProcessedCumulative  -- papers fetched so far (incl. dupes)
      numDuplicatesRemoved          -- duplicate titles skipped so far

    querier: optional scholar.ScholarQuerier to reuse; created fresh when
    omitted.  (BUG FIX: the original default was evaluated at definition
    time, sharing one querier instance across all callers.)
    """
    if querier is None:
        querier = scholar.ScholarQuerier()
    paper = getPaper(papertitle, querier)
    numPapersProcessedCumulative = 1
    numDuplicatesRemoved = 0
    paper["depth"] = 0
    paper["numPapersProcessedCumulative"] = numPapersProcessedCumulative
    paper["numDuplicatesRemoved"] = numDuplicatesRemoved
    allPapers = dict()
    # plain-list FIFO, as in the original (pop(0) is O(n) but queues are small)
    toCheckPapers = [paper]
    while len(toCheckPapers) > 0:
        paper = toCheckPapers.pop(0)
        if paper["title"] in allPapers:
            numDuplicatesRemoved = numDuplicatesRemoved + 1
        else:
            print("[DATA COLLECTOR INFO] Found paper: " + paper["title"])
            paper["numPapersProcessedCumulative"] = numPapersProcessedCumulative
            paper["numDuplicatesRemoved"] = numDuplicatesRemoved
            allPapers[paper["title"]] = paper
            yield paper
            if paper["papernumber"]:
                newCitations = scholar.citations_by_papernr(
                    paper["papernumber"], querier)
                numPapersProcessedCumulative = numPapersProcessedCumulative + len(
                    newCitations)
                for art in newCitations:
                    art["depth"] = paper["depth"] + 1
                    toCheckPapers.append(art)
def operation(keyword_x, index):
    """Query Scholar for *keyword_x*, format result #*index* as a citation
    line, and append it to the module-level ``citations`` list.

    Missing fields (attribute value None -> .encode raises AttributeError)
    fall back to placeholder text.
    """
    print('index: ' + str(index))
    print('keyword: ' + str(keyword_x))
    index = int(index)
    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())
    query = scholar.SearchScholarQuery()
    query.set_author("")
    query.set_words(str(keyword_x))
    query.set_num_page_results(10)
    querier.send_query(query)
    try:
        url = querier.articles[index]['url'].encode('utf-8')
    except AttributeError:
        # bytes fallback so the .decode() below works on this path too
        url = b"No URL"
    try:
        title = querier.articles[index]['title'].encode('utf-8')
    except AttributeError:
        # BUG FIX: the original assigned this fallback to ``url``, leaving
        # ``title`` unbound and raising NameError when the title was missing.
        title = b"No Title"
    try:
        year = querier.articles[index]['year'].encode('utf-8')
    except AttributeError:
        year = b"No Date"
    # Will fix this
    author = ""
    publication = ""
    line = "'" + title.decode('utf-8') + \
        "'. " + \
        year.decode('utf-8') + ", " + url.decode('utf-8') + "."
    print(line)
    citations.append(line)
def buscadorSimple(frase):
    """Simple Scholar search for the exact phrase *frase*.

    Fetches up to 40 results and returns them as dicts via getArticlesDict().
    """
    # nombre_directorio=str(id_user)+ "."+ str(id_proyecto)
    gs_querier = scholar.ScholarQuerier()
    gs_settings = scholar.ScholarSettings()  # created but never applied, as in the original
    frase_query = scholar.SearchScholarQuery()
    frase_query.set_phrase(frase)
    frase_query.set_num_page_results(40)
    gs_querier.send_query(frase_query)
    scholar.getArticles(gs_querier)
    articulos = getArticlesDict(gs_querier.articles)
    # move articles to tmp folder
    # if articulos is not None:
    #     moveFiles()
    #     indexarArchivos()
    return articulos
m = re.search(r'^.*&.*&(.*)&(.*)& \\cite\{(.*)\}.*\\\\', line) if m and m.group(1).strip() != 'License': cites.add(m.group(3)) seen = set() dois = [] with open('bibliography/biblio.bib') as bibtex_file: bib_database = bibtexparser.load(bibtex_file) for e in bib_database.entries: if 'ID' in e and e['ID'] in cites and 'doi' in e: seen.add(e['ID']) dois.append(e['doi']) notseen = cites - seen querier = scholar.ScholarQuerier() citecnts = [] for doi in dois: query = scholar.SearchScholarQuery() query.set_words(doi) querier.send_query(query) if len(querier.articles) > 0: art = querier.articles[0] txt = art.as_txt() if doi in txt: #print art['num_citations'],doi citecnts.append((doi, art['num_citations'])) else:
def main():
    """Walk cluster ids from a results backup file and scrape citing articles.

    For each (currently only the first) cluster id in
    '../results/5556531000720111691.csv.bkup', fetches the cluster's first
    article, then pages through its citing articles and writes
    'citing_cluster_id | cluster_id | raw soup' rows to a per-cluster CSV.
    Sleeps a random interval between pages to avoid being blocked.
    """
    data = pd.DataFrame()
    f = open('../results/5556531000720111691.csv.bkup', 'r')
    for idx, line in enumerate(f):
        data_values = line.split(',', 2)
        to_append = pd.DataFrame([data_values])
        data = data.append(to_append)
    f.close()
    #
    # for each cluster id
    #
    for from_cluster_id in range(data.shape[0])[:1]:  # just get the first one, for now
        print(from_cluster_id)
        cluster_id = data.iloc[from_cluster_id, 0]
        try:
            cluster_id = int(cluster_id)
        except ValueError:
            continue
        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        query = scholar.SearchScholarQuery()
        query_cluster = scholar.ClusterScholarQuery(cluster=cluster_id)
        querier.send_query(query_cluster)
        #
        # for each article in search results
        #
        for article in querier.articles[:1]:  # get first article result, for now
            article.attrs.get('url_citations')[0]
            current_article = GoogleScholarArticleSimple()
            current_article.cluster_id = cluster_id
            current_article.set_search_soup().set_num_search_results(
            ).set_num_search_pages()
            # gs_r = current_article.soup.find_all("div", class_="gs_r")
            #
            # for each search page result of citing article
            #
            # BUG FIX: was ``current.article.num_search_pages`` -- a typo for
            # the local ``current_article`` (NameError at runtime).
            for page_idx, search_page_number in enumerate(
                    range(current_article.num_search_pages)
                    [:1]):  # get first page result for now
                url = citations_url_generic.format(search_page_number * 10,
                                                   from_cluster_id)
                r = requests.get(url)
                soup = BeautifulSoup(r.text)
                gs_r = soup.find_all("div", class_="gs_r")
                # print(len(gs_r))
                output_file_path = '../results/01-{}.csv'.format(
                    from_cluster_id)
                f = open(output_file_path, 'w')
                f.close()
                #
                # for each search result
                #
                for citing_article_soup in gs_r:
                    result_article = DanGoogleScholarArticle(
                        soup=citing_article_soup)
                    result_article.parse_title()
                    # print(result_article.title)
                    result_article.parse_cluster_id()
                    # seed_cluster_id = result_article.cluster_id
                    # print(seed_cluster_id)
                    f = open(output_file_path, 'a+')
                    str_to_write = '{}\t|\t{}\t|\t{}\n'.\
                        format(result_article.cluster_id, cluster_id,
                               citing_article_soup)
                    f.write(str_to_write)
                    f.close()
                sleep_time = random() * randint(10, 100)
                # BUG FIX: the original printed undefined name ``page_number``;
                # the loop binds ``search_page_number``.
                print('cluster_id: {}, page: {}, sleeping: {}'.format(
                    from_cluster_id, search_page_number, sleep_time))
                sleep(sleep_time)
def main():
    """Command-line driver: parse options, build and send a Scholar query,
    and print the results in the requested output format.

    Returns 0 on success, 1 on a usage/option error (shell exit-code style).
    """
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)

    # --- query construction options -------------------------------------
    group = optparse.OptionGroup(parser, 'Query arguments',
                                 'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    parser.add_option_group(group)

    # --- output format options ------------------------------------------
    group = optparse.OptionGroup(parser, 'Output format',
                                 'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    # --- miscellaneous options ------------------------------------------
    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # clamp repeated -d flags to the library's maximum debug level
        options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = options.debug
        sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print('Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    # Map the --citation flag onto the library's citation-format constants.
    if options.citation == 'bt':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    # Build either a cluster query or a keyword search query.
    if options.cluster_id:
        query = sc.ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = sc.SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        # Scholar caps page size; clamp the user's request to that maximum.
        options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    # Emit results in the chosen format (plain text is the default).
    if options.csv:
        sc.csv(querier)
    elif options.csv_header:
        sc.csv(querier, header=True)
    elif options.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
from __future__ import print_function import scholar as gs import time import pandas as pd from random import betavariate querier = gs.ScholarQuerier() q = gs.SearchScholarQuery() df = pd.read_csv('JPE.csv') titles = df.Title output = df.iloc[0:20] titles = titles[0:20] # for _title in titles: def query(_title): time.sleep(betavariate(2, 2)/2) # to prevent overrequesting Google's server q.set_phrase(_title) querier.send_query(q) q_title = querier.articles[0].attrs['title'][0] q_num_cit = querier.articles[0].attrs['num_citations'][0] print((q_title, q_num_cit)) return (q_title, q_num_cit) (output['q_title'], output['q_num_cit']) = zip(*titles.map(query)) output.to_csv('JPE2.csv')
def main():
    """For each regexp in 'regexp.in', run three Scholar search variants and
    plot a per-regexp histogram of result years.

    Returns (results_opt1, results_opt2, results_opt3): one entry per regexp,
    each produced by the module-level statistics() helper.
    """
    # variables
    regexp = list()
    standard_handler = 'biotechnology'

    # Read the regexps from file.  (Also stops shadowing the py2 builtin
    # ``file`` and guarantees the handle is closed.)
    with open('regexp.in', 'r') as infile:
        for line in infile.readlines():
            regexp.append(line)

    # Scholar parser variables
    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()  # created but never applied, as before

    # Variables for metric
    results_opt1 = list()
    results_opt2 = list()
    results_opt3 = list()

    # loop for each regexp in file
    for index, item in enumerate(regexp):
        query1 = sc.SearchScholarQuery()
        query2 = sc.SearchScholarQuery()
        query3 = sc.SearchScholarQuery()
        # Fixed 1000 because the specificity of query
        query1.set_num_page_results(1000)
        query2.set_num_page_results(1000)
        query3.set_num_page_results(1000)
        # remove new line
        item = item.rstrip()

        # I am using three types of searches because it is not clear how the
        # search handles more than 1 mandatory expression.

        # Search 1:
        # words = ['ascidia curvata', 'biotechnology']
        # phrase = []
        query1.set_words(list([item, standard_handler]))
        querier.send_query(query1)
        result = statistics(querier, item, opt=1)
        results_opt1.append(result)

        # Search 2:
        # words = ['biotechnology']
        # phrase = ['ascidia curvata']
        query2.set_words(list([standard_handler]))
        query2.set_phrase(item)
        querier.send_query(query2)
        result = statistics(querier, item, opt=2)
        results_opt2.append(result)

        # Search 3:
        # words = ['ascidia', 'curvata', 'biotechnology']
        # phrase = []
        split_regexp = item.split()
        split_regexp.append(standard_handler)
        query3.set_words(split_regexp)
        querier.send_query(query3)
        result = statistics(querier, item, opt=3)
        results_opt3.append(result)

        # You may want to adjust the binning of the data
        # If you want to see citations check results_opt#[0][2]
        # BUG FIX: all three labels previously reported results_opt1[0][0]
        # (a copy-paste error); each option now reports its own count for
        # the current regexp.
        plt.hist(results_opt1[index][1], alpha=0.5,
                 label='Option 1 (Number of results: %s)' % results_opt1[index][0])
        plt.hist(results_opt2[index][1], alpha=0.5,
                 label='Option 2 (Number of results: %s)' % results_opt2[index][0])
        plt.hist(results_opt3[index][1], alpha=0.5,
                 label='Option 3 (Number of results: %s)' % results_opt3[index][0])
        plt.legend()
        plt.ylabel('Number of Articles')
        plt.xlabel('Year')
        plt.title(item.upper())
        plt.tight_layout()
        plt.savefig(item.replace(' ', '-') + 'histogram.png')
        plt.clf()
        # (the original ``del query1/query2/query3`` were no-ops -- the
        # names are rebound at the top of the next iteration)

    # each array is N dimensional, for N regexp specified in the input file
    return results_opt1, results_opt2, results_opt3
def main():
    # Exploratory scraper: fetch a seed article ("The psychology of
    # attitudes"), then page through the articles citing it and dump
    # 'citing_cluster_id | seed_cluster_id | raw soup' rows to a CSV named
    # after the seed's cluster id.  Sleeps randomly between pages.
    # NOTE(review): several bare expressions below (e.g. ``type(SEED_ARTICLE)``,
    # ``citations_url_generic``) are leftovers from a notebook session -- they
    # have no effect as statements.
    # scholar_article = scholar.ScholarArticle()
    # scholar_article_parser = scholar.ScholarArticleParser()
    querier = scholar.ScholarQuerier()
    # settings = scholar.ScholarSettings()
    query = scholar.SearchScholarQuery()
    query.set_author('eagly')
    query.set_words('psychology of attitudes')
    querier.send_query(query)
    querier.articles[0].as_txt()
    querier.articles[0].attrs
    SEED_ARTICLE = querier.articles[0]
    # sanity check that the expected seed paper came back first
    assert (
        SEED_ARTICLE.attrs.get('title')[0] == 'The psychology of attitudes.')
    type(SEED_ARTICLE)
    SEED_ARTICLE.attrs['cluster_id'][0]
    citations_url = SEED_ARTICLE.attrs.get('url_citations')[0]
    citations_url
    # 'cites=...' is hard-coded to the seed paper's cluster id; '{}' takes
    # the result offset (page * 10)
    citations_url_generic = 'https://scholar.google.com/scholar?start={}&hl=en&as_sdt=2005&sciodt=0,5&cites=5556531000720111691&scipsc='
    citations_url_generic
    citations_url_generic.format('0')
    r = requests.get(citations_url_generic.format('0'))
    soup = BeautifulSoup(r.text)
    citation_results = CitationResults(soup=soup)
    citation_results.set_num_search_results().set_num_search_pages()
    citation_results.num_results
    num_search_pages = citation_results.num_search_pages
    num_search_pages
    gs_r = soup.find_all("div", class_="gs_r")
    len(gs_r)
    citing_article_soup = gs_r[2]
    result_article = DanGoogleScholarArticle(soup=citing_article_soup)
    result_article.parse_title()
    result_article.title
    result_article.parse_cluster_id()
    SEED_CLUSTER_ID = result_article.cluster_id
    SEED_CLUSTER_ID
    # truncate the output file before appending below
    output_file_path = '../results/{}.csv'.\
        format(SEED_ARTICLE.attrs['cluster_id'][0])
    f = open(output_file_path, 'w')
    f.close()
    # NOTE(review): the loop names look swapped -- enumerate yields
    # (index, value), so ``page_url`` is the page index; harmless here since
    # enumerate(range(n)) makes them equal.
    for page_url, page_number in enumerate(range(num_search_pages)):
        r = requests.get(citations_url_generic.format(page_url * 10))
        soup = BeautifulSoup(r.text)
        citations_url_generic.format('0')
        gs_r = soup.find_all("div", class_="gs_r")
        # print(len(gs_r))
        for citing_article_soup in gs_r:
            result_article = DanGoogleScholarArticle(soup=citing_article_soup)
            result_article.parse_title()
            # print(result_article.title)
            result_article.parse_cluster_id()
            # seed_cluster_id = result_article.cluster_id
            # print(seed_cluster_id)
            f = open(output_file_path, 'a+')
            str_to_write = '{}\t|\t{}\t|\t{}\n'.\
                format(result_article.cluster_id, SEED_CLUSTER_ID,
                       citing_article_soup)
            f.write(str_to_write)
            f.close()
        # random back-off between result pages to avoid being blocked
        sleep_time = random() * randint(10, 100)
        print('page: {}, sleeping: {}'.format(page_number, sleep_time))
        sleep(sleep_time)
def getAuthorsORG(self):
    # Python 2 code (print statements, xrange, urllib2).
    # For each of self.author's publications found on Google Scholar, scrape
    # the matching ResearchGate page for its co-authors and their
    # institutions.  A publication's co-author list is kept only when
    # self.author appears among the scraped co-authors (via stringMatching).
    # Appends [name, institution] lists to self.authorList and returns it.
    # NOTE(review): reads self.author, self.keywords, self.ystart, self.yend,
    # self.count, self.authorList -- presumably set by the enclosing class's
    # __init__; confirm against the class definition.
    query = scholar.SearchScholarQuery()
    query.set_author(self.author)
    query.set_phrase(self.keywords)
    query.set_timeframe(self.ystart, self.yend)
    query.set_num_page_results(self.count)
    query.set_include_patents(False)
    query.set_include_citations(False)
    query.set_scope(True)
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
    querier.apply_settings(settings)
    querier.send_query(query)
    print "Query Sent"
    for i in xrange(0, min(len(querier.articles), self.count)):
        tempList = []
        print str(i) + "processed"
        # normalize the title to plain ASCII for use in the search URL
        pubUnicode = querier.articles[i].attrs['title'][0]
        pubName = unicodedata.normalize('NFKD', pubUnicode).encode(
            'ascii', 'ignore')
        pubName.replace(" ", "+")
        pubSearchUrl = "https://www.researchgate.net/publicliterature.PublicLiterature.search.html?type=keyword&search-keyword=" + pubName.replace(
            " ", "+") + "&search-abstract=&search=Search"
        searchPage = urllib2.urlopen(pubSearchUrl)
        soupPub = BeautifulSoup(searchPage)
        # take the first publication hit on the search page
        pubUrl = "https://www.researchgate.net/" + soupPub.select(
            ".ga-publication-item")[0]['href']
        pubPage = urllib2.urlopen(pubUrl)
        soupPub = BeautifulSoup(pubPage)
        authorInArtical = False
        # walk every listed co-author, collecting (name, institution) pairs
        for j in range(0, len(soupPub.select(".ga-top-coauthor-name"))):
            authorUrl = "https://www.researchgate.net/" + soupPub.select(
                ".ga-top-coauthor-name")[j].a['href']
            pageAuthor = urllib2.urlopen(authorUrl)
            soupAuthor = BeautifulSoup(pageAuthor)
            tempList.append([
                unicodedata.normalize(
                    'NFKD', soupPub.select(".ga-top-coauthor-name")
                    [j].text.strip()).encode('ascii', 'ignore'),
                unicodedata.normalize(
                    'NFKD', soupAuthor.select(".header-institution-name")
                    [0].text.strip()).encode('ascii', 'ignore')
            ])
            if stringMatching(
                    self.author,
                    unicodedata.normalize(
                        'NFKD', soupPub.select(".ga-top-coauthor-name")
                        [j].text.strip()).encode('ascii', 'ignore')):
                authorInArtical = True
            else:
                print unicodedata.normalize(
                    'NFKD', soupPub.select(".ga-top-coauthor-name")
                    [j].text.strip()).encode('ascii', 'ignore')
        # keep the co-author list only when the target author was present
        if authorInArtical:
            self.authorList.append(tempList)
            print "In" + str(len(tempList))
    return self.authorList