def main(dirname, fname): f = get_file_offline(dirname, fname) doc = html.parse(f) canonical_url = doc.xpath('.//link[@rel="canonical"]')[0].get("href") parsed = urlparse.urlsplit(canonical_url) query = urlparse.parse_qs(parsed.query) print query['user'][0]
def main(dirname, fname):
    """Report whether a cached Scholar listing is the last page.

    Reads the cached HTML file `fname` under `dirname` and inspects
    the "next" pagination button (#gsc_bpf_next): writes 'DONE' to
    stdout when the button is disabled (no more pages), 'GO ON'
    otherwise.

    NOTE(review): this redefines the `main` declared earlier in the
    file — confirm the two chunks belong in separate scripts.
    """
    # Close the cache file deterministically (get_file_offline is a
    # context manager, as used in extract_author).
    with get_file_offline(dirname, fname) as f:
        doc = html.parse(f)
    if doc.xpath('.//button[@id="gsc_bpf_next"]')[0].get("disabled"):
        sys.stdout.write('DONE')
    else:
        sys.stdout.write('GO ON')
def show_crawled_data(scholar_id): f = get_file_offline('publications_cache', scholar_id) publication = handle_file(f, scholar_id) print 'Publication object:' print publication.tostring().encode('utf8') print 'Types of the data in its fields:' print publication.inspect_fields()
def crawl_publication(scholar_id, sql_url): """ Crawls Google Scholar in order to retrieve information about a publication. """ print 'Show data for ' + scholar_id + '.' f = get_file_offline('publications_cache', scholar_id) publication = handle_file(f, scholar_id) engine = create_engine(sql_url) Session = sessionmaker(bind=engine) session = Session() add_publication_to_db(publication, session)
def extract_author(scholar_id): author = Author() pub_pages, histo_page, coauthors_page = get_author_filenames(AUTHORS_DIR, scholar_id) with get_file_offline(AUTHORS_DIR, pub_pages[0]) as f: doc = html.parse(f) no_content = doc.xpath('.//div[contains(text(), "Sorry, no content found for this URL")]') if len(no_content): print 'Author ' + scholar_id + ' not found.' return 'Done.' author.scholar_id = scholar_id rxpr = re.compile(r'view_op=view_org.*org=(?P<org>[^s+\&]+)').search norgs = [rxpr(x.get('href', '')).group('org') for x in doc.xpath('.//a') if rxpr(x.get('href', ''))] org = norgs[0] if norgs else None nname = doc.find('.//div[@id="gsc_prf_in"]') if nname is not None: # The name of the author. author.name = unicode(nname.text_content()) nemaildomain = doc.find('.//div[@id="gsc_prf_ivh"]') if nemaildomain is not None: # The domain where the author has an email. author.email_domain = nemaildomain.text_content().split(" - ")[0].split()[-1] ncitations = doc.find('.//table[@id="gsc_rsb_st"]') if ncitations is not None: # The total citations for the author. author.total_citations = ncitations.xpath('.//tr[2]/td')[1].text # The h-index for the author. author.h_index = ncitations.xpath('.//tr[3]/td')[1].text # The i10-index for the author. author.i10_index = ncitations.xpath('.//tr[4]/td')[1].text with get_file_offline(AUTHORS_DIR, histo_page) as f: doc = html.parse(f) # The citations per year for the author. 
author_citations_per_year = [] nhistogram = doc.find('.//div[@id="gsc_md_hist_b"]') if nhistogram is not None: years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')] for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'): i = int(a.get('style').split('z-index:')[1]) year = int(years[-i]) citations_per_year = AuthorCitationsPerYear() citations_per_year.year = int(years[-i]) citations_per_year.citations = int(a.xpath('./span[@class="gsc_g_al"]')[0].text) author_citations_per_year.append(citations_per_year) author.citations_per_year = author_citations_per_year with get_file_offline(AUTHORS_DIR, coauthors_page) as f: doc = html.parse(f) # The co-authors of the author. author_coauthors = [] for a in doc.xpath('.//h3[@class="gsc_1usr_name"]//a'): co_scholar_id = a.get('href').split('user='******'&hl')[0] coauthor = Author() coauthor.scholar_id = co_scholar_id author_coauthors.append(coauthor) author.coauthors = author_coauthors # The publications. author_publications = [] for pub_page in pub_pages: with get_file_offline(AUTHORS_DIR, pub_page) as f: doc = html.parse(f) for tr in doc.xpath('.//tr[@class="gsc_a_tr"]'): a = tr.find('.//td[@class="gsc_a_t"]//a') # NOTE: When there are no publications, there is a single tr. # <tr class="gsc_a_tr"><td class="gsc_a_e" colspan="3">There are no articles in this profile.</td></tr> if a is None: continue purl = a.get('href') # The ID of the publication in Google Scholar. pub_scholar_id = purl.split('citation_for_view=')[1] # Retrieve the publication with that ID (if any). publication = Publication() publication.scholar_id = pub_scholar_id # The title of the publication. publication.title = unicode(a.text_content()) pub_nyear = tr.find('.//td[@class="gsc_a_y"]//span') if pub_nyear is not None: year_of_publication = pub_nyear.text_content().strip() if year_of_publication: # The year of the publication. 
publication.year_of_publication = int(year_of_publication) else: publication.year_of_publication = None pub_ncitations = tr.find('.//a[@class="gsc_a_ac"]') if pub_ncitations is not None: total_citations = pub_ncitations.text_content().strip() if total_citations: # The total citations for the publication. publication.total_citations = int(total_citations) else: publication.total_citations = None author_publications.append(publication) author.publications = author_publications return author, org