import re
import sys
import urlparse

from lxml import html
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Project helpers (get_file_offline, handle_file, get_author_filenames,
# add_publication_to_db, AUTHORS_DIR and the Author, AuthorCitationsPerYear
# and Publication models) are assumed to be defined elsewhere in the project.


def main(dirname, fname):
    f = get_file_offline(dirname, fname)
    doc = html.parse(f)
    canonical_url = doc.xpath('.//link[@rel="canonical"]')[0].get("href")
    parsed = urlparse.urlsplit(canonical_url)
    query = urlparse.parse_qs(parsed.query)
    print query['user'][0]
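

# A minimal sketch of the canonical-URL handling above, run on a hand-written
# example URL instead of a cached Scholar page; parse_qs maps each query
# parameter to a list of values:
def _example_parse_canonical_url():
    url = 'https://scholar.google.com/citations?user=A1B2C3&hl=en'
    query = urlparse.parse_qs(urlparse.urlsplit(url).query)
    print query['user'][0]  # -> A1B2C3

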
def main(dirname, fname):
    f = get_file_offline(dirname, fname)
    doc = html.parse(f)

    if doc.xpath('.//button[@id="gsc_bpf_next"]')[0].get("disabled"):
        sys.stdout.write('DONE')
    else:
        sys.stdout.write('GO ON')
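

# A self-contained check of the "next" button logic above, parsing an inline
# HTML snippet instead of a cached file; the markup mirrors the pagination
# button on a Scholar profile page:
def _example_next_button_disabled():
    snippet = '<div><button id="gsc_bpf_next" disabled="disabled"></button></div>'
    doc = html.fromstring(snippet)
    return bool(doc.xpath('.//button[@id="gsc_bpf_next"]')[0].get('disabled'))

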
def show_crawled_data(scholar_id):
    f = get_file_offline('publications_cache', scholar_id)

    publication = handle_file(f, scholar_id)
    print 'Publication object:'
    print publication.tostring().encode('utf8')
    print 'Types of the data in its fields:'
    print publication.inspect_fields()
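

# Example call (hypothetical Scholar publication ID; the cached HTML must
# already exist under publications_cache for this to run offline):
#
#   show_crawled_data('A1B2C3D4E5F6:qjMakFHDy7sC')

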
def crawl_publication(scholar_id, sql_url):
    """
    Crawls Google Scholar in order to retrieve information about a publication.
    """

    print 'Crawling publication ' + scholar_id + '.'

    f = get_file_offline('publications_cache', scholar_id)

    publication = handle_file(f, scholar_id)

    engine = create_engine(sql_url)
    Session = sessionmaker(bind=engine)

    session = Session()
    add_publication_to_db(publication, session)
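

# add_publication_to_db is defined elsewhere in the project; a minimal sketch
# of what it is assumed to do with the session (illustrative only, and valid
# only if Publication is a SQLAlchemy-mapped class):
def _sketch_add_publication_to_db(publication, session):
    session.add(publication)  # stage the object in the unit of work
    session.commit()          # write it to the database
    session.close()           # release the connection

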
def extract_author(scholar_id):
    author = Author()

    pub_pages, histo_page, coauthors_page = get_author_filenames(AUTHORS_DIR, scholar_id)


    with get_file_offline(AUTHORS_DIR, pub_pages[0]) as f:
        doc = html.parse(f)

    no_content = doc.xpath('.//div[contains(text(), "Sorry, no content found for this URL")]')
    if len(no_content):
        print 'Author ' + scholar_id + ' not found.'
        # Keep the same two-value shape as the normal return below.
        return None, None

    author.scholar_id = scholar_id

    # Match links of the form "...view_op=view_org...org=<id>"; the org id
    # runs up to the next "&" in the URL.
    rxpr = re.compile(r'view_op=view_org.*org=(?P<org>[^&\s]+)').search
    norgs = [m.group('org') for m in
             (rxpr(x.get('href', '')) for x in doc.xpath('.//a')) if m]

    org = norgs[0] if norgs else None

    nname = doc.find('.//div[@id="gsc_prf_in"]')
    if nname is not None:
        # The name of the author.
        author.name = unicode(nname.text_content())

    nemaildomain = doc.find('.//div[@id="gsc_prf_ivh"]')
    if nemaildomain is not None:
        # The text looks like "Verified email at example.edu - Homepage";
        # the last word before " - " is the author's email domain.
        author.email_domain = nemaildomain.text_content().split(" - ")[0].split()[-1]

    ncitations = doc.find('.//table[@id="gsc_rsb_st"]')
    if ncitations is not None:

        # The total citations for the author.
        author.total_citations = int(ncitations.xpath('.//tr[2]/td')[1].text)

        # The h-index for the author.
        author.h_index = int(ncitations.xpath('.//tr[3]/td')[1].text)

        # The i10-index for the author.
        author.i10_index = int(ncitations.xpath('.//tr[4]/td')[1].text)


    with get_file_offline(AUTHORS_DIR, histo_page) as f:
        doc = html.parse(f)

    # The citations per year for the author.
    author_citations_per_year = []
    nhistogram = doc.find('.//div[@id="gsc_md_hist_b"]')
    if nhistogram is not None:
        years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')]
        for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'):
            # The bar's z-index encodes its position counted from the right
            # edge of the histogram, so years[-i] is the bar's year.
            i = int(a.get('style').split('z-index:')[1])
            year = int(years[-i])
            citations_per_year = AuthorCitationsPerYear()
            citations_per_year.year = year
            citations_per_year.citations = int(a.xpath('./span[@class="gsc_g_al"]')[0].text)
            author_citations_per_year.append(citations_per_year)
    author.citations_per_year = author_citations_per_year


    with get_file_offline(AUTHORS_DIR, coauthors_page) as f:
        doc = html.parse(f)

    # The co-authors of the author.
    author_coauthors = []
    for a in doc.xpath('.//h3[@class="gsc_1usr_name"]//a'):
        co_scholar_id = a.get('href').split('user=')[1].split('&hl')[0]
        coauthor = Author()
        coauthor.scholar_id = co_scholar_id
        author_coauthors.append(coauthor)
    author.coauthors = author_coauthors


    # The publications.
    author_publications = []

    for pub_page in pub_pages:
        with get_file_offline(AUTHORS_DIR, pub_page) as f:
            doc = html.parse(f)

        for tr in doc.xpath('.//tr[@class="gsc_a_tr"]'):
            a = tr.find('.//td[@class="gsc_a_t"]//a')
            # NOTE: When there are no publications, there is a single tr.
            # <tr class="gsc_a_tr"><td class="gsc_a_e" colspan="3">There are no articles in this profile.</td></tr>
            if a is None:
                continue
            purl = a.get('href')

            # The ID of the publication in Google Scholar.
            pub_scholar_id = purl.split('citation_for_view=')[1]

            # Retrieve the publication with that ID (if any).
            publication = Publication()
            publication.scholar_id = pub_scholar_id

            # The title of the publication.
            publication.title = unicode(a.text_content())

            pub_nyear = tr.find('.//td[@class="gsc_a_y"]//span')
            if pub_nyear is not None:
                year_of_publication = pub_nyear.text_content().strip()
                if year_of_publication:
                    # The year of the publication.
                    publication.year_of_publication = int(year_of_publication)
                else:
                    publication.year_of_publication = None

            pub_ncitations = tr.find('.//a[@class="gsc_a_ac"]')

            if pub_ncitations is not None:
                total_citations = pub_ncitations.text_content().strip()
                if total_citations:
                    # The total citations for the publication.
                    publication.total_citations = int(total_citations)
                else:
                    publication.total_citations = None

            author_publications.append(publication)

    author.publications = author_publications

    return author, org
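

# A minimal usage sketch, assuming the paginated publication pages, the
# citation histogram page and the co-authors page for this (hypothetical)
# profile are already cached under AUTHORS_DIR:
def _example_extract_author():
    author, org = extract_author('A1B2C3D4E5F6')
    if author is None:  # profile not found in the cache
        return
    print 'Name:', author.name.encode('utf8')
    print 'h-index:', author.h_index
    print 'Publications:', len(author.publications)
    print 'Organization:', org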