示例#1
0
def get_paper_data(querier, paper):
    """Query Google Scholar for one paper and return the top article.

    Args:
        querier: Object used to run the query; it must provide
            ``send_query(query)`` and expose the results as ``articles``.
        paper: Either a title string, or a dict that may carry ``'title'``
            and/or ``'cluster_id'`` entries.  A truthy ``cluster_id`` takes
            precedence over the title.

    Returns:
        The first element of ``querier.articles``.

    Raises:
        TypeError: If ``paper`` is neither a dict nor a str.
        IndexError: If the query produced no articles.
    """
    # A plain title string carries no cluster id; bind the name up front so
    # the check below never reads an unassigned local.
    cluster_id = None
    if isinstance(paper, dict):
        title = paper.get('title')
        cluster_id = paper.get('cluster_id')
    elif isinstance(paper, str):
        title = paper
    else:
        # Raising a bare string is itself a TypeError at runtime; raise a
        # real exception so the intended message reaches the caller.
        raise TypeError(
            "Input arg paper is of an invalid format %s" % repr(paper))

    if cluster_id:
        print('Query by cluster_id')
        query = scholar.ClusterScholarQuery(cluster=cluster_id)
    else:
        print('Query by title "%s"' % title)
        query = scholar.SearchScholarQuery()
        query.set_phrase(title)

    # Ask for a single result per page: per the original author, this
    # reduces the chance of being blocked by Google.
    query.set_num_page_results(1)
    querier.send_query(query)
    scholar.txt(querier, with_globals=True)

    articles = querier.articles
    # Pause for one second after every query (presumably to throttle
    # requests -- confirm with callers before removing).
    time.sleep(1)
    return articles[0]  # Only the top result is returned.
    def parse(self, response):
        """Print each table row's name, links and tags, then query Scholar.

        The first and last rows of the selected table body are skipped; for
        every other row the extracted name is used as the search words of a
        new Scholar query, whose querier is handed to ``scholar.txt``.

        Args:
            response: Object exposing ``xpath()`` (presumably a Scrapy
                response -- confirm against the enclosing spider).

        Raises:
            IndexError: If a processed row has fewer than two text nodes
                under ``td//div``.
        """
        table = response.xpath('//*[@class="post-content"]//table//tbody')
        # Use a relative path ('.//tr'): an XPath that starts with '//' is
        # evaluated against the whole document, not the selected table body.
        rows = table.xpath('.//tr')
        last_index = len(rows) - 1

        for i, row in enumerate(rows):
            if i == 0 or i == last_index:
                print('skipped: unusable html')
                continue

            print(i)
            # The row name is the second text node found under td//div.
            name = row.xpath('td//div//text()')[1].extract()
            url = row.xpath('td//div//a/@href').extract()
            tags = row.xpath('td//ul//li//div//text()').extract()
            print(name)
            print(url)
            print(tags)

            # A new querier with newly constructed settings is built per row.
            querier = scholar.ScholarQuerier()
            querier.apply_settings(scholar.ScholarSettings())
            query = scholar.SearchScholarQuery()
            query.set_words(name)

            querier.send_query(query)
            scholar.txt(querier, with_globals=1)
def get_paper_data(querier, paper):
    """Run a single-result Google Scholar lookup for ``paper``.

    Args:
        querier: Object used to run the query; it must provide
            ``send_query(query)`` and expose the results as ``articles``.
        paper: A title string, or a dict that may contain ``'title'`` and/or
            ``'cluster_id'``.  When a truthy ``cluster_id`` is present the
            lookup is done by cluster id, otherwise by exact title phrase.

    Returns:
        The first element of ``querier.articles``.

    Raises:
        TypeError: If ``paper`` is neither a dict nor a str.
        IndexError: If the query produced no articles.
    """
    # Default for the title-only case, so the branch below never reads an
    # unassigned local variable.
    cluster_id = None
    if isinstance(paper, dict):
        title = paper.get('title')
        cluster_id = paper.get('cluster_id')
    elif isinstance(paper, str):
        title = paper
    else:
        # A string is not a valid exception object; raise a proper
        # TypeError that carries the intended message.
        raise TypeError(
            "Input arg paper is of an invalid format %s" % repr(paper))

    if cluster_id:
        print('Query by cluster_id')
        query = scholar.ClusterScholarQuery(cluster=cluster_id)
    else:
        print('Query by title "%s"' % title)
        query = scholar.SearchScholarQuery()
        query.set_phrase(title)

    # Limit the page to one result: per the original author, this lowers
    # the chance of being blocked by Google.
    query.set_num_page_results(1)
    querier.send_query(query)
    scholar.txt(querier, with_globals=True)

    articles = querier.articles
    # One-second pause after each query (presumably request throttling --
    # confirm before changing).
    time.sleep(1)
    return articles[0]  # Only the top result is returned.
示例#4
0
def getPublications(author):
    """Send an author search to Google Scholar and pass it to ``scholar.txt``.

    Args:
        author: Value printed first and then given to the query's
            ``set_author``.

    Returns:
        None.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(author)
    querier = ScholarQuerier()
    querier.apply_settings(ScholarSettings())

    query = SearchScholarQuery()
    query.set_author(author)
    querier.send_query(query)
    scholar.txt(querier, with_globals=False)
示例#5
0
def query():
    """Return the citation count of the top Scholar hit for a DOI.

    Reads the ``doi`` request query parameter, runs a one-result phrase
    search for it and renders the first article's ``num_citations`` value as
    a plain-text body.  ``request``, ``response`` and ``template`` come from
    the enclosing module (presumably Bottle -- confirm).

    Returns:
        The rendered citation count; a short error message with status 400
        when ``doi`` is missing or empty, or with status 404 when the search
        returns no articles.
    """
    response.content_type = "text/plain"

    # A missing parameter used to raise KeyError and surface as HTTP 500.
    doi = request.query.get("doi")
    if not doi:
        response.status = 400
        return "Missing required query parameter: doi"

    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())

    # Named ``search`` so the local does not shadow this function's name.
    search = scholar.SearchScholarQuery()
    search.set_num_page_results(1)
    search.set_phrase(doi)

    querier.send_query(search)
    scholar.txt(querier, with_globals=False)

    # An empty result list used to raise IndexError (HTTP 500).
    articles = querier.articles
    if not articles:
        response.status = 404
        return "No Google Scholar results for the given doi"

    citation = articles[0]["num_citations"]
    return template("{{citation}}", citation=citation)
示例#6
0
def query():
    """Look up a DOI on Google Scholar and return its citation count.

    Reads the ``doi`` request query parameter, runs a one-result phrase
    search for it and renders the first article's ``num_citations`` value as
    a plain-text body.  ``request``, ``response`` and ``template`` come from
    the enclosing module (presumably Bottle -- confirm).

    Returns:
        The rendered citation count; a short error message with status 400
        when ``doi`` is missing or empty, or with status 404 when the search
        returns no articles.
    """
    response.content_type = 'text/plain'

    # A missing parameter used to raise KeyError and surface as HTTP 500.
    doi = request.query.get('doi')
    if not doi:
        response.status = 400
        return 'Missing required query parameter: doi'

    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())

    # Named ``search`` so the local does not shadow this function's name.
    search = scholar.SearchScholarQuery()
    search.set_num_page_results(1)
    search.set_phrase(doi)

    querier.send_query(search)
    scholar.txt(querier, with_globals=False)

    # An empty result list used to raise IndexError (HTTP 500).
    articles = querier.articles
    if not articles:
        response.status = 404
        return 'No Google Scholar results for the given doi'

    citation = articles[0]['num_citations']
    return template('{{citation}}', citation=citation)
示例#7
0
def get_references(doi, items_file="", citations_url="", db=False):
    """Build an HTML list of the articles that cite ``doi``.

    The citing items are read from a JSON file.  That file is produced by
    running the ``gs_spider`` Scrapy crawl against a citations URL, unless
    the caller supplies ``items_file`` and no URL.

    Args:
        doi: DOI of the cited article.  It is used to look up the citations
            URL when neither ``citations_url`` nor ``items_file`` is given,
            and is passed to ``create_html_page`` / ``upload_cited_by``.
        items_file: Path of an existing JSON items file.  It is read as-is
            when ``citations_url`` is empty.
        citations_url: URL to crawl.  When non-empty, the crawl output
            ``items.json`` is used instead of ``items_file``.
        db: When true, each resolved article and its cited-by relation are
            uploaded via ``upload_to_db`` / ``upload_cited_by``.

    Raises:
        SystemExit: If no citations URL can be obtained for ``doi``.
        subprocess.CalledProcessError: If the crawl exits with a non-zero
            status.
    """
    if citations_url != "":
        print("Scraping from link: %s" % citations_url)
    elif items_file == "":
        art = scholar.txt(doi, author="", count=10)

        if art:
            citations_url = art.attrs["url_citations"][0]
        else:
            print("Cannot get citation url for DOI: %s" % doi)
            sys.exit()

    # Crawl only when there is a URL to crawl.  Previously the crawl also
    # ran with an empty URL when the caller supplied items_file, and the
    # supplied file was deleted (if named items.json) and then ignored.
    if citations_url != "":
        # Start from a clean output file, as the original code did.
        if os.path.isfile("items.json"):
            os.remove("items.json")

        # Argument list without a shell: the URL is passed verbatim, so
        # quotes or shell metacharacters in it cannot alter the command.
        subprocess.check_call([
            "scrapy", "crawl", "gs_spider",
            "-a", "url=" + citations_url,
            "-t", "json",
            "-o", "items.json",
        ])
        items_file = "items.json"

    # Either the crawl output or the user-provided items file.
    with codecs.open(items_file, "r", "utf-8") as f:
        items = json.loads(f.read())

    entries = []
    for item in items:
        ref = Reference(item["link"])
        if ref.doi is None:
            continue

        print("Found doi: %s" % ref.doi)
        article = Article(ref.doi)
        entries.append("<li>" + article.full_citation + "</li>\n")

        if db:
            upload_to_db(article)
            # Record the relation: this doi, cited by this doi.
            upload_cited_by(doi, ref.doi)

    to_html_page = "<ol>\n" + "".join(entries) + "</ol>"
    create_html_page(doi, to_html_page)
示例#8
0
def main():
    """Command-line entry point: parse options, run one query, print results.

    Returns:
        int: 0 after a completed query or after printing the version;
        1 when help is shown because no arguments were given, when a
        cluster ID is combined with search arguments, or when the citation
        format is not one of the supported codes.
    """
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    parser = optparse.OptionParser(
        usage=usage,
        formatter=optparse.IndentedHelpFormatter(max_help_position=50,
                                                 width=100))

    # --- Options describing what to search for -------------------------
    query_group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    query_group.add_option(
        '-a', '--author', metavar='AUTHORS', default=None,
        help='Author name(s)')
    query_group.add_option(
        '-A', '--all', metavar='WORDS', default=None, dest='allw',
        help='Results must contain all of these words')
    query_group.add_option(
        '-s', '--some', metavar='WORDS', default=None,
        help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    query_group.add_option(
        '-n', '--none', metavar='WORDS', default=None,
        help='Results must contain none of these words. See -s|--some re. formatting')
    query_group.add_option(
        '-p', '--phrase', metavar='PHRASE', default=None,
        help='Results must contain exact phrase')
    query_group.add_option(
        '-t', '--title-only', action='store_true', default=False,
        help='Search title only')
    query_group.add_option(
        '-P', '--pub', metavar='PUBLICATIONS', default=None,
        help='Results must have appeared in this publication')
    query_group.add_option(
        '--after', metavar='YEAR', default=None,
        help='Results must have appeared in or after given year')
    query_group.add_option(
        '--before', metavar='YEAR', default=None,
        help='Results must have appeared in or before given year')
    query_group.add_option(
        '--no-patents', action='store_true', default=False,
        help='Do not include patents in results')
    query_group.add_option(
        '--no-citations', action='store_true', default=False,
        help='Do not include citations in results')
    query_group.add_option(
        '-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
        help='Do not search, just use articles in given cluster ID')
    query_group.add_option(
        '-c', '--count', type='int', default=None,
        help='Maximum number of results')
    parser.add_option_group(query_group)

    # --- Options describing how to print the results -------------------
    output_group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    output_group.add_option(
        '--txt', action='store_true',
        help='Print article data in text format (default)')
    output_group.add_option(
        '--txt-globals', action='store_true',
        help='Like --txt, but first print global results too')
    output_group.add_option(
        '--csv', action='store_true',
        help='Print article data in CSV form (separator is "|")')
    output_group.add_option(
        '--csv-header', action='store_true',
        help='Like --csv, but print header with column names')
    output_group.add_option(
        '--citation', metavar='FORMAT', default=None,
        help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(output_group)

    # --- Everything else -------------------------------------------------
    misc_group = optparse.OptionGroup(parser, 'Miscellaneous')
    misc_group.add_option(
        '--cookie-file', metavar='FILE', default=None,
        help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    misc_group.add_option(
        '-d', '--debug', action='count', default=0,
        help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    misc_group.add_option(
        '-v', '--version', action='store_true', default=False,
        help='Show version information')
    parser.add_option_group(misc_group)

    opts, _ = parser.parse_args()

    # Invoked without any command-line argument: show the help text.
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if opts.debug > 0:
        # Cap the requested verbosity at the 'debug' log level.
        opts.debug = min(opts.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = opts.debug
        sc.ScholarUtils.log(
            'info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if opts.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if opts.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = opts.cookie_file

    # A cluster ID query cannot be combined with search arguments.
    if opts.cluster_id is not None:
        search_args = (opts.author, opts.allw, opts.some, opts.none,
                       opts.phrase, opts.title_only, opts.pub,
                       opts.after, opts.before)
        if any(search_args):
            print('Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    # Map each citation code to the ScholarSettings constant it selects;
    # the constant is looked up only for the code actually requested.
    citation_constants = {
        'bt': 'CITFORM_BIBTEX',
        'en': 'CITFORM_ENDNOTE',
        'rm': 'CITFORM_REFMAN',
        'rw': 'CITFORM_REFWORKS',
    }
    if opts.citation is not None:
        if opts.citation not in citation_constants:
            print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
            return 1
        settings.set_citation_format(
            getattr(sc.ScholarSettings, citation_constants[opts.citation]))

    querier.apply_settings(settings)

    if opts.cluster_id:
        query = sc.ClusterScholarQuery(cluster=opts.cluster_id)
    else:
        query = sc.SearchScholarQuery()
        # Text filters: each setter is invoked only when its option is set.
        text_filters = (
            (opts.author, 'set_author'),
            (opts.allw, 'set_words'),
            (opts.some, 'set_words_some'),
            (opts.none, 'set_words_none'),
            (opts.phrase, 'set_phrase'),
        )
        for value, setter_name in text_filters:
            if value:
                getattr(query, setter_name)(value)
        if opts.title_only:
            query.set_scope(True)
        if opts.pub:
            query.set_pub(opts.pub)
        if opts.after or opts.before:
            query.set_timeframe(opts.after, opts.before)
        if opts.no_patents:
            query.set_include_patents(False)
        if opts.no_citations:
            query.set_include_citations(False)

    if opts.count is not None:
        # Never ask for more than the configured per-page maximum.
        opts.count = min(opts.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(opts.count)

    querier.send_query(query)

    # Output selection, in priority order: CSV, CSV with header, citation
    # export, and plain text as the fallback.
    if opts.csv:
        sc.csv(querier)
    elif opts.csv_header:
        sc.csv(querier, header=True)
    elif opts.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=opts.txt_globals)

    if opts.cookie_file:
        querier.save_cookies()

    return 0