# sample = decode(parse_file)

# Citation file IDs (without extension) still to be processed.
Todo_list = [3]

# print(len(citation))
# dict_data = writeDictToCSV(current_path, csv_columns, citation)
# print(dict_data)

# Configure a Google Scholar querier that exports citations as BibTeX.
querier = scholar.ScholarQuerier()
settings = scholar.ScholarSettings()
query = scholar.SearchScholarQuery()
settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
querier.apply_settings(settings)

for citation in Todo_list:
    # decode() is defined elsewhere in this project; it is expected to
    # return an iterable of search strings parsed from '<id>.txt'.
    cur_citation = decode(str(citation) + '.txt')
    citation_file_name = str(citation) + '.bib'
    # os.chdir("C:\\Users\\JC\\Desktop\\CEGA-txt\\Individual Source Text Filesa\\allparsed")
    # Context manager guarantees the .bib file is flushed and closed even
    # on error (the original leaked the handle).
    with codecs.open(citation_file_name, 'w', 'utf-8') as citation_file:
        for c in cur_citation:
            # Randomized delay so we do not hammer Google Scholar.
            time.sleep(random.random())
            # print(c)
            query.set_words(c)
            querier.send_query(query)
            # Fetch the export once; the original called
            # scholar.citation_export() twice per query (print + write).
            exported = scholar.citation_export(querier)
            print(exported)
            citation_file.write(exported)
            citation_file.write('\n')
            # writer.writerow(scholar.csv(querier, header=False, sep=','))
# os.chdir("C:\\Users\\JC\\Desktop\\CEGA-txt\\Individual Source Text Filesa")
print('finish writing to BibTeX file!')
def main():
    """Command-line driver: parse options, build the Scholar query, run it,
    and print the results in the requested output format.

    Returns a process exit code (0 on success, 1 on usage errors).
    """
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    formatter = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=formatter)

    # --- query arguments --------------------------------------------------
    query_group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    query_group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                           help='Author name(s)')
    query_group.add_option('-A', '--all', metavar='WORDS', default=None,
                           dest='allw',
                           help='Results must contain all of these words')
    query_group.add_option('-s', '--some', metavar='WORDS', default=None,
                           help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    query_group.add_option('-n', '--none', metavar='WORDS', default=None,
                           help='Results must contain none of these words. See -s|--some re. formatting')
    query_group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                           help='Results must contain exact phrase')
    query_group.add_option('-t', '--title-only', action='store_true', default=False,
                           help='Search title only')
    query_group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                           help='Results must have appeared in this publication')
    query_group.add_option('--after', metavar='YEAR', default=None,
                           help='Results must have appeared in or after given year')
    query_group.add_option('--before', metavar='YEAR', default=None,
                           help='Results must have appeared in or before given year')
    query_group.add_option('--no-patents', action='store_true', default=False,
                           help='Do not include patents in results')
    query_group.add_option('--no-citations', action='store_true', default=False,
                           help='Do not include citations in results')
    query_group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                           help='Do not search, just use articles in given cluster ID')
    query_group.add_option('-c', '--count', type='int', default=None,
                           help='Maximum number of results')
    parser.add_option_group(query_group)

    # --- output format ----------------------------------------------------
    output_group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    output_group.add_option('--txt', action='store_true',
                            help='Print article data in text format (default)')
    output_group.add_option('--txt-globals', action='store_true',
                            help='Like --txt, but first print global results too')
    output_group.add_option('--csv', action='store_true',
                            help='Print article data in CSV form (separator is "|")')
    output_group.add_option('--csv-header', action='store_true',
                            help='Like --csv, but print header with column names')
    output_group.add_option('--citation', metavar='FORMAT', default=None,
                            help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(output_group)

    # --- miscellaneous ----------------------------------------------------
    misc_group = optparse.OptionGroup(parser, 'Miscellaneous')
    misc_group.add_option('--cookie-file', metavar='FILE', default=None,
                          help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    misc_group.add_option('-d', '--debug', action='count', default=0,
                          help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    misc_group.add_option('-v', '--version', action='store_true', default=False,
                          help='Show version information')
    parser.add_option_group(misc_group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = options.debug
        sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if (options.author or options.allw or options.some or options.none
                or options.phrase or options.title_only or options.pub
                or options.after or options.before):
            print('Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    # Map the two-letter --citation argument onto its ScholarSettings
    # constant; anything else (other than the None default) is rejected.
    citation_formats = {
        'bt': sc.ScholarSettings.CITFORM_BIBTEX,
        'en': sc.ScholarSettings.CITFORM_ENDNOTE,
        'rm': sc.ScholarSettings.CITFORM_REFMAN,
        'rw': sc.ScholarSettings.CITFORM_REFWORKS,
    }
    if options.citation is not None:
        if options.citation not in citation_formats:
            print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
            return 1
        settings.set_citation_format(citation_formats[options.citation])

    querier.apply_settings(settings)

    if options.cluster_id:
        query = sc.ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = sc.SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        sc.csv(querier)
    elif options.csv_header:
        sc.csv(querier, header=True)
    elif options.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
def _bibtex_field(bibtex_str, field):
    """Return the contents of ``field={...}`` in *bibtex_str*, or None if
    the field marker is absent.

    Mirrors the original inline extraction exactly: first occurrence of
    ``field={`` sliced up to the next ``}`` (so e.g. 'title={' can also
    match inside 'booktitle={', as the original code did).
    """
    marker = field + '={'
    start = bibtex_str.find(marker)
    if start < 0:
        return None
    end = bibtex_str.find('}', start)
    return bibtex_str[start + len(marker):end]


def add_title_using_gscholar(
    input_data_file_name,
    output_data_file_name,
    input_field_delimiter=',',
    output_field_delimiter=',',
):
    """Validate/enrich bibliographic CSV records via Google Scholar.

    For every record in *input_data_file_name*, a Scholar query is built
    from the record's author, year, title and journal fields.  When the
    query yields a BibTeX export, its fields overwrite the record's, and
    the enriched record is appended to *output_data_file_name* (which is
    created lazily, on the first successful match).

    Fixes over the original: Python 2 ``print expr`` statements replaced
    with ``print()`` calls (the rest of this file already uses them);
    csv files opened in text mode with ``newline=''`` instead of 'rb'/'w'
    (csv on Python 3 requires this); ``.values()[0]`` replaced with
    ``next(iter(...))`` (dict views are not subscriptable on Python 3);
    the output handle is guarded so no-match runs no longer crash with an
    unbound ``fw`` at the final close.
    """
    found_match_count = 0
    nofound_match_count = 0

    # Create csv reader for input records.
    timestamp.timestamp("Reading input records from '{0}'.".format(input_data_file_name))
    fr = open(input_data_file_name, 'r', newline='')
    input_data = csv.DictReader(fr, delimiter=input_field_delimiter)

    # fieldnames/keys of original input data (dictionary)
    original_data_fieldnames = input_data.fieldnames

    # Find corresponding column position for each expected header.
    # fuzzymatch() is defined elsewhere in this project; each *_pos is
    # used below as a one-entry mapping whose value is the real header.
    # publisher/volume/issue/page positions are computed but unused here;
    # kept in case fuzzymatch has side effects -- TODO confirm and prune.
    author_pos = fuzzymatch(original_data_fieldnames, 'author')
    year_pos = fuzzymatch(original_data_fieldnames, 'year')
    title_pos = fuzzymatch(original_data_fieldnames, 'title')
    journal_pos = fuzzymatch(original_data_fieldnames, 'journal')
    publisher_pos = fuzzymatch(original_data_fieldnames, 'publisher')
    volume_pos = fuzzymatch(original_data_fieldnames, 'volume')
    issue_pos = fuzzymatch(original_data_fieldnames, 'issue')
    page_pos = fuzzymatch(original_data_fieldnames, 'page')

    # Output handles are created lazily on the first successful match;
    # initialize to None so the final close is safe when nothing matched.
    fw = None
    output_data = None

    # Count total data records.
    record_num = 0
    for original_record in input_data:
        record_num += 1
        print(timestamp.timestamp("Reading input record '{0:05d}'.".format(record_num)))

        # Extract values of the fields to be validated.
        original_record_title = original_record[next(iter(title_pos.values()))]
        original_record_author = original_record[next(iter(author_pos.values()))]
        original_record_year = original_record[next(iter(year_pos.values()))]
        original_record_journal = original_record[next(iter(journal_pos.values()))]

        # Normalize known "missing data" placeholder values to None.
        if original_record_author == "[no agent data]":
            original_record_author = None
        if original_record_year == "0" or original_record_year == "":
            original_record_year = None
        if original_record_title == "no article title available":
            original_record_title = None
        if original_record_journal == "":
            original_record_journal = None

        output_record = original_record
        gscholar_match_result = None

        # Try a match search on the combination of author, publication
        # year, title (if existing), and journal name.
        timestamp.timestamp("Trying Google Scholar match search for original record: '{0}'.".format(original_record))
        querier1 = scholar.ScholarQuerier()
        query1 = scholar.SearchScholarQuery()
        query1.set_num_page_results(1)
        query1.set_author(original_record_author)
        query1.set_pub(original_record_journal)
        # NOTE(review): set_scope() is called elsewhere in this file with
        # a boolean (title-only search); passing the title string here
        # looks suspicious -- confirm against scholar.SearchScholarQuery.
        query1.set_scope(original_record_title)
        if original_record_year is not None:
            # Widen the publication year to [year-10, year+10] to
            # tolerate inaccurate year data.
            query1.set_timeframe(str(int(original_record_year) - 10),
                                 str(int(original_record_year) + 10))
        else:
            query1.set_timeframe(None, None)
        settings1 = scholar.ScholarSettings()
        settings1.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
        querier1.apply_settings(settings1)
        querier1.send_query(query1)
        gscholar_record = scholar.citation_export(querier1)

        if len(gscholar_record) < 1:
            timestamp.timestamp('Google Scholar match FAILED!')
            nofound_match_count += 1
        else:
            timestamp.timestamp('Google Scholar match was SUCESSFUL!')
            gscholar_match_result = True
            found_match_count += 1
            gscholar_record_str = gscholar_record[0]

            # The BibTeX entry type sits between '@' and the next '{'.
            at_sign = gscholar_record_str.find('@')
            if at_sign > -1:
                brace = gscholar_record_str.find('{', at_sign)
                output_record['type'] = gscholar_record_str[at_sign + 1:brace]

            # Overwrite each record field the BibTeX export provides.
            for field in ('title', 'author', 'journal', 'volume',
                          'pages', 'year', 'publisher'):
                value = _bibtex_field(gscholar_record_str, field)
                if value is not None:
                    output_record[field] = value

            # Open the file for storing output data if not already open.
            if output_data is None:
                extra_fieldnames = ['type', 'title', 'author', 'journal',
                                    'volume', 'pages', 'year', 'publisher']
                output_data_fieldnames = input_data.fieldnames + extra_fieldnames
                fw = open(output_data_file_name, 'w', newline='')
                output_data = csv.DictWriter(fw, output_data_fieldnames,
                                             delimiter=output_field_delimiter)
                output_data.writeheader()
            output_data.writerow(output_record)

    print(timestamp.timestamp("Summary: {0} matches found and {1} matches not found to '{2}'.".format(found_match_count, nofound_match_count, output_data_file_name)))
    fr.close()
    if fw is not None:
        fw.close()