def update_rule_last_run(rule_name):
    """Record the task's starting time as the rule's last run time.

    Call this after a rule has finished running.  It is a no-op when the
    task runs in a mode that must not modify the database (explicit
    record ids, --no-upload or --no-tickets).

    @param rule_name: name of the bibcheck rule that was just executed
    """
    skip_db_update = (task_has_option('record_ids')
                      or task_get_option('no_upload', False)
                      or task_get_option('no_tickets', False))
    if skip_db_update:
        # We don't want to update the database in this case
        return
    started_at = task_get_task_param('task_starting_time')
    # Upsert: try the UPDATE first; if the rule row does not exist yet,
    # fall back to INSERT.
    affected = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                       (started_at, rule_name,))
    if not affected:
        # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, started_at))
def iterate_over_new(list, fmt):
    """Iterate over the given list of record IDs and (re)format each one.

    For every record the output of format_record() is zlib-compressed and
    stored in the bibfmt table (UPDATE if a row for this record/format pair
    already exists, INSERT otherwise).

    @param list: record IDs to format (NOTE: shadows the builtin `list`)
    @param fmt: output format code to generate
    @return: tuple (total records processed, seconds spent formatting,
             seconds spent uploading — always 0 here, kept for the caller's
             expected tuple shape)
    """
    global total_rec  # NOTE(review): declared but neither read nor written here
    formatted_records = ''      # (string-)List of formatted record of an iteration
                                # NOTE(review): unused in this variant
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call; never incremented here
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted
    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]  # elapsed real time, used to measure formatting cost
        # Timestamp stored alongside the formatted blob (overwrites the
        # task starting time set above).
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        # Upsert into bibfmt: probe for an existing row first.
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        # Report progress every 100 records and give the scheduler a
        # chance to pause/stop the task.
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    # Final report, unless the last loop iteration already emitted it
    # (i.e. tot is a multiple of 100).
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def _dbdump_run_task_core():
    """Core of the DB dumper task: dump the database, then prune old dumps.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.

    @return: True on completion
    """
    # Read the task parameters:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    dump_dir = task_get_option('output', CFG_LOGDIR)
    keep_count = task_get_option('number', 5)
    name_prefix = CFG_DATABASE_NAME + '-dbdump-'
    # Derive a filesystem-safe timestamp suffix from the task start time.
    name_suffix = task_get_task_param('task_starting_time').replace(' ', '_') + '.sql'
    dump_name = name_prefix + name_suffix
    write_message("Reading parameters ended")
    # Dump the database content:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(dump_dir, dump_name)
    write_message("Database dump ended")
    # Prune old dump files, keeping only the newest keep_count of them:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(dump_dir, name_prefix, keep_count)
    write_message("Pruning old dump files ended")
    # All done:
    task_update_progress("Done.")
    return True
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS, old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder for job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:  # NOTE(review): `file` shadows the builtin
        file_fullpath = os.path.join(new_job_dir, file)
        # Only files carrying the bibencode job signature are treated as jobs.
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                # Single job: translate the description into CLI args and
                # submit it as a bibsched task.
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## We need the job description for the batch engine
                ## So we need to use the new path inside the oldjobs dir
                # NOTE(review): process_batch() is handed the old_job_dir
                # path although the move below has not happened yet at this
                # point — confirm process_batch reads the file only later.
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
def _dbdump_run_task_core(): """ Run DB dumper core stuff. Note: do not use task_can_sleep() stuff here because we don't want other tasks to interrupt us while we are dumping the DB content. """ # read params: task_update_progress("Reading parameters") write_message("Reading parameters started") output_dir = task_get_option('output', CFG_LOGDIR) output_num = task_get_option('number', 5) output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-' output_fil_suffix = task_get_task_param('task_starting_time').replace( ' ', '_') + '.sql.gz' output_fil = output_fil_prefix + output_fil_suffix write_message("Reading parameters ended") # make dump: task_update_progress("Dumping database") write_message("Database dump started") _dump_database(output_dir, output_fil) write_message("Database dump ended") # prune old dump files: task_update_progress("Pruning old dump files") write_message("Pruning old dump files started") _delete_old_dumps(output_dir, output_fil_prefix, output_num) write_message("Pruning old dump files ended") # we are done: task_update_progress("Done.") return True
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format (NOTE: shadows the
        builtin `list`; name kept for caller compatibility)
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format,
        time taken to insert — always 0 in this variant, kept so the
        caller's expected tuple shape is unchanged)
    """
    # Dead locals removed: `formatted_records` was never used and
    # `global total_rec` was neither read nor written in this function.
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call (never incremented here)
    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]  # elapsed real time before formatting
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        # Single-statement upsert: REPLACE deletes any existing
        # (id_bibrec, format) row and inserts the new one.
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        # Report progress every 100 records and let the scheduler
        # pause/stop the task if requested.
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    # Final report unless the last loop iteration already emitted it.
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def _update_job_lastrun_time(jobname):
    """Set the lastrun time of JOBNAME in the expJOB table to the time
    this task was started.

    @param jobname: name of the export job to stamp
    """
    started_at = task_get_task_param('task_starting_time')
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s",
            (started_at, jobname,))
def get_next_starting_date(rule):
    """Calculate the date the next bibcheck run should consider as initial.

    If no filter has been specified then the time that is set is the time
    the task was started. Otherwise, it is set to the earliest date among
    last time webcoll was run and the last bibindex last_update as the
    last_run to prevent records that have yet to be categorized from being
    perpetually ignored.
    """
    def parse_ts(t):
        # Timestamps are stored as "YYYY-MM-DD HH:MM:SS" strings.
        return datetime.strptime(t, "%Y-%m-%d %H:%M:%S")

    # Upper limit: the moment this task was started.
    task_starting_time = parse_ts(task_get_task_param('task_starting_time'))

    # Without any truthy filter_* option, the task starting time is enough.
    if not any(val for key, val in rule.iteritems()
               if key.startswith("filter_")):
        return task_starting_time

    # Lower limit: bibindex / webcoll freshness timestamps.
    min_last_updated = run_sql("select min(last_updated) from idxINDEX")[0][0]
    cache_last_updated = parse_ts(get_cache_last_updated_timestamp())
    if not min_last_updated or not cache_last_updated:
        # Some tables have never been initialized. Let's return the Epoch
        return datetime(1970, 1, 1)
    return min(min_last_updated, task_starting_time, cache_last_updated)
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function
    should be called after a rule has been ran.

    No-op when the task was started with explicit record ids,
    --no-upload or --no-tickets, since those runs must not touch
    the rule bookkeeping.
    """
    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return   # We don't want to update the database in this case
    # Upsert: try the UPDATE first; a zero row count means the rule has
    # no row yet, so INSERT it.
    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;", (
        task_get_task_param('task_starting_time'),
        rule_name,
    ))
    if not updated:  # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, task_get_task_param('task_starting_time')))
def check_slave_is_in_consistent_state(connection=None):
    """
    Check if the slave is already aware that dbdump task is running.
    dbdump being a monotask, guarantee that no other task is currently
    running and it's hence safe to detach the slave and start the
    actual dump.

    Polls the slave up to 10 times, 3 seconds apart (~30s), comparing
    the slave's view of this task's schTASK status with the master's.

    @param connection: DB connection to the slave; opened on demand if None
    @raise StandardError: when the slave never catches up within the timeout
    """
    if connection is None:
        connection = get_connection_for_dump_on_slave()
    i = 0
    ## Let's take the current status of dbdump (e.g. RUNNING, ABOUT TO STOP, etc.)...
    current_status = run_sql("SELECT status FROM schTASK WHERE id=%s",
                             (task_get_task_param('task_id'), ))[0][0]
    while True:
        if i == 10:
            ## Timeout!!
            raise StandardError("The slave seems not to pick up with the master")
        ## ...and let's see if it matches with what the slave sees.
        if run_sql("SELECT status FROM schTASK WHERE id=%s AND status=%s",
                   (task_get_task_param('task_id'), current_status),
                   connection=connection):
            ## Bingo!
            return
        time.sleep(3)
        i += 1
def _generate_default_xml_out():
    """Generate the default output xml file path, corresponding to this
    refextract task id. This will be called if a user specified xml out
    file has not been provided.

    @return: (string) output xml file path"""
    results_dir = os.path.join(CFG_TMPDIR, "refextract")
    # Write the changes to a temporary file, named after the task id so
    # concurrent tasks do not clash.
    filename = "refextract_task_%d.xml" % task_get_task_param('task_id', 0)
    abs_path = os.path.join(results_dir, filename)
    ## Make the folder if needed.  EAFP instead of isdir()+mkdir():
    ## avoids the race where another process creates the directory
    ## between the check and the mkdir call.
    try:
        os.mkdir(results_dir)
    except OSError:
        # Re-raise only if the directory genuinely does not exist
        # (i.e. the failure was not a harmless "already exists").
        if not os.path.isdir(results_dir):
            raise
    return abs_path
def parse_option(key, value, opts, args):
    """ Elaborate task submission parameter.

    @param key: the CLI option switch being processed (e.g. '-i')
    @param value: the option's argument string
    @param opts: full option list (unused here)
    @param args: leftover positional arguments; any presence is an error
    @return: True when the option was accepted
    @raise StandardError: on any standalone (non-option) argument
    """
    if args:
        # There should be no standalone arguments
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])
    if key in ('-i', '--id'):
        # Accumulate record ids across repeated -i options in a shared set
        # stored as a task parameter.
        recids = task_get_task_param('recids')
        if not recids:
            recids = set()
            task_set_task_param('recids', recids)
        recids.update(split_cli_ids_arg(value))
    elif key in ('-a', '--all'):
        task_set_task_param('all', True)
    return True
def iterate_over_new(list, fmt):
    """Iterate over the given list of record IDs and (re)format each one,
    storing the zlib-compressed output in the bibfmt table (UPDATE when a
    row for the record/format pair exists, INSERT otherwise).

    @param list: record IDs to format (NOTE: shadows the builtin `list`)
    @param fmt: output format code
    @return: tuple (total records, formatting seconds, upload seconds —
             the last is always 0 here, kept for the caller's tuple shape)
    """
    global total_rec  # NOTE(review): declared but neither read nor written here
    formatted_records = ''  # (string-)List of formatted record of an iteration
                            # NOTE(review): unused in this variant
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call; never incremented here
    start_date = task_get_task_param(
        'task_starting_time')  # Time at which the record was formatted
    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]  # elapsed real time, to measure formatting cost
        # Timestamp stored with the formatted blob (overwrites the value
        # set above from the task starting time).
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(
            format_record(recID, fmt, on_the_fly=True))
        # Upsert into bibfmt: probe for an existing row first.
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s',
                   (recID, fmt)):
            run_sql(
                'UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s',
                (start_date, formatted_record, recID, fmt))
        else:
            run_sql(
                'INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        # Progress report every 100 records; also lets the scheduler
        # pause/stop the task.
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    # Final report unless the last loop iteration already emitted it.
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids set.

    Builds a MARCXML collection file of extracted keywords in a temporary
    location, then either uploads it (CFG_DB_SAVE_KW) or discards it.

    @return: 1 on completion
    """
    automated_daemon_mode_p = True
    recids = bibtask.task_get_option("recids")
    collections = bibtask.task_get_option("collections")
    taxonomy = bibtask.task_get_option("taxonomy")
    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param("task_starting_time"))
        return 1

    # We will write to a temporary file as we go, because we might be
    # processing big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, "w")
    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec["recIDs"])

    rec_added = False
    for onto_rec in onto_recids:
        # can_stop_too=False: do not abort mid-file, the XML would be
        # left unterminated.
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        if onto_rec["collection"] is not None:
            bibtask.write_message(
                "INFO: Applying taxonomy %s to collection %s (%s "
                "records)" % (onto_rec["ontology"], onto_rec["collection"],
                              len(onto_rec["recIDs"])),
                stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message(
                "INFO: Applying taxonomy %s to recIDs %s. " %
                (onto_rec["ontology"],
                 ", ".join([str(recid) for recid in onto_rec["recIDs"]])),
                stream=sys.stderr, verbose=3)
        if onto_rec["recIDs"]:
            xml = _analyze_documents(onto_rec["recIDs"],
                                     onto_rec["ontology"],
                                     onto_rec["collection"])
            # Only append non-trivial output (more than a bare tag).
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write("</collection>\n")
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message(
            "WARNING: No keywords found, recids: %s" % onto_recids,
            stream=sys.stderr, verbose=0)
        # Nothing useful was produced; drop the temporary file.
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param("task_starting_time"))
    return 1
def ref_analyzer(citation_informations, dicts, updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    Runs in seven phases: phases 1-3 resolve this batch's outgoing
    references (report numbers, journals, DOIs) against existing records;
    phases 4-6 do the reverse "catchup" search (other records citing this
    batch); phase 7 prunes empty entries.

    @param citation_informations: (records_info, references_info) pair of
        per-field dictionaries keyed by recid
    @param dicts: persistent citation dictionaries, mutated in place
    @param updated_recids: recids whose entries are recomputed
    @param tags: MARC tag configuration for the catchup searches
    @param do_catchup: when False, keep existing citation entries
    @return: (citations_weight, citations, references, selfcites,
        selfrefs, authorcites)
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Periodically yield to the scheduler and report progress.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record one citer->cited edge in both directions, exactly once.
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv ids may carry an optional "[category]" suffix in
                # references; match it with a regexp search ('r').
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
           selfrefs, authorcites
def ref_analyzer(citation_informations, dicts, updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    Runs in seven phases: phases 1-3 resolve this batch's outgoing
    references (report numbers, journals, DOIs) against existing records;
    phases 4-6 do the reverse "catchup" search (other records citing this
    batch); phase 7 prunes empty entries.

    @param citation_informations: (records_info, references_info) pair of
        per-field dictionaries keyed by recid
    @param dicts: persistent citation dictionaries, mutated in place
    @param updated_recids: recids whose entries are recomputed
    @param tags: MARC tag configuration for the catchup searches
    @param do_catchup: when False, keep existing citation entries
    @return: (citations_weight, citations, references, selfcites,
        selfrefs, authorcites)
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Periodically yield to the scheduler and report progress.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record one citer->cited edge in both directions, exactly once.
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv ids may carry an optional "[category]" suffix in
                # references; match it with a regexp search ('r').
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5 - t4))
    write_message("... checking rec journals: %.2f sec" % (t6 - t5))
    write_message("... checking rec DOI: %.2f sec" % (t7 - t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7 - t1))

    return citations_weight, citations, references, selfcites, \
           selfrefs, authorcites
def task_run_core():
    """Run the harvesting task. The row argument is the oaiharvest task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.

    NOTE(review): the implementation actually returns True/False, which is
    truthy/falsy-compatible with the documented 1/0 contract.

    For every configured repository the task may run up to four
    post-harvest phases, selected by the repository's postmode string:
    'c' convert (bibconvert), 'e' extract (plotextractor),
    'f' filter (bibfilter), 'u' upload (bibupload).
    """
    reposlist = []    # repository DB rows selected for harvesting
    datelist = []     # optional [from, until] pair for date-range harvesting
    dateflag = 0      # 1 when an explicit from-until range was requested
    possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES]
    filepath_prefix = tmpHARVESTpath + "_" + str(
        task_get_task_param("task_id"))
    ### go ahead: build up the reposlist
    if task_get_option("repository") is not None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in task_get_option("repository"):
            row = get_row_from_reposname(reposname)
            if row == []:
                write_message("source name " + reposname + " is not valid")
                continue
            else:
                # Fix: reuse the row already fetched above instead of
                # issuing a second identical lookup for the same name.
                reposlist.append(row)
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()
    ### go ahead: check if user requested from-until harvesting
    if task_get_option("dates"):
        ### for each repos simply perform a from-until date harvesting...
        ### no need to update anything
        dateflag = 1
        for element in task_get_option("dates"):
            datelist.append(element)
    error_happened_p = False
    j = 0
    for repos in reposlist:
        j += 1
        task_sleep_now_if_required()
        # NOTE(review): repos[0] is a raw DB row; the column meanings below
        # (id=0, baseurl=1, prefix=2, config=5, name=6, lastrun=7,
        # frequency=8, postmode=9, setspecs=10, filter=11) are inferred from
        # usage -- confirm against the oaiHARVEST table schema.
        reponame = str(repos[0][6])
        postmode = str(repos[0][9])
        setspecs = str(repos[0][10])
        harvested_files_list = []
        if postmode in possible_postmodes:
            # Harvest phase
            harvestpath = filepath_prefix + "_" + str(j) + "_" + \
                          time.strftime("%Y%m%d%H%M%S") + "_harvested"
            if dateflag == 1:
                # Explicit date range requested: harvest between the two
                # dates and do NOT touch the repository's lastrun stamp.
                task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \
                                     (reponame, \
                                      str(datelist[0]), str(datelist[1]), j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       fro=str(datelist[0]),
                                                       until=str(datelist[1]),
                                                       setspecs=setspecs)
                if exit_code == 1:
                    write_message("source " + reponame + \
                                  " was harvested from " + str(datelist[0]) \
                                  + " to " + str(datelist[1]))
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting "
                                  "from source " + reponame +
                                  " for the dates chosen")
                    error_happened_p = True
                    continue
            elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0:
                # Never harvested before (lastrun is NULL): take everything.
                write_message("source " + reponame + \
                              " was never harvested before - harvesting whole "
                              "repository")
                task_update_progress("Harvesting %s (%i/%i)" % \
                                     (reponame, j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       setspecs=setspecs)
                if exit_code == 1:
                    update_lastrun(repos[0][0])
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting from "
                                  "source " + reponame)
                    error_happened_p = True
                    continue
            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed,
                ### i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                lastrundate = re.sub(r'\.[0-9]+$', '',
                                     str(repos[0][7]))  # remove trailing .00
                timeinsec = int(repos[0][8]) * 60 * 60  # frequency in hours
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed == 0 or proceed == -1:  # update needed!
                    write_message("source " + reponame +
                                  " is going to be updated")
                    fromdate = str(repos[0][7])
                    fromdate = fromdate.split()[0]  # get rid of time
                                                    # of the day for the moment
                    task_update_progress("Harvesting %s (%i/%i)" % \
                                         (reponame, j, \
                                          len(reposlist)))
                    exit_code, file_list = oai_harvest_get(
                                                prefix=repos[0][2],
                                                baseurl=repos[0][1],
                                                harvestpath=harvestpath,
                                                fro=fromdate,
                                                setspecs=setspecs)
                    if exit_code == 1:
                        update_lastrun(repos[0][0])
                        harvested_files_list = file_list
                    else:
                        write_message("an error occurred while harvesting "
                                      "from source " + reponame)
                        error_happened_p = True
                        continue
                else:
                    write_message("source " + reponame +
                                  " does not need updating")
                    continue
            elif dateflag != 1 and repos[0][8] == 0:
                write_message("source " + reponame + \
                              " has frequency set to 'Never' so it will not be updated")
                continue
            # Harvesting done, now convert/extract/filter/upload as requested
            if len(harvested_files_list) < 1:
                write_message("No records harvested for %s" % (reponame, ))
                continue
            active_files_list = harvested_files_list
            # Convert phase
            if 'c' in postmode:
                converted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Converting material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    converted_file = filepath_prefix + "_" + str(i) + "_" + \
                                     time.strftime("%Y%m%d%H%M%S") + "_converted"
                    converted_files_list.append(converted_file)
                    (exitcode, err_msg) = call_bibconvert(config=str(repos[0][5]),
                                                          harvestpath=active_file,
                                                          convertpath=converted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully converted")
                    else:
                        write_message("an error occurred while converting from " +
                                      reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for converted_file in converted_files_list:
                    write_message("File %s contains %i records." % \
                                  (converted_file,
                                   get_nb_records_in_file(converted_file)))
                active_files_list = converted_files_list
            if 'e' in postmode:
                # Download tarball for each harvested/converted record, then run plotextrator.
                # Update converted xml files with generated xml or add it for upload
                extracted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Extracting material harvested from %s (%i/%i)" % \
                                         (reponame, i, len(active_files_list)))
                    extracted_file = filepath_prefix + "_" + str(i) + "_" + \
                                     time.strftime("%Y%m%d%H%M%S") + "_extracted"
                    extracted_files_list.append(extracted_file)
                    (exitcode, err_msg) = call_plotextractor(active_file,
                                                             extracted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully extracted")
                    else:
                        write_message("an error occurred while extracting from " +
                                      reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for extracted_file in extracted_files_list:
                    write_message("File %s contains %i records." % \
                                  (extracted_file,
                                   get_nb_records_in_file(extracted_file)))
                active_files_list = extracted_files_list
            # Filter-phase
            if 'f' in postmode:
                # first call bibfilter:
                res = 0
                uploaded = False
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Filtering material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    res += call_bibfilter(str(repos[0][11]), active_file)
                if len(active_files_list) > 0:
                    if res == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully bibfiltered")
                    else:
                        write_message("an error occurred while bibfiltering "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
                # print stats:
                for active_file in active_files_list:
                    write_message("File %s contains %i records." % \
                                  (active_file + ".insert.xml",
                                   get_nb_records_in_file(active_file + ".insert.xml")))
                    write_message("File %s contains %i records." % \
                                  (active_file + ".correct.xml",
                                   get_nb_records_in_file(active_file + ".correct.xml")))
                    write_message("File %s contains %i records." % \
                                  (active_file + ".append.xml",
                                   get_nb_records_in_file(active_file + ".append.xml")))
                    write_message("File %s contains %i records." % \
                                  (active_file + ".holdingpen.xml",
                                   get_nb_records_in_file(active_file + ".holdingpen.xml")))
            # Upload files
            if "u" in postmode:
                if 'f' in postmode:
                    # upload filtered files: bibfilter has split the material
                    # into .insert/.correct/.append/.holdingpen XML files that
                    # each go to bibupload with the matching mode flag.
                    i = 0
                    for active_file in active_files_list:
                        task_sleep_now_if_required()
                        i += 1
                        if get_nb_records_in_file(active_file + ".insert.xml") > 0:
                            task_update_progress("Uploading new records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".insert.xml", \
                                                  ["-i"], oai_src_id = repos[0][0])
                            uploaded = True
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file + ".correct.xml") > 0:
                            task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".correct.xml", \
                                                  ["-c"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".append.xml") > 0:
                            task_update_progress("Uploading additions for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".append.xml", \
                                                  ["-a"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".holdingpen.xml") > 0:
                            task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".holdingpen.xml", \
                                                  ["-o"], oai_src_id = repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message("material harvested from source " +
                                              reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue
                else:
                    # upload files normally
                    res = 0
                    i = 0
                    uploaded = False
                    for active_file in active_files_list:
                        i += 1
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file) > 0:
                            task_update_progress("Uploading records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file,
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                    if res == 0:
                        if uploaded:
                            write_message("material harvested from source " +
                                          reponame + " was successfully uploaded")
                        else:
                            write_message("nothing to upload")
                    else:
                        write_message("an error occurred while uploading "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
        else:
            ### this should not happen
            write_message("invalid postprocess mode: " + postmode +
                          " skipping repository")
            error_happened_p = True
            continue
    if error_happened_p:
        return False
    else:
        return True
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.
    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.

    Three modes, selected by options:
      * --slave without --dump-on-slave-helper: detach the replication
        slave and schedule a helper dbdump task on it, then return.
      * --dump-on-slave-helper: we ARE the helper; verify the slave is
        detached and dump from CFG_DATABASE_SLAVE.
      * neither: plain dump of the master database.
    Returns True on success; exceptions from the dump propagate, with the
    slave reattached by the finally block when applicable.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None  # set only when a slave connection is opened
    try:
        if task_get_option('slave') and \
           not task_get_option('dump_on_slave_helper_mode'):
            # Scheduler role: prepare the slave and hand the actual dump
            # over to a freshly-submitted helper task.
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            # Forward the relevant user options to the helper task verbatim.
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number",
                                     str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output",
                                     str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params",
                                     str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables",
                                     str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave",
                                     str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump',
                                                task_get_task_param('user'),
                                                '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            # Early return: the helper does the dump; nothing to prune here.
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            # Helper role: the scheduler already detached the slave;
            # just verify and redirect the dump target to the slave host.
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE
        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            # Expand the user-supplied pattern into concrete table names.
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None
        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")
        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")
        # Distinct prefix for slave dumps so pruning only touches its own kind.
        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path, \
                      host=host,
                      port=port,
                      params=params, \
                      compress=compress, \
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        # Always bring the slave back, even if the dump raised.
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids set.

    Collects the keyword MARCXML produced for every
    (ontology, collection/recIDs) pair into one temporary file and feeds
    it to bibupload. Returns 1 when there is nothing to process, 0 when
    the bibupload command fails.
    """
    automated_daemon_mode_p = True
    recids = task_get_option('recids')
    collections = task_get_option('collections')
    taxonomy = task_get_option('taxonomy')
    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False
    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()
    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(task_get_task_param('task_starting_time'))
        return 1
    # Accumulate one MARCXML collection document in memory.
    changes = []
    changes.append('<?xml version="1.0" encoding="UTF-8"?>')
    changes.append('<collection xmlns="http://www.loc.gov/MARC21/slim">')
    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])
    for onto_rec in onto_recids:
        task_sleep_now_if_required(can_stop_too=False)
        if onto_rec['collection'] is not None:
            write_message('INFO: Applying taxonomy %s to collection %s (%s '
                          'records)' % (onto_rec['ontology'],
                                        onto_rec['collection'],
                                        len(onto_rec['recIDs'])),
                          stream=sys.stderr, verbose=3)
        else:
            write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                          (onto_rec['ontology'],
                           ', '.join([str(recid) for recid in
                                      onto_rec['recIDs']])),
                          stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            changes.append(
                _analyze_documents(onto_rec['recIDs'],
                                   onto_rec['ontology'],
                                   onto_rec['collection']))
    changes.append('</collection>')
    # Write the changes to a temporary file.
    tmp_directory = "%s/bibclassify" % CFG_TMPDIR
    filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
                                                     time.localtime())
    abs_path = os.path.join(tmp_directory, filename)
    if not os.path.isdir(tmp_directory):
        os.mkdir(tmp_directory)
    file_desc = open(abs_path, "w")
    file_desc.write('\n'.join(changes))
    file_desc.close()
    # Apply the changes.
    # NOTE(review): 'changes' always contains at least the XML header lines,
    # so this branch is always taken.
    if changes:
        cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        errcode = 0
        try:
            errcode = os.system(cmd)
        except OSError, exc:
            write_message('ERROR: Command %s failed [%s].' % (cmd, exc),
                          stream=sys.stderr, verbose=0)
        if errcode != 0:
            write_message("ERROR: %s failed, error code is %d." %
                          (cmd, errcode),
                          stream=sys.stderr, verbose=0)
            # NOTE(review): on success this version falls through and
            # implicitly returns None (a sibling implementation returns 1
            # after updating the last-run date) -- confirm intended.
            return 0
cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path) errcode = 0 try: errcode = os.system(cmd) except OSError, exc: write_message('ERROR: Command %s failed [%s].' % (cmd, exc), stream=sys.stderr, verbose=0) if errcode != 0: write_message("ERROR: %s failed, error code is %d." % (cmd, errcode), stream=sys.stderr, verbose=0) return 0 # Update the date of last run in the clsMETHOD table, but only if # we were running in an automated mode. if automated_daemon_mode_p: _update_date_of_last_run(task_get_task_param('task_starting_time')) return 1 def _analyze_documents(records, ontology, collection): """For each collection, parse the documents attached to the records in collection with the corresponding ontology.""" global _INDEX if not records: # No records could be found. write_message("WARNING: No record were found in collection %s." % collection, stream=sys.stderr, verbose=2) return False # Process records: output = []
def _task_run_core(): """Runs anayse_documents for each ontology, collection, record ids set.""" automated_daemon_mode_p = True recids = task_get_option('recids') collections = task_get_option('collections') taxonomy = task_get_option('taxonomy') if recids or collections: # We want to run some records/collection only, so we are not # in the automated daemon mode; this will be useful later. automated_daemon_mode_p = False # Check if the user specified which documents to extract keywords from. if recids: onto_recids = _get_recids_foreach_ontology(recids=recids, taxonomy=taxonomy) elif collections: onto_recids = _get_recids_foreach_ontology(collections=collections, taxonomy=taxonomy) else: onto_recids = _get_recids_foreach_ontology() if not onto_recids: # Nothing to do. if automated_daemon_mode_p: _update_date_of_last_run(task_get_task_param('task_starting_time')) return 1 changes = [] changes.append('<?xml version="1.0" encoding="UTF-8"?>') changes.append('<collection xmlns="http://www.loc.gov/MARC21/slim">') # Count the total number of records in order to update the progression. global _RECIDS_NUMBER for onto_rec in onto_recids: _RECIDS_NUMBER += len(onto_rec['recIDs']) for onto_rec in onto_recids: task_sleep_now_if_required(can_stop_too=False) if onto_rec['collection'] is not None: write_message('INFO: Applying taxonomy %s to collection %s (%s ' 'records)' % (onto_rec['ontology'], onto_rec['collection'], len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3) else: write_message('INFO: Applying taxonomy %s to recIDs %s. ' % (onto_rec['ontology'], ', '.join([str(recid) for recid in onto_rec['recIDs']])), stream=sys.stderr, verbose=3) if onto_rec['recIDs']: changes.append(_analyze_documents(onto_rec['recIDs'], onto_rec['ontology'], onto_rec['collection'])) changes.append('</collection>') # Write the changes to a temporary file. 
tmp_directory = "%s/bibclassify" % CFG_TMPDIR filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S", time.localtime()) abs_path = os.path.join(tmp_directory, filename) if not os.path.isdir(tmp_directory): os.mkdir(tmp_directory) file_desc = open(abs_path, "w") file_desc.write('\n'.join(changes)) file_desc.close() # Apply the changes. if changes: cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path) errcode = 0 try: errcode = os.system(cmd) except OSError, exc: write_message('ERROR: Command %s failed [%s].' % (cmd, exc), stream=sys.stderr, verbose=0) if errcode != 0: write_message("ERROR: %s failed, error code is %d." % (cmd, errcode), stream=sys.stderr, verbose=0) return 0
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    :param citation_informations: (records_info, references_info) pair of
        dicts keyed by value type ('report-numbers', 'journals', 'doi',
        'hdl', 'record_id', 'isbn'), each mapping recid -> extracted values.
    :param updated_recids: recids whose references were (re)extracted;
        only these get entries initialised in the returned dicts.
    :param tags: mapping of logical names to MARC tags used when searching
        other records' reference fields (phases 7-12).
    :param config: opaque config forwarded to get_recids_matching_query().
    :return: (citations, references) -- dicts of recid -> set of recids.

    Phases 1-6 resolve the OUTGOING references of the updated records;
    phases 7-12 find records elsewhere whose references point AT the
    updated records (the "catchup" direction).
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        # Housekeeping helper: yield to the scheduler every 30 records and
        # report progress every 1000.
        if done % 30 == 0:
            task_sleep_now_if_required()
        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)
        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return
        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return
        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1
        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)
            # Track unresolved references so they can be reported later.
            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)
            if len(recids) > 1:
                # Ambiguous match: warn, then keep only the first hit below.
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)
            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1
        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'
            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value
            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)
            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)
            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1
        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'
            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)
            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)
            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in references_info['hdl'].iteritems():
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1
        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'
            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)
            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)
            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in references_info['record_id'].iteritems():
        step("Record ID references", thisrecid, done,
             len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            # A direct recid reference only needs validating, not resolving.
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)), verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in references_info['isbn'].iteritems():
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1
        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'
            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)
            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)
            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1
        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv IDs may carry a category suffix like "[hep-th]";
                # match it optionally via a regexp search (m='r').
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(p=report_pattern,
                                                   f=tags['refs_report_number'],
                                                   m='r',
                                                   config=config)
            else:
                recids = get_recids_matching_query(p=reportcode,
                                                   f=tags['refs_report_number'],
                                                   config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1
        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1
        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)), verbose=9)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in records_info['hdl'].iteritems():
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1
        for hdl in hdls:
            # NOTE(review): searches tags['refs_doi'] (not a dedicated HDL
            # tag) -- presumably DOIs and HDLs share the same MARC subfield;
            # confirm against the tag configuration.
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (hdl, tags['refs_doi'], list(recids)), verbose=9)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in records_info['isbn'].iteritems():
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1
        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (isbn, tags['refs_isbn'], list(recids)), verbose=9)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    # NOTE(review): unlike the other phases, no "done fully" progress
    # message is emitted after Phase 11 -- confirm whether intentional.
    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in records_info['record_id'].iteritems():
        step("Record ID catchup", thisrecid, done,
             len(records_info['record_id']))
        done += 1
        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (record_id, tags['refs_record_id'], list(recids)),
                          verbose=9)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking ref HDL: %.2f sec" % (t5-t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6-t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7-t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8-t7))
    write_message("... checking rec journals: %.2f sec" % (t9-t8))
    write_message("... checking rec DOI: %.2f sec" % (t10-t9))
    write_message("... checking rec HDL: %.2f sec" % (t11-t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12-t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13-t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13-t1))

    return citations, references
errcode = os.system(cmd) except OSError, exc: write_message('ERROR: Command %s failed [%s].' % (cmd, exc), stream=sys.stderr, verbose=0) if errcode != 0: write_message("ERROR: %s failed, error code is %d." % (cmd, errcode), stream=sys.stderr, verbose=0) return 0 # Update the date of last run in the clsMETHOD table, but only if # we were running in an automated mode. if automated_daemon_mode_p: _update_date_of_last_run(task_get_task_param('task_starting_time')) return 1 def _analyze_documents(records, ontology, collection): """For each collection, parse the documents attached to the records in collection with the corresponding ontology.""" global _INDEX if not records: # No records could be found. write_message("WARNING: No record were found in collection %s." % collection, stream=sys.stderr, verbose=2) return False
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids set.

    Collects the (ontology, recIDs) work units, extracts keywords for each,
    accumulates the resulting MARCXML in a temporary file, and finally uploads
    it (if CFG_DB_SAVE_KW is set).  Returns 1 on completion.
    """
    # Daemon mode means: no explicit recids/collections were requested, so we
    # may record the run timestamp in clsMETHOD afterwards.
    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')
    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False
    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()
    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            # Still record the run so the scheduler does not retry the same
            # empty window forever.
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1
    # We will write to a temporary file as we go, because we might be
    # processing big collections with many docs.
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')
    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])
    rec_added = False
    for onto_rec in onto_recids:
        # can_stop_too=False: stopping mid-collection would leave a truncated
        # XML file behind.
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'],
                 ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            # len(xml) > 5: skip trivially-empty results (e.g. whitespace)
            # so we know whether anything real was produced.
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True
    fo.write('</collection>\n')
    fo.close()
    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr, verbose=0)
        # Nothing useful was written; drop the temporary file.
        os.remove(abs_path)
    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
def _update_job_lastrun_time(jobname):
    """Record in the expJOB table that JOBNAME last ran at the current
    task's starting time."""
    started_at = task_get_task_param('task_starting_time')
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s",
            (started_at, jobname))
def get_citation_informations(recid_list, config):
    """scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a 4 list of dictionaries that contains the citation information
       of cds records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.

        :param recid_list: record ids to scan
        :param config: ConfigParser-like object for the rank method; every
            lookup below is wrapped in try/except because a missing option
            raises (NoSectionError/NoOptionError) rather than returning None.
        :return: [d_reports_numbers, d_references_report_numbers,
                  d_references_s, d_records_s] — four empty dicts on any
                  configuration error.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {}  # dict of recid -> institute-given-report-code
    d_references_report_numbers = {}  # dict of recid -> ['astro-ph/xyz']
    # dict of recid -> list_of_the_entries_of_this_recs_bibliography
    d_references_s = {}
    d_records_s = {}  # dict of recid -> this_records_publication_info
    citation_informations = []
    write_message("config function " + config.get("rank_method", "function"),
                  verbose=9)
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        # NOTE(review): bare except is used throughout this function to treat
        # any config-lookup failure as "option missing".
        register_exception(
            prefix="cfg section [rank_method] has no attribute called function",
            alert_admin=True)
        # we cannot continue
        return [{}, {}, {}, {}]
    record_pri_number_tag = ""
    try:
        record_pri_number_tag = config.get(function, "primary_report_number")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attribute primary_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]
    record_add_number_tag = ""
    try:
        record_add_number_tag = config.get(
            config.get("rank_method", "function"),
            "additional_report_number")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute additional_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]
    reference_number_tag = ""
    try:
        reference_number_tag = config.get(
            config.get("rank_method", "function"),
            "reference_via_report_number")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute reference_via_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]
    reference_tag = ""
    try:
        reference_tag = config.get(config.get("rank_method", "function"),
                                   "reference_via_pubinfo")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute reference_via_pubinfo",
                           alert_admin=True)
        return [{}, {}, {}, {}]
    p_record_pri_number_tag = tagify(parse_tag(record_pri_number_tag))
    # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(record_add_number_tag))
    # 088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(reference_number_tag))
    # 999C5r. this is in the reference list, refers to other records.
    # Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(reference_tag))
    # 999C5s. A standardized way of writing a reference in the reference
    # list. Like: Nucl. Phys. B 710 (2000) 371
    # fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function,
                                               "pubinfo_journal_format")
    except:
        # pubinfo tags are optional: missing ones simply disable the
        # journal-vol-(year)-pages reconstruction below.
        pass
    # print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag " + str(p_record_pri_number_tag))
        write_message("p_reference_tag " + str(p_reference_tag))
        write_message("publication_journal_tag " + str(publication_journal_tag))
        write_message("publication_format_string is " +
                      publication_format_string)
    done = 0  # for status reporting
    numrecs = len(recid_list)
    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    # (bibNNx table name is derived from the first two digits of the tag)
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_tag[0:2], (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_number_tag[0:2], (p_reference_number_tag,)):
        for recid in recid_list:
            if (done % 10 == 0):
                task_sleep_now_if_required()
                # in fact we can sleep any time here
            if (done % 1000 == 0):
                mesg = "get cit.inf done " + str(done) + " of " + str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done + 1
            pri_report_numbers = get_fieldvalues(recid, p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid, p_record_add_number_tag)
            reference_report_numbers = get_fieldvalues(recid,
                                                       p_reference_number_tag)
            references_s = get_fieldvalues(recid, p_reference_tag)
            # primary + additional report numbers together identify the record
            l_report_numbers = pri_report_numbers
            l_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = l_report_numbers
            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers
            # NOTE(review): this repeats the get_fieldvalues() call made a few
            # lines above with identical arguments — looks redundant.
            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid) + "'s " + str(p_reference_tag) +
                          " values " + str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s
            # get a combination of
            # journal vol (year) pages
            if publication_pages_tag and publication_journal_tag and \
               publication_volume_tag and publication_year_tag and \
               publication_format_string:
                tagsvalues = {}  # we store the tags and their values here
                                 # like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues["p"] = ""
                tagsvalues["y"] = ""
                tagsvalues["c"] = ""
                tagsvalues["v"] = ""
                tmp = get_fieldvalues(recid, publication_journal_tag)
                if tmp:
                    tagsvalues["p"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_volume_tag)
                if tmp:
                    tagsvalues["v"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_year_tag)
                if tmp:
                    tagsvalues["y"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    # if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages
                # format the publ infostring according to the format
                publ = ""
                ok = 1
                for i in range(0, len(publication_format_string)):
                    current = publication_format_string[i]
                    # these are supported
                    if current == "p" or current == "c" or current == "v" \
                       or current == "y":
                        if tagsvalues[current]:
                            # add the value in the string
                            publ += tagsvalues[current]
                        else:
                            ok = 0
                            break  # it was needed and not found
                    else:
                        # just add the character in the format string
                        publ += current
                if ok:
                    write_message("d_records_s (publication info) for " +
                                  str(recid) + " is " + publ, verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag + " or " + p_reference_tag + \
                ". Nothing to do."
        write_message(mesg)
    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)
    citation_informations.append(d_reports_numbers)
    citation_informations.append(d_references_report_numbers)
    citation_informations.append(d_references_s)
    citation_informations.append(d_records_s)
    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))
    return citation_informations
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information

    NOTE(review): a second, simpler definition of get_citation_weight appears
    later in this file; at import time the later definition shadows this one.
    Confirm which version is intended to survive and remove the other.
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)
    if task_get_option("quick") == "no":
        # thorough (non-quick) run: pretend nothing was ever indexed so that
        # every record counts as modified
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used",
                      verbose=3)
    last_modified_records = get_last_modified_rec(last_update_time)
    # id option forces re-indexing a certain range even if there are no
    # new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            # construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            # make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last + 1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)
        write_message("Last update " + str(last_update_time) + " records: " +
                      str(len(last_modified_records)) + " updates: " +
                      str(len(updated_recid_list)))
        # write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)
        # result_intermed should be warranted to exists!
        # but if the user entered a "-R" (do all) option, we need to
        # make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]
        else:
            # check indexing times of `journal' and `reportnumber`
            # indexes, since if they are not up to date yet, then we
            # should wait and not run citation indexing as of yet:
            last_timestamp_bibrec = run_sql(
                "SELECT DATE_FORMAT(MAX(modification_date), "
                "'%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec", (), 1)[0][0]
            last_timestamp_indexes = run_sql(
                "SELECT DATE_FORMAT(MAX(last_updated), "
                "'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX "
                "WHERE name IN (%s,%s)", ('journal', 'reportnumber'), 1)[0][0]
            if not last_timestamp_indexes or \
               not last_timestamp_bibrec or \
               last_timestamp_bibrec > last_timestamp_indexes:
                write_message("Not running citation indexer since "
                              "journal/reportnumber indexes are not up to "
                              "date yet.")
                return {}
        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]
        # Enrich updated_recid_list so that it would contain also
        # records citing or referring to updated records, so that
        # their citation information would be updated too. Not the
        # most efficient way to treat this problem, but the one that
        # requires least code changes until ref_analyzer() is more
        # nicely re-factored.
        updated_recid_list_set = intbitset(updated_recid_list)
        for somerecid in updated_recid_list:
            # add both citers and citees:
            updated_recid_list_set |= intbitset(
                citation_list_intermediate.get(somerecid, []))
            updated_recid_list_set |= intbitset(
                reference_list_intermediate.get(somerecid, []))
        updated_recid_list = list(updated_recid_list_set)
        # call the procedure that does the hard work by reading fields of
        # citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(updated_recid_list,
                                                          config)
        # write_message("citation_informations: "+str(citation_informations))
        # create_analysis_tables() #temporary..
        # test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        # call the analyser that uses the citation_informations to really
        # search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate,
                           config, updated_recid_list)
        # dic is docid-numberofreferences like {1: 2, 2: 0, 3: 1}
        # write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message("No new records added since last time this "
                      "rank method was executed")
    return dic
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist, config, updated_rec_list):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.

       :param citation_informations: 4-list produced by
           get_citation_informations(): [d_reports_numbers,
           d_references_report_numbers, d_references_s, d_records_s]
       :param initialresult: recid -> citation count, updated in place
       :param initial_citationlist: recid -> list of citing recids (in place)
       :param initial_referencelist: recid -> list of cited recids (in place)
       :param config: rank-method config (ConfigParser-like)
       :param updated_rec_list: recids whose citations changed this run
       :return: the updated citation-count dict ({} on config error)
    """
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(
            prefix="cfg section [rank_method] has no attr function",
            alert_admin=True)
        return {}
    pubrefntag = ""
    try:
        pubrefntag = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_report_number",
                           alert_admin=True)
        return {}
    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_pubinfo",
                           alert_admin=True)
        return {}
    # pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag " + pubrefntag)
        write_message("pubreftag " + pubreftag)
    # All three structures are mutated in place on purpose: callers pass the
    # intermediate dicts loaded from the database.
    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[0]
    # dict of recid -> institute_give_publ_id
    d_references_report_numbers = citation_informations[1]
    # dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2]
    # dict of recid -> publication_infos_in_its_bibliography
    d_records_s = citation_informations[3]  # recid -> its publication info

    t1 = os.times()[4]
    write_message("Phase 1: d_references_report_numbers")
    # d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_report_numbers done " + str(done) + \
                   " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            # checkpoint partial results to the db so a crash loses little
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            # it's ok to sleep too, we got something done
            task_sleep_now_if_required()
        done = done + 1
        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                # sanitise p.  BUGFIX: the original called p.replace(...)
                # without assigning the result, which is a no-op since
                # strings are immutable.
                p = p.replace("\n", '')
                # search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if rec_ids[0] not in result:
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if rec_ids[0] not in citation_list:
                        citation_list[rec_ids[0]] = []
                    # append unless this key already has the item
                    if thisrecid not in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        # and update result
                        result[rec_ids[0]] += 1
                    if thisrecid not in reference_list:
                        reference_list[thisrecid] = []
                    if rec_ids[0] not in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    # the reference we wanted was not found among our records.
                    # put the reference in the "missing".. however, it will
                    # look bad.. gfhgf/1254312, so get the corresponding
                    # 999C5s (full ref) too.
                    # This should really be done in the next loop
                    # d_references_s but the 999C5s fields are not yet
                    # normalized
                    # rectext = print_record(thisrecid, format='hm',
                    #                        ot=pubreftag[:-1])
                    rectext = ""  # print_record() call disabled to speed things up
                    lines = rectext.split("\n")
                    rpart = p  # to be used..
                    for l in lines:
                        if (l.find(p) > 0):
                            # the gfhgf/1254312 was found.. get the s-part
                            st = l.find('$s')
                            if (st > 0):
                                end = l.find('$', st)
                                if (end == st):
                                    end = len(l)
                                rpart = l[st + 2:end]
                    insert_into_missing(thisrecid, rpart)
    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]
    # try to find references based on 999C5s, like
    # Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    # PERF: compile once, outside the per-reference loop
    page_range_re = re.compile("(.*)(-\d+$)")
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            # write to db!
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            task_sleep_now_if_required()
        done = done + 1
        for refs in refss:
            if refs:
                p = refs
                # remove the latter page number if it is like 67-74
                matches = page_range_re.findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                try:
                    rec_ids = list(search_unit(p, 'journal'))
                except:
                    rec_ids = None
                # BUGFIX: the original logged str(rec_id), a variable that
                # was always None; log the actual search result instead.
                write_message("These match searching " + p + " in journal: " +
                              str(rec_ids), verbose=9)
                if rec_ids and rec_ids[0]:
                    # the refered publication is in our collection, remove
                    # from missing
                    remove_from_missing(p)
                else:
                    # it was not found so add in missing
                    insert_into_missing(thisrecid, p)
                # check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    # the above should always hold
                    if rec_ids[0] not in result:
                        result[rec_ids[0]] = 0
                    if rec_ids[0] not in citation_list:
                        citation_list[rec_ids[0]] = []
                    if thisrecid not in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        result[rec_ids[0]] += 1  # add count for this..
                    # update reference_list accordingly
                    if thisrecid not in reference_list:
                        reference_list[thisrecid] = []
                    if rec_ids[0] not in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")
    # search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1
        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []
                if rec_ids:
                    for recid in rec_ids:
                        # normal checks..
                        if thisrecid not in citation_list:
                            citation_list[thisrecid] = []
                        if recid not in reference_list:
                            reference_list[recid] = []
                        if thisrecid not in result:
                            result[thisrecid] = 0
                        # normal updates
                        if recid not in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if thisrecid not in reference_list[recid]:
                            reference_list[recid].append(thisrecid)
    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1
        p = recs.replace("\"", "")
        # search the publication string like
        # Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a'))
        write_message("These records match " + p + " in " + pubreftag +
                      " : " + str(rec_ids), verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                # normal checks
                if thisrecid not in result:
                    result[thisrecid] = 0
                if thisrecid not in citation_list:
                    citation_list[thisrecid] = []
                if rec_id not in reference_list:
                    reference_list[rec_id] = []
                if rec_id not in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if thisrecid not in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)
    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")
    # remove empty lists in citation and reference
    # (keys() makes a copy in Python 2, so deleting while looping is safe)
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]
    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    selfdic = {}
    # get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    # add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message(
            "Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                     initial_self_dict, config)
    # selfdic consists of
    # key k -> list of values [v1,v2,..]
    # where k is a record with author A and k cites v1,v2.. and A appears
    # in v1,v2..
    # create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if v in selfcitedbydic:
                tmplist = selfcitedbydic[v]
                if k not in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist

    write_message("Getting author citations")
    # get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    acit = task_get_option("author-citations")
    if not acit:
        # CONSISTENCY FIX: was a bare `print`; use write_message like the
        # identical self-cite message above so it reaches the task log.
        write_message("Author cites disabled. Use -A option to enable it.")
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                            initial_author_dict, config)

    if task_get_task_param('verbose') >= 3:
        # print only X first to prevent flood
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): " + str(tmpdict))
        write_message("size: " + str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): " + str(tmpdict))
        write_message("size: " + str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): " + str(tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): " + str(tmpdict))
        write_message("size: " + str(len(authorcitdic.keys())))

    insert_cit_ref_list_intodb(citation_list, reference_list,
                               selfcitedbydic, selfdic, authorcitdic)

    t5 = os.times()[4]
    write_message(
        "Execution time for analyzing the citation information generating the dictionary:"
    )
    write_message("... checking ref number: %.2f sec" % (t2 - t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3 - t2))
    write_message("... checking rec number: %.2f sec" % (t4 - t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5 - t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5 - t1))
    return result
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information

    NOTE(review): this is a second definition of get_citation_weight in the
    same module (a simpler variant of the one defined earlier, without the
    index-freshness check and the citer/citee enrichment).  Being later in
    the file, this definition is the one in effect at runtime.  Confirm
    which version is intended and delete the other.
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)
    if task_get_option("quick") == "no":
        # thorough run: treat every record as modified
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used",
                      verbose=3)
    last_modified_records = get_last_modified_rec(last_update_time)
    # id option forces re-indexing a certain range even if there are no
    # new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            # construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            # make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last + 1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)
        write_message("Last update " + str(last_update_time) + " records: " +
                      str(len(last_modified_records)) + " updates: " +
                      str(len(updated_recid_list)))
        # write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)
        # result_intermed should be warranted to exists!
        # but if the user entered a "-R" (do all) option, we need to
        # make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]
        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]
        # call the procedure that does the hard work by reading fields of
        # citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(updated_recid_list,
                                                          config)
        # write_message("citation_informations: "+str(citation_informations))
        # create_analysis_tables() #temporary..
        # test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        # call the analyser that uses the citation_informations to really
        # search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate,
                           config, updated_recid_list)
        # dic is docid-numberofreferences like {1: 2, 2: 0, 3: 1}
        # write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message("No new records added since last time this "
                      "rank method was executed")
    return dic
def task_run_core(): """Run the harvesting task. The row argument is the oaiharvest task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. """ reposlist = [] datelist = [] dateflag = 0 possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES] filepath_prefix = tmpHARVESTpath + "_" + str(task_get_task_param("task_id")) ### go ahead: build up the reposlist if task_get_option("repository") is not None: ### user requests harvesting from selected repositories write_message("harvesting from selected repositories") for reposname in task_get_option("repository"): row = get_row_from_reposname(reposname) if row == []: write_message("source name " + reposname + " is not valid") continue else: reposlist.append(get_row_from_reposname(reposname)) else: ### user requests harvesting from all repositories write_message("harvesting from all repositories in the database") reposlist = get_all_rows_from_db() ### go ahead: check if user requested from-until harvesting if task_get_option("dates"): ### for each repos simply perform a from-until date harvesting... 
### no need to update anything dateflag = 1 for element in task_get_option("dates"): datelist.append(element) error_happened_p = False j = 0 for repos in reposlist: j += 1 task_sleep_now_if_required() reponame = str(repos[0][6]) postmode = str(repos[0][9]) setspecs = str(repos[0][10]) harvested_files_list = [] if postmode in possible_postmodes: # Harvest phase harvestpath = filepath_prefix + "_" + str(j) + "_" + \ time.strftime("%Y%m%d%H%M%S") + "_harvested" if dateflag == 1: task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \ (reponame, \ str(datelist[0]), str(datelist[1]), j, \ len(reposlist))) exit_code, file_list = oai_harvest_get(prefix = repos[0][2], baseurl = repos[0][1], harvestpath = harvestpath, fro = str(datelist[0]), until = str(datelist[1]), setspecs = setspecs) if exit_code == 1 : write_message("source " + reponame + \ " was harvested from " + str(datelist[0]) \ + " to " + str(datelist[1])) harvested_files_list = file_list else: write_message("an error occurred while harvesting " "from source " + reponame + " for the dates chosen") error_happened_p = True continue elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0: write_message("source " + reponame + \ " was never harvested before - harvesting whole " "repository") task_update_progress("Harvesting %s (%i/%i)" % \ (reponame, j, \ len(reposlist))) exit_code, file_list = oai_harvest_get(prefix = repos[0][2], baseurl = repos[0][1], harvestpath = harvestpath, setspecs = setspecs) if exit_code == 1 : update_lastrun(repos[0][0]) harvested_files_list = file_list else : write_message("an error occurred while harvesting from " "source " + reponame) error_happened_p = True continue elif dateflag != 1 and repos[0][8] != 0: ### check that update is actually needed, ### i.e. 
lastrun+frequency>today timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) lastrundate = re.sub(r'\.[0-9]+$', '', str(repos[0][7])) # remove trailing .00 timeinsec = int(repos[0][8]) * 60 * 60 updatedue = add_timestamp_and_timelag(lastrundate, timeinsec) proceed = compare_timestamps_with_tolerance(updatedue, timenow) if proceed == 0 or proceed == -1 : #update needed! write_message("source " + reponame + " is going to be updated") fromdate = str(repos[0][7]) fromdate = fromdate.split()[0] # get rid of time # of the day for the moment task_update_progress("Harvesting %s (%i/%i)" % \ (reponame, j, \ len(reposlist))) exit_code, file_list = oai_harvest_get(prefix = repos[0][2], baseurl = repos[0][1], harvestpath = harvestpath, fro = fromdate, setspecs = setspecs) if exit_code == 1 : update_lastrun(repos[0][0]) harvested_files_list = file_list else : write_message("an error occurred while harvesting " "from source " + reponame) error_happened_p = True continue else: write_message("source " + reponame + " does not need updating") continue elif dateflag != 1 and repos[0][8] == 0: write_message("source " + reponame + \ " has frequency set to 'Never' so it will not be updated") continue # Harvesting done, now convert/extract/filter/upload as requested if len(harvested_files_list) < 1: write_message("No records harvested for %s" % (reponame,)) continue active_files_list = harvested_files_list # Convert phase if 'c' in postmode: converted_files_list = [] i = 0 for active_file in active_files_list: i += 1 task_sleep_now_if_required() task_update_progress("Converting material harvested from %s (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) converted_file = filepath_prefix + "_" + str(i) + "_" + \ time.strftime("%Y%m%d%H%M%S") + "_converted" converted_files_list.append(converted_file) (exitcode, err_msg) = call_bibconvert(config = str(repos[0][5]), harvestpath = active_file, convertpath = converted_file) if exitcode == 0: write_message("material harvested 
from source " + reponame + " was successfully converted") else: write_message("an error occurred while converting from " + reponame + ': \n' + err_msg) error_happened_p = True continue # print stats: for converted_file in converted_files_list: write_message("File %s contains %i records." % \ (converted_file, get_nb_records_in_file(converted_file))) active_files_list = converted_files_list if 'e' in postmode: # Download tarball for each harvested/converted record, then run plotextrator. # Update converted xml files with generated xml or add it for upload extracted_files_list = [] i = 0 for active_file in active_files_list: i += 1 task_sleep_now_if_required() task_update_progress("Extracting material harvested from %s (%i/%i)" % \ (reponame, i, len(active_files_list))) extracted_file = filepath_prefix + "_" + str(i) + "_" + \ time.strftime("%Y%m%d%H%M%S") + "_extracted" extracted_files_list.append(extracted_file) (exitcode, err_msg) = call_plotextractor(active_file, extracted_file) if exitcode == 0: write_message("material harvested from source " + reponame + " was successfully extracted") else: write_message("an error occurred while extracting from " + reponame + ': \n' + err_msg) error_happened_p = True continue # print stats: for extracted_file in extracted_files_list: write_message("File %s contains %i records." 
% \ (extracted_file, get_nb_records_in_file(extracted_file))) active_files_list = extracted_files_list # Filter-phase if 'f' in postmode: # first call bibfilter: res = 0 uploaded = False i = 0 for active_file in active_files_list: i += 1 task_sleep_now_if_required() task_update_progress("Filtering material harvested from %s (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibfilter(str(repos[0][11]), active_file) if len(active_files_list) > 0: if res == 0: write_message("material harvested from source " + reponame + " was successfully bibfiltered") else: write_message("an error occurred while bibfiltering " "harvest from " + reponame) error_happened_p = True continue # print stats: for active_file in active_files_list: write_message("File %s contains %i records." % \ (active_file + ".insert.xml", get_nb_records_in_file(active_file + ".insert.xml"))) write_message("File %s contains %i records." % \ (active_file + ".correct.xml", get_nb_records_in_file(active_file + ".correct.xml"))) write_message("File %s contains %i records." % \ (active_file + ".append.xml", get_nb_records_in_file(active_file + ".append.xml"))) write_message("File %s contains %i records." 
% \ (active_file + ".holdingpen.xml", get_nb_records_in_file(active_file + ".holdingpen.xml"))) # Upload files if "u" in postmode: if 'f' in postmode: # upload filtered files i = 0 for active_file in active_files_list: task_sleep_now_if_required() i += 1 if get_nb_records_in_file(active_file + ".insert.xml") > 0: task_update_progress("Uploading new records harvested from %s (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibupload(active_file + ".insert.xml", \ ["-i"], oai_src_id = repos[0][0]) uploaded = True task_sleep_now_if_required() if get_nb_records_in_file(active_file + ".correct.xml") > 0: task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibupload(active_file + ".correct.xml", \ ["-c"], oai_src_id = repos[0][0]) uploaded = True if get_nb_records_in_file(active_file + ".append.xml") > 0: task_update_progress("Uploading additions for records harvested from %s (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibupload(active_file + ".append.xml", \ ["-a"], oai_src_id = repos[0][0]) uploaded = True if get_nb_records_in_file(active_file + ".holdingpen.xml") > 0: task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibupload(active_file + ".holdingpen.xml", \ ["-o"], oai_src_id = repos[0][0]) uploaded = True if len(active_files_list) > 0: if res == 0: if uploaded: write_message("material harvested from source " + reponame + " was successfully uploaded") else: write_message("nothing to upload") else: write_message("an error occurred while uploading " "harvest from " + reponame) error_happened_p = True continue else: # upload files normally res = 0 i = 0 uploaded = False for active_file in active_files_list: i += 1 task_sleep_now_if_required() if get_nb_records_in_file(active_file) > 0: task_update_progress("Uploading records harvested from %s 
(%i/%i)" % \ (reponame, \ i, \ len(active_files_list))) res += call_bibupload(active_file, oai_src_id = repos[0][0]) uploaded = True if res == 0: if uploaded: write_message("material harvested from source " + reponame + " was successfully uploaded") else: write_message("nothing to upload") else: write_message("an error occurred while uploading " "harvest from " + reponame) error_happened_p = True continue else: ### this should not happen write_message("invalid postprocess mode: " + postmode + " skipping repository") error_happened_p = True continue if error_happened_p: return False else: return True
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict, config):
    """Traverse citedbydict to build a "which author is cited where" dict.

    The keys of the result are author names.  An entry like
    "Apollinaire" -> [1, 2, 3] means Apollinaire is cited in records
    1, 2 and 3.

    @param updated_redic_list: records to be searched
    @param citedbydict: dict of recid -> list of recids citing it
    @param initial_author_dict: the author dict loaded from the database;
        used as the seed of the result
    @param config: rank-method config (ConfigParser-like object)
    @return: dict author name -> list of recids the author is cited in;
        on a missing config attribute, initial_author_dict is returned
        unchanged
    """
    tagnames = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tagnames:
        try:
            tagvals[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            # narrow from bare 'except:' so Ctrl-C is not swallowed
            register_exception(prefix="attribute " + t + " missing in config",
                               alert_admin=True)
            return initial_author_dict

    # parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag " + mainauthortag)
        write_message("coauthortag " + coauthortag)
        write_message("extauthortag " + extauthortag)

    author_cited_in = initial_author_dict

    def _authors_of(recid):
        # All author name variants attached to a record (main + co + alt).
        authors = get_fieldvalues(recid, mainauthortag)
        authors.extend(get_fieldvalues(recid, coauthortag))
        authors.extend(get_fieldvalues(recid, extauthortag))
        return authors

    def _merge_citers(author, citers):
        # Merge 'citers' into the author's cited-in list without duplicates.
        # BUGFIX: always store a *copy* for a new author; the previous code
        # stored the shared list object owned by citedbydict, so later
        # in-place appends silently corrupted the input dictionary.
        tmplist = author_cited_in.get(author)
        if tmplist is None:
            author_cited_in[author] = list(citers)
        else:
            for citer in citers:
                if citer not in tmplist:
                    tmplist.append(citer)

    if citedbydict:
        i = 0  # just a counter for progress reporting
        write_message("Checking records referred to in new records")
        for u in updated_redic_list:
            if (i % 1000 == 0):
                mesg = "Author ref done " + str(i) + " of " + \
                       str(len(updated_redic_list)) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1
            if u in citedbydict:
                these_cite_k = citedbydict[u]
                if these_cite_k is None:
                    these_cite_k = []  # normalize None to an empty list
                for a in _authors_of(u):
                    if a:
                        _merge_citers(a, these_cite_k)
        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        # Second pass: all keys of citedbydict, but only act on entries
        # that cite at least one of the new/updated records.
        write_message("Checking authors in new records")
        updated_recs = set(updated_redic_list)  # hoisted: O(1) membership tests
        i = 0
        for k in citedbydict.keys():
            if (i % 1000 == 0):
                mesg = "Author cit done " + str(i) + " of " + \
                       str(len(citedbydict.keys())) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1
            these_cite_k = citedbydict[k]
            if these_cite_k is None:
                these_cite_k = []
            # do things only if these_cite_k contains any new stuff
            if set(these_cite_k) & updated_recs:
                for a in _authors_of(k):
                    if a:
                        _merge_citers(a, these_cite_k)
        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in
def get_citation_informations(recid_list, config):
    """Scan the collections searching references (999C5x fields) and
       citations for items in the recid_list.

       Returns a list of 4 dictionaries that contain the citation
       information of cds records, indexed by recid:
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]

        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.

        On any missing config attribute, four empty dicts are returned.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {} #dict of recid -> institute-given-report-code
    d_references_report_numbers = {} #dict of recid -> ['astro-ph/xyz']
    d_references_s = {} #dict of recid -> list_of_the_entries_of_this_recs_bibliography
    d_records_s = {} #dict of recid -> this_records_publication_info
    citation_informations = []

    write_message("config function "+config.get("rank_method", "function"), verbose=9)
    # Resolve the rank-method section name; every tag below is read from it.
    # NOTE(review): the bare excepts here act as "config attribute missing"
    # control flow and also swallow unrelated errors.
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(prefix="cfg section [rank_method] has no attribute called function", alert_admin=True)
        #we cannot continue
        return [ {}, {}, {}, {} ]
    record_pri_number_tag = ""
    try:
        record_pri_number_tag = config.get(function, "primary_report_number")
    except:
        register_exception(prefix="cfg section "+function+" has no attribute primary_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]
    record_add_number_tag = ""
    try:
        record_add_number_tag = config.get(config.get("rank_method", "function"),
                                           "additional_report_number")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute additional_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]
    reference_number_tag = ""
    try:
        reference_number_tag = config.get(config.get("rank_method", "function"),
                                          "reference_via_report_number")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute reference_via_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]
    reference_tag = ""
    try:
        reference_tag = config.get(config.get("rank_method", "function"),
                                   "reference_via_pubinfo")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute reference_via_pubinfo", alert_admin=True)
        return [ {}, {}, {}, {} ]

    p_record_pri_number_tag = tagify(parse_tag(record_pri_number_tag))
    #037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(record_add_number_tag))
    #088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(reference_number_tag))
    #999C5r. this is in the reference list, refers to other records. Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(reference_tag))
    #999C5s. A standardized way of writing a reference in the reference list. Like: Nucl. Phys. B 710 (2000) 371

    #fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    # Pubinfo tags are optional: a single try around all of them means
    # missing any one leaves the rest at their defaults above.
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function, "pubinfo_journal_format")
    except:
        pass

    #print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag "+str(p_record_pri_number_tag))
        write_message("p_reference_tag "+str(p_reference_tag))
        write_message("publication_journal_tag "+str(publication_journal_tag))
        write_message("publication_format_string is "+publication_format_string)

    done = 0 #for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_tag[0:2],
               (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_number_tag[0:2],
               (p_reference_number_tag,)):
        for recid in recid_list:
            if (done % 10 == 0):
                task_sleep_now_if_required()
                #in fact we can sleep any time here
            if (done % 1000 == 0):
                mesg = "get cit.inf done "+str(done)+" of "+str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done+1

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            pri_report_numbers = get_fieldvalues(recid, p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid, p_record_add_number_tag)
            reference_report_numbers = get_fieldvalues(recid, p_reference_number_tag)
            references_s = get_fieldvalues(recid, p_reference_tag)

            # primary + additional report numbers for this record
            l_report_numbers = pri_report_numbers
            l_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = l_report_numbers

            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            # NOTE(review): duplicate fetch -- references_s was already
            # retrieved for this recid a few lines above
            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid)+"'s "+str(p_reference_tag)+" values "+str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            #get a combination of
            #journal vol (year) pages
            if publication_pages_tag and publication_journal_tag and \
                 publication_volume_tag and publication_year_tag and publication_format_string:
                tagsvalues = {} #we store the tags and their values here
                                #like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues["p"] = ""
                tagsvalues["y"] = ""
                tagsvalues["c"] = ""
                tagsvalues["v"] = ""
                tmp = get_fieldvalues(recid, publication_journal_tag)
                if tmp:
                    tagsvalues["p"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_volume_tag)
                if tmp:
                    tagsvalues["v"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_year_tag)
                if tmp:
                    tagsvalues["y"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    #if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages

                #format the publ infostring according to the format
                publ = ""
                ok = 1
                for i in range(0, len(publication_format_string)):
                    current = publication_format_string[i]
                    #these are supported
                    if current == "p" or current == "c" or current == "v" \
                            or current == "y":
                        if tagsvalues[current]:
                            #add the value in the string
                            publ += tagsvalues[current]
                        else:
                            ok = 0
                            break #it was needed and not found
                    else:
                        publ += current #just add the character in the format string
                if ok:
                    write_message("d_records_s (publication info) for "+str(recid)+" is "+publ, verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag+" or "+p_reference_tag+". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # assemble the 4-dict result in the documented order
    citation_informations.append(d_reports_numbers)
    citation_informations.append(d_references_report_numbers)
    citation_informations.append(d_references_s)
    citation_informations.append(d_records_s)
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations
def get_citation_weight(rank_method_code, config):
    """Return the dictionary used by the bibrank daemon to generate the
    index of search results sorted by citation information.
    """
    t_start = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    thorough = (task_get_option("quick") == "no")
    if thorough:
        # "-R" / quick=no: ignore the stored timestamp, reindex everything
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used", verbose=3)

    last_modified_records = get_last_modified_rec(last_update_time)
    # the id option forces re-indexing a given range even with no new recs
    recid_range = task_get_option("id")
    if not (last_modified_records or recid_range):
        write_message("No new records added since last time this rank method was executed")
        return {}

    if recid_range:
        # build [first, last] inclusive, hence last + 1, so that
        # e.g. -i 1-2 really means [1, 2] and not [1]
        first = recid_range[0][0]
        last = recid_range[0][1]
        updated_recid_list = range(first, last + 1)
    else:
        updated_recid_list = create_recordid_list(last_modified_records)

    write_message("Last update %s records: %s updates: %s"
                  % (last_update_time, len(last_modified_records),
                     len(updated_recid_list)))

    result_intermediate = last_updated_result(rank_method_code)
    # the intermediate result should be warranted to exist, but a
    # thorough ("-R", do-all) run must start from an empty set
    if thorough:
        result_intermediate = [{}, {}, {}]
    weights = result_intermediate[0]
    cites = result_intermediate[1]
    refs = result_intermediate[2]

    # read citation/reference fields of the updated recids (nothing else)
    if task_get_task_param('verbose') >= 9:
        write_message("Entering get_citation_informations")
    citation_informations = get_citation_informations(updated_recid_list,
                                                      config)
    write_message("Entering ref_analyzer", verbose=9)
    # the analyzer resolves x-cites-y across the whole collection
    dic = ref_analyzer(citation_informations, weights, cites, refs,
                       config, updated_recid_list)
    # dic maps docid -> number of known references, e.g. {1: 2, 2: 0, 3: 1}
    write_message("Total time of get_citation_weight(): %.2f sec"
                  % (time.time() - t_start))
    task_update_progress("citation analysis done")
    return dic
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict, config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.
       The keys of this are author names. An entry like "Apollinaire"->[1,2,3] means
       Apollinaire is cited in records 1,2 and 3.
       Input: citedbydict, updated_redic_list = records to be searched,
       initial_author_dict: the dicts from the database.
       Output: authorciteddict. It is initially set to initial_author_dict

       NOTE(review): near-duplicate of another get_author_citations
       definition visible in this chunk; consider unifying -- TODO confirm
       which copy is the live one.
    """
    #sorry bout repeated code to get the tags
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            x = config.get(config.get("rank_method", "function"), t)
            tagvals[t] = x
        except:
            # bare except: any failure to read the config attribute aborts
            # and returns the seed dict unchanged
            register_exception(prefix="attribute "+t+" missing in config",
                               alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag "+mainauthortag)
        write_message("coauthortag "+coauthortag)
        write_message("extauthortag "+extauthortag)

    # result dict; seeded from (and aliasing) initial_author_dict
    author_cited_in = initial_author_dict
    if citedbydict:
        i = 0 #just a counter for debug
        write_message("Checking records referred to in new records")
        for u in updated_redic_list:
            if (i % 1000 == 0):
                mesg = "Author ref done "+str(i)+" of "+str(len(updated_redic_list))+" records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1
            if citedbydict.has_key(u):
                these_cite_k = citedbydict[u]
                if (these_cite_k is None):
                    these_cite_k = [] #verify it is an empty list, not None
                # gather all author name variants of record u
                authors = get_fieldvalues(u, mainauthortag)
                coauthl = get_fieldvalues(u, coauthortag)
                extauthl = get_fieldvalues(u, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        for citer in these_cite_k:
                            tmplist = author_cited_in[a]
                            if (tmplist.count(citer) == 0):
                                tmplist.append(citer)
                            author_cited_in[a] = tmplist
                    else:
                        # NOTE(review): stores the list object shared with
                        # citedbydict -- later appends mutate the input dict
                        author_cited_in[a] = these_cite_k

        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        i = 0
        for k in citedbydict.keys():
            if (i % 1000 == 0):
                mesg = "Author cit done "+str(i)+" of "+str(len(citedbydict.keys()))+" records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            these_cite_k = citedbydict[k]
            if (these_cite_k is None):
                these_cite_k = [] #verify it is an empty list, not None
            #do things only if these_cite_k contains any new stuff
            intersec_list = list(set(these_cite_k)&set(updated_redic_list))
            if intersec_list:
                authors = get_fieldvalues(k, mainauthortag)
                coauthl = get_fieldvalues(k, coauthortag)
                extauthl = get_fieldvalues(k, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        for citer in these_cite_k:
                            tmplist = author_cited_in[a]
                            if (tmplist.count(citer) == 0):
                                tmplist.append(citer)
                            author_cited_in[a] = tmplist
                    else:
                        author_cited_in[a] = these_cite_k

        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in
def task_run_core(name=NAME):
    """
    Perform a search to find records without a texkey, generate a new
    one for each and upload the changes in chunks.

    @param name: task name used for the last-run bookkeeping
    @return: True on completion (errors on individual records are
        logged and skipped)
    """
    # Texkey provenances that mark a record as already keyed (hoisted:
    # constant, no need to rebuild it per 035 instance).
    provenances = ("SPIRESTeX", "INSPIRETeX")

    recids = task_get_task_param('recids')
    if recids:
        # explicit recids given: do not touch the last-run timestamp
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(
            perform_request_search(p='-035:spirestex -035:inspiretex',
                                   cc='HEP'))
        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s"
                              % (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration"
                    % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d"
                          % (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        # BUGFIX: enumerate() is 0-based, so the old "% (count, ...)"
        # reported "Done 0 out of N" after the first record and never
        # reached N; report count + 1 instead.
        task_update_progress("Done %d out of %d." % (count + 1, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist, config, updated_rec_list):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.

       @param citation_informations: the 4-dict list produced by
           get_citation_informations()
       @param initialresult: seed dict recid -> citation count
       @param initial_citationlist: seed dict recid -> list of citing recids
       @param initial_referencelist: seed dict recid -> list of cited recids
       @param config: rank-method config (ConfigParser-like object)
       @param updated_rec_list: recids whose entries are recomputed
       @return: the updated citation-count dict (empty dict on config error)

       NOTE: the seed dicts are mutated in place and also written to the
       database via insert_cit_ref_list_intodb() before returning.
    """
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(prefix="cfg section [rank_method] has no attr function", alert_admin=True)
        return {}

    pubrefntag = ""
    try:
        pubrefntag = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_report_number", alert_admin=True)
        return {}

    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_pubinfo", alert_admin=True)
        return {}

    #pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag "+pubrefntag)
        write_message("pubreftag "+pubreftag)

    # aliases: all three seed dicts are updated in place below
    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[0] #dict of recid -> institute_give_publ_id
    d_references_report_numbers = citation_informations[1] #dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2]
       #dict of recid -> publication_infos_in_its_bibliography
    d_records_s = citation_informations[3] #recid -> its publication inf
    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from citation dictionaries; they will be filled later")
    for somerecid in updated_rec_list:
        try:
            del citation_list[somerecid]
        except KeyError:
            pass
        try:
            del reference_list[somerecid]
        except KeyError:
            pass

    write_message("Phase 1: d_references_report_numbers")
    #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    #meaning: rec 8 contains these in bibliography
    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()
        done = done+1

        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                #sanitise p
                # NOTE(review): str.replace returns a new string; this call
                # discards its result, so p is NOT actually sanitised here.
                p.replace("\n",'')
                #search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    # only the first match rec_ids[0] is used
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    #append unless this key already has the item
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        #and update result
                        result[rec_ids[0]] += 1

                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    #the reference we wanted was not found among our records.
                    #put the reference in the "missing".. however, it will look
                    #bad.. gfhgf/1254312, so get the corresponding 999C5s (full ref) too
                    #This should really be done in the next loop d_references_s
                    #but the 999C5s fields are not yet normalized

                    #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1])
                    rectext = "" # print_record() call disabled to speed things up
                    # NOTE(review): with rectext always "", this loop never
                    # finds anything and rpart stays equal to p
                    lines = rectext.split("\n")
                    rpart = p #to be used..
                    for l in lines:
                        if (l.find(p) > 0): #the gfhgf/1254312 was found.. get the s-part of it
                            st = l.find('$s')
                            if (st > 0):
                                end = l.find('$', st)
                                if (end == st):
                                    end = len(l)
                                rpart = l[st+2:end]
                    insert_into_missing(thisrecid, rpart)

    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()
        done = done+1

        for refs in refss:
            if refs:
                p = refs
                #remove the latter page number if it is like 67-74
                matches = re.compile("(.*)(-\d+$)").findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                rec_id = None
                try:
                    rec_ids = list(search_unit(p, 'journal') - INTBITSET_OF_DELETED_RECORDS)
                except:
                    rec_ids = None
                # NOTE(review): this debug line prints str(rec_id), which is
                # always None here -- presumably rec_ids was intended
                write_message("These match searching "+p+" in journal: "+str(rec_id), verbose=9)
                if rec_ids and rec_ids[0]:
                    #the refered publication is in our collection, remove
                    #from missing
                    remove_from_missing(p)
                else:
                    #it was not found so add in missing
                    insert_into_missing(thisrecid, p)
                #check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    #the above should always hold
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid) #append actual list
                        result[rec_ids[0]] += 1 #add count for this..
                    #update reference_list accordingly
                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])

    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")

    #search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1

        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []

                if rec_ids:
                    # note: unlike phases 1-2, ALL matches are used here
                    for recid in rec_ids:
                        #normal checks..
                        if not citation_list.has_key(thisrecid):
                            citation_list[thisrecid] = []
                        if not reference_list.has_key(recid):
                            reference_list[recid] = []
                        if not result.has_key(thisrecid):
                            result[thisrecid] = 0

                        #normal updates
                        if not recid in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if not thisrecid in reference_list[recid]:
                            reference_list[recid].append(thisrecid)

    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    #find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1
        p = recs.replace("\"","")
        #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a') - INTBITSET_OF_DELETED_RECORDS)
        write_message("These records match "+p+" in "+pubreftag+" : "+str(rec_ids), verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                #normal checks
                if not result.has_key(thisrecid):
                    result[thisrecid] = 0
                if not citation_list.has_key(thisrecid):
                    citation_list[thisrecid] = []
                if not reference_list.has_key(rec_id):
                    reference_list[rec_id] = []

                if not rec_id in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if not thisrecid in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)

    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")

    #remove empty lists in citation and reference
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]

    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    selfdic = {}
    #get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    #add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message("Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                     initial_self_dict, config)
    #selfdic consists of
    #key k -> list of values [v1,v2,..]
    #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2..

    #create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if selfcitedbydic.has_key(v):
                tmplist = selfcitedbydic[v]
                if not k in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist

    write_message("Getting author citations")

    #get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    # NOTE(review): acit was already read above; duplicate fetch
    acit = task_get_option("author-citations")
    if not acit:
        # NOTE(review): Python 2 print statement; also inconsistent with the
        # write_message() used for the same situation in Phase 6 above
        print "Author cites disabled. Use -A option to enable it."
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                            initial_author_dict, config)

    if task_get_task_param('verbose') >= 3:
        #print only X first to prevent flood
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): "+str(tmpdict))
        write_message("size: "+str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): "+str(tmpdict))
        write_message("size: "+str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): "+str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): "+str(tmpdict))
        write_message("size: "+str(len(authorcitdic.keys())))

    # persist all computed dictionaries
    insert_cit_ref_list_intodb(citation_list, reference_list,
                               selfcitedbydic, selfdic, authorcitdic)

    t5 = os.times()[4]

    write_message("Execution time for analyzing the citation information generating the dictionary:")
    write_message("... checking ref number: %.2f sec" % (t2-t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3-t2))
    write_message("... checking rec number: %.2f sec" % (t4-t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5-t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5-t1))

    return result
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    Works in two directions over the extracted metadata:
    * phases 1-6 resolve each *reference* of an updated record to an
      existing record id ("x cites y");
    * phases 7-12 search other records' bibliographies for identifiers of
      the updated records themselves ("x is cited by y").

    @param citation_informations: pair (records_info, references_info);
        each is a dict mapping a metadata kind ('report-numbers',
        'journals', 'doi', 'hdl', 'isbn', 'record_id') to
        {recid: [values]}
    @param updated_recids: collection of record ids being (re)indexed
    @param tags: dict of logical names -> MARC tags used to search other
        records' reference fields
    @param config: indexer configuration, passed through to
        get_recids_matching_query
    @return: tuple (citations, references); citations[recid] is the set of
        records citing recid, references[recid] the set of records recid
        cites
    """
    citations = {}
    references = {}
    for recid in updated_recids:
        citations[recid] = set()
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        # Periodic housekeeping while looping over records: yield to the
        # scheduler every 30 records, report progress every 1000.
        if done % 30 == 0:
            task_sleep_now_if_required()
        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)
        write_message("Processing: %s" % recid, verbose=9)

    def phase_done():
        # Report the completion of a phase in the log and task progress.
        mesg = "done fully"
        write_message(mesg)
        task_update_progress(mesg)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return
        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return
        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    def lookup_and_add_ref(thisrecid, value, field, label):
        # Search for `value` in `field`, keep the missing-citations table
        # up to date and register the first matching record as a
        # reference of `thisrecid`. `label` only names the value kind in
        # the warning message (e.g. 'DOI', 'ISBN').
        recids = get_recids_matching_query(p=value, f=field, config=config)
        write_message("These match searching %s in %s: %s" %
                      (value, field, list(recids)), verbose=9)
        if not recids:
            insert_into_missing(thisrecid, value)
        else:
            remove_from_missing(value)
        if len(recids) > 1:
            store_citation_warning('multiple-matches', value)
            msg = "Whoops: record '%d' %s value '%s' " \
                  "matches many records; taking only the first one. %s" % \
                  (thisrecid, label, value, repr(recids))
            write_message(msg, stream=sys.stderr)
        for recid in list(recids)[:1]:  # take only the first one
            add_to_refs(thisrecid, recid)

    def catchup(step_label, info_key, field, transform=None):
        # Search other records' reference fields (`field`) for this
        # record's own identifiers and register the matches as citations.
        # Always ends with a "done fully" progress update (this also
        # fixes phase 11, which previously never reported completion).
        done = 0
        info = records_info[info_key]
        for thisrecid, values in info.iteritems():
            step(step_label, thisrecid, done, len(info))
            done += 1
            for value in values:
                if transform is not None:
                    value = transform(value)
                recids = get_recids_matching_query(p=value,
                                                   f=field,
                                                   config=config)
                write_message("These records match %s in %s: %s" %
                              (value, field, list(recids)), verbose=9)
                for recid in recids:
                    add_to_cites(recid, thisrecid)
        phase_done()

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Phase 1: references by report number (999C5r),
    # e.g. 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in its bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    report_refs = references_info['report-numbers']
    for thisrecid, refnumbers in report_refs.iteritems():
        step("Report numbers references", thisrecid, done, len(report_refs))
        done += 1
        for refnumber in (r for r in refnumbers if r):
            # Search for "hep-th/5644654 or such" in existing records
            lookup_and_add_ref(thisrecid,
                               standardize_report_number(refnumber),
                               'reportnumber', 'report number')
    phase_done()

    t2 = os.times()[4]

    # Phase 2: references by journal pubinfo (999C5s),
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    journal_refs = references_info['journals']
    for thisrecid, refs in journal_refs.iteritems():
        step("Journal references", thisrecid, done, len(journal_refs))
        done += 1
        for p in (r for r in refs if r):
            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value
            lookup_and_add_ref(thisrecid, p, 'journal', 'reference')
    phase_done()

    t3 = os.times()[4]

    # Phase 3: references by DOI (999C5a), e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    doi_refs = references_info['doi']
    for thisrecid, refs in doi_refs.iteritems():
        step("DOI references", thisrecid, done, len(doi_refs))
        done += 1
        for p in (r for r in refs if r):
            lookup_and_add_ref(thisrecid, p, 'doi', 'DOI')
    phase_done()

    t4 = os.times()[4]

    # Phase 4: references by handle (999C5a), e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    hdl_refs = references_info['hdl']
    for thisrecid, refs in hdl_refs.iteritems():
        step("HDL references", thisrecid, done, len(hdl_refs))
        done += 1
        for p in (r for r in refs if r):
            lookup_and_add_ref(thisrecid, p, 'hdl', 'HDL')
    phase_done()

    t5 = os.times()[4]

    # Phase 5: references by record id (999C50), e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    recid_refs = references_info['record_id']
    for thisrecid, refs in recid_refs.iteritems():
        step("Record ID references", thisrecid, done, len(recid_refs))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field,
                                              config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)), verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])
    phase_done()

    t6 = os.times()[4]

    # Phase 6: references by ISBN (999C5i), e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    isbn_refs = references_info['isbn']
    for thisrecid, refs in isbn_refs.iteritems():
        step("ISBN references", thisrecid, done, len(isbn_refs))
        done += 1
        for p in (r for r in refs if r):
            lookup_and_add_ref(thisrecid, p, 'isbn', 'ISBN')
    phase_done()

    t7 = os.times()[4]

    # Phase 7: this record's report numbers in other records' refs,
    # e.g. CERN-TH-4859/87 in a list of references
    write_message("Phase 7: report numbers catchup")
    done = 0
    report_recs = records_info['report-numbers']
    for thisrecid, reportcodes in report_recs.iteritems():
        step("Report numbers catchup", thisrecid, done, len(report_recs))
        done += 1
        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv ids may carry an optional category suffix in refs,
                # e.g. "arXiv:1234.5678 [hep-th]"; match with a regexp
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                    re.escape(std_reportcode)
                recids = get_recids_matching_query(
                    p=report_pattern,
                    f=tags['refs_report_number'],
                    m='r',
                    config=config)
            else:
                recids = get_recids_matching_query(
                    p=reportcode,
                    f=tags['refs_report_number'],
                    config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)
    phase_done()

    # Phase 8: this record's pubinfo in other records' bibliography,
    # e.g. Phys. Lett., B 482 (2000) 417 in 999C5s
    write_message("Phase 8: journals catchup")
    t8 = os.times()[4]
    catchup("Journals catchup", 'journals', tags['refs_journal'],
            transform=lambda journal: journal.replace("\"", ""))

    write_message("Phase 9: DOI catchup")
    t9 = os.times()[4]
    catchup("DOI catchup", 'doi', tags['refs_doi'])

    # NOTE: HDLs live in the same reference subfield as DOIs (999C5a),
    # hence the refs_doi tag here.
    write_message("Phase 10: HDL catchup")
    t10 = os.times()[4]
    catchup("HDL catchup", 'hdl', tags['refs_doi'])

    write_message("Phase 11: ISBN catchup")
    t11 = os.times()[4]
    catchup("ISBN catchup", 'isbn', tags['refs_isbn'])

    write_message("Phase 12: Record ID catchup")
    t12 = os.times()[4]
    catchup("Record ID catchup", 'record_id', tags['refs_record_id'])

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13 - t1))

    return citations, references