def fun():
    """Run the task core, logging the traceback of any failure.

    Calls ``task_run_core(name, core_func, extra_vars)``; the three
    arguments are not parameters of this function and are taken from the
    surrounding scope.  Any exception is logged through write_message()
    and then re-raised unchanged.
    """
    try:
        outcome = task_run_core(name, core_func, extra_vars)
    except Exception:
        trace = traceback.format_exc()
        # Remove extra '\n'
        write_message(trace[:-1])
        raise
    return outcome
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.

    The command is built from CFG_PATH_MYSQL + 'dump' and the
    CFG_DATABASE_* settings and is run through run_shell_command(), which
    receives DIRNAME/FILENAME as its third argument.

    If the executable cannot be found, or if any of the three values
    returned by run_shell_command() (exit code, stdout, stderr) is
    non-empty, an error is logged to stderr, the task status is set to
    "ERROR" and the process exits with status 1.
    """
    target = dirname + os.sep + filename
    write_message("... writing %s" % target)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    exit_code, cmd_stdout, cmd_stderr = run_shell_command(cmd, None, target)
    # Report the value that actually triggered the failure (the stdout and
    # stderr branches used to print the exit code instead).
    for label, value in (("exit code", exit_code),
                         ("stdout", cmd_stdout),
                         ("stderr", cmd_stderr)):
        if value:
            write_message("ERROR: mysqldump %s is %s." % (label, repr(value)),
                          stream=sys.stderr)
            task_update_status("ERROR")
            sys.exit(1)
def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse):
    """The core function of the PAGERANK_EXT method.

    Starts from a float32 vector of ones of length ``len_`` and iterates:
    every ``sparse[(i, j)] * old[j]`` is accumulated into entry ``i`` of
    the new vector, then the sum of ``semi_sparse[j] * old[j]`` is added
    to entries ``1 .. len_ - 1``.  At the last step of every block of
    ``check_point`` steps, the Euclidean norm of the difference between
    the new and the old vector, divided by ``len_``, is computed; the
    iteration stops once that value is below ``conv_threshold``.

    Returns entries ``1 .. len_ - 1`` of the final vector (the ranks
    corresponding to each recid).
    """
    ranks = ones((len_), float32)
    rounds = 0
    difference = len_
    converged = False
    while not converged:
        rounds += 1
        for step in range(check_point):
            updated = zeros((len_), float32)
            for (row, col), factor in sparse.items():
                updated[row] += factor * ranks[col]
            shared = 0.0
            for col in semi_sparse:
                shared += semi_sparse[col] * ranks[col]
            updated[1:len_] = updated[1:len_] + shared
            if step == check_point - 1:
                # Convergence is only measured at the end of a block
                delta = updated - ranks
                difference = sqrt(dot(delta, delta)) / len_
                steps_done = check_point * (rounds - 1) + step
                write_message("Finished step: %s, %s "
                              % (str(steps_done), str(difference)),
                              verbose=5)
            ranks = updated.copy()
            converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. "
                  "The threshold was %s" % (str(rounds), str(difference)),
                  verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return ranks[1:len_]
def upload_amendments(records, holdingpen):
    """
    Upload modified records through a bibupload task.

    The records are serialised with record_xml_output() into one MARCXML
    collection, written to a temporary file created in CFG_TMPSHAREDDIR
    and submitted with task_low_level_submission().  Nothing is done when
    the "no_upload" task option is set or when RECORDS is empty.

    @param records: the records to upload
    @param holdingpen: when true the task is submitted with the "-o"
        flag, otherwise with "-r"
    """
    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml = ''.join(
        ['<collection xmlns="http://www.loc.gov/MARC21/slim">'] +
        [record_xml_output(record) for record in records] +
        ["</collection>"])

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    try:
        os.write(tmp_file_fd, xml)
    finally:
        # Release the descriptor even when the write fails
        os.close(tmp_file_fd)
    os.chmod(tmp_file, 0o644)

    flag = "-o" if holdingpen else "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
def get_config_parameter(jobname, parameter_name, is_parameter_collection = False):
    """Read a parameter from the 'export_job' section of JOBNAME.cfg.

    The file is looked up as CFG_ETCDIR/bibexport/JOBNAME.cfg.  If it does
    not exist an error is logged and None is returned.

    When IS_PARAMETER_COLLECTION is true, return the list of values of
    all options whose name starts with PARAMETER_NAME; otherwise return
    the value of the option called PARAMETER_NAME."""
    jobconfig = ConfigParser()
    jobconffile = os.sep.join([CFG_ETCDIR, 'bibexport', jobname + '.cfg'])
    if not os.path.exists(jobconffile):
        write_message("ERROR: cannot find config file %s." % jobconffile)
        return None
    jobconfig.read(jobconffile)

    if not is_parameter_collection:
        return jobconfig.get('export_job', parameter_name)

    return [item_value
            for item_name, item_value in jobconfig.items(section='export_job')
            if item_name.startswith(parameter_name)]
def extract_package(package, batch_size, delete_zip, directory):
    """Extract a package by calling extractAll(package, delete_zip, directory).

    When the archive raises BadZipfile, the error is logged, the task
    status is set to "CERROR" and PACKAGE is removed.

    BATCH_SIZE is accepted but not used by this function.
    """
    try:
        extractAll(package, delete_zip, directory)
    except BadZipfile:
        # Interpolate the package name into the message: handing the tuple
        # to write_message() as a second positional argument left the
        # "%s" placeholder unformatted.
        write_message("Error BadZipfile %s" % (package,))
        task_update_status("CERROR")
        remove(package)
def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """Return a dictionary with the number of external links per record.

    An external link is a citation that is not in our database.  For each
    recid in DICT_OF_IDS, the rows of the bibrec_bibNNx table whose
    id_bibxxx matches a reference tag are counted as the total number of
    links, REF[dict_of_ids[recid]] is subtracted as the internal ones and
    the difference (never below 0) is stored under dict_of_ids[recid].
    """
    ext_links = dict((dict_of_ids[recid], 0) for recid in dict_of_ids)
    total_refs = dict((recid, 0) for recid in dict_of_ids)

    # The first two characters of the indicator select the bibNNx table
    table_id = reference_indicator[0:2]
    tag_rows = run_sql("select id from bib" + table_id +
                       "x where tag RLIKE %s",
                       (reference_indicator + "[a-z]", ))
    tag_set = set(row[0] for row in tag_rows)

    ref_rows = run_sql("select id_bibrec, id_bibxxx, field_number from "
                       "bibrec_bib" + table_id + "x group by "
                       "id_bibrec, field_number")
    for row in ref_rows:
        recid = int(row[0])
        id_bib = int(row[1])
        if recid in dict_of_ids and id_bib in tag_set:
            total_refs[recid] += 1

    for recid in dict_of_ids:
        key = dict_of_ids[recid]
        total_links = total_refs[recid]
        internal_links = ref[key]
        ext_links[key] = max(total_links - internal_links, 0)

    write_message("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links
def submit_records_via_ftp(filename, location=""):
    """Submits given file to FTP server as defined.

    The FTP server uploaded to is controlled with the config variables:

    CFG_FTP_AUTHENTICATION_FILE (netrc_file)
    CFG_FTP_SERVER

    Failures are not propagated: any exception is logged through
    write_message() together with its traceback.

    @param filename: file to upload
    @type filename: str

    @param location: location on FTP server. Defaults to root.
    @type location: str
    """
    from invenio.config import (CFG_FTP_SERVER,
                                CFG_FTP_AUTHENTICATION_FILE,)

    try:
        ftp = FtpHandler(CFG_FTP_SERVER, netrc_file=CFG_FTP_AUTHENTICATION_FILE)
        try:
            ftp.upload(filename, location)
        finally:
            # Close the connection even when the upload fails
            ftp.close()
        write_message("%s successfully uploaded to FTP server" % filename)
    except Exception as e:
        write_message("Failed to upload %s to FTP server: %s\n%s"
                      % (filename, str(e), traceback.format_exc()))
def bst_inspire_authority_ids_synchronizer(
        url=SYNC_URL_INSPIRE_RECORDS_SRC,
        tmp_dir=SYNC_LOCAL_TMP_DIR):
    """Synchronize INSPIRE authority ids.

    The dump at ``url`` is obtained with get_inspire_dump() and parsed
    with parse_inspire_xml().  When authority ids were found, the record
    ids are requested with get_record_ids() and, if there are any,
    synchronize() is called with both.

    :param string url: valid URL to the gzip (.gz) file
    :param string tmp_dir: existing directory path for temporary files
    """
    dump_path = os.path.join(tmp_dir, SYNC_LOCAL_INSPIRE_RECORDS_FILE_NAME)
    xml_content = get_inspire_dump(url, dump_path)

    task_sleep_now_if_required()

    authority_ids = parse_inspire_xml(xml_content)

    task_sleep_now_if_required()

    if not authority_ids:
        return

    record_ids = get_record_ids()
    write_message(
        "Info: {0} record ids have been requested".format(len(record_ids)))
    if not record_ids:
        return

    updates_path = os.path.join(
        tmp_dir, SYNC_LOCAL_CDS_RECORDS_UPDATES_FILE_NAME)
    synchronize(record_ids, authority_ids, updates_path)
def fetch_concerned_arxiv_records(name):
    """Return the (recid, modification_date) pairs of arXiv records to handle.

    Candidates are the bibrec rows modified since the date returned by
    fetch_last_updated(name) and created during the last 7 days, ordered
    by modification date and limited to 5000.  A candidate is kept when
    one of its 037 $a values starts with 'arXiv' and when get_pdf_doc()
    returns a document whose ``md`` is later than that date.
    """
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records inserted since last run
    # (each fragment ends with a space so that the clauses stay separated
    # once the literals are concatenated)
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
          "WHERE `modification_date` >= %s " \
          "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
          "ORDER BY `modification_date` " \
          "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        """Tell whether any 037 $a value of the record starts with 'arXiv'."""
        record = get_record(recid)
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        """Tell whether the record has a PDF doc newer than the last run."""
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat())
                                      for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
def check_nbrecs_for_external_collection(self):
    """Check if the external collection has changed its total number of
    records, aka nbrecs.

    The new number is calculated once, with
    CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS as argument, and that same value
    is used both for the debug messages and for the result.

    Returns True if the total number of records has changed and False if
    it's the same."""
    # Calculate only once: the debug messages used to trigger two extra
    # calculations, made without the timeout argument, whose results could
    # differ from the one actually returned.
    new_nbrecs = self.calculate_nbrecs_for_external_collection(
        CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
    has_changed = self.nbrecs != new_nbrecs
    write_message("*** self.nbrecs = %s / self.cal...ion = %s ***"
                  % (str(self.nbrecs), str(new_nbrecs)), verbose=6)
    write_message("*** self.nbrecs != self.cal...ion = %s ***"
                  % (str(has_changed),), verbose=6)
    return has_changed
def get_records_to_harvest(parameters): """ Get APSRecord to harvest. Using the given parameters dict (from bst_apsharvest), we check how to get the list of records to process. Returns a tuple of (record_list, harvest_from_date, date_checked) where record_list is the list of APSRecord instances, harvest_from_date is the decided date to harvest from and date_checked is the datetime when the harvest was initiated. """ # This is the list of APSRecord objects to be harvested. final_record_list = APSRecordList() new_harvest_date = None harvest_from_date = None harvest_until_date = None if parameters.get("input_file"): # We get input from file with open(parameters.get("input_file")) as fd: for line in fd.readlines(): doi = line.strip() if not doi: continue final_record_list.append(APSRecord(doi=doi)) if parameters.get("threshold_date"): # Input from user. Validate date try: validate_date(parameters.get("threshold_date")) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) raise
def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE):
    """Inserts a new record into all the data structures.

    @param data_dict: maps recid to value; updated with the new record
    @param data_dict_ordered: maps recid to weight; receives the weight
        computed for the new record
    @param data_list_sorted: list of recids; the new recid is inserted at
        the position returned by binary_search()
    @param value: the value of the new record
    @param recid: the id of the new record
    @param spacing: distance added to the left neighbour's weight when
        the record is appended at the end or when there is no room left
        between its neighbours
    @return: the index at which the record was inserted
    """
    #data_dict
    data_dict[recid] = value
    #data_dict_ordered & data_list_sorted
    #calculate at which index the rec should be inserted in data_list_sorted
    index_for_insert = binary_search(data_list_sorted, value, data_dict)
    #we have to calculate the weight of this record in data_dict_ordered
    #and it will be the med between its neighbours in the data_list_sorted
    if index_for_insert == len(data_list_sorted):
        #insert at the end of the list
        if data_list_sorted:
            highest_weight = data_dict_ordered[data_list_sorted[-1]]
        else:
            #very first record: there is no neighbour to look at (indexing
            #[-1] after the append would point at the new record itself),
            #so start from 0 as is done for an insertion at the beginning
            highest_weight = 0
        data_list_sorted.append(recid)
        #weight = highest weight + the distance
        data_dict_ordered[recid] = highest_weight + spacing
    else:
        if index_for_insert == 0:
            #insert at the begining of the list
            left_neighbor_weight = 0
        else:
            left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]]
        right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]]
        #the recid's weight will be the med between left and right
        weight = (right_neighbor_weight - left_neighbor_weight)/2
        data_list_sorted.insert(index_for_insert, recid)
        if weight < 1:
            #there is no more space to insert, we have to create some space
            data_dict_ordered[recid] = left_neighbor_weight + spacing
            create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing)
        else:
            data_dict_ordered[recid] = left_neighbor_weight + weight
    write_message("Record %s done." %recid, verbose=5)
    return index_for_insert
def write_to_buckets_table(id_method, bucket_no, bucket_data, bucket_last_value, update_timestamp=True):
    """Serialize the bucket data and write it to the bsrMETHODDATABUCKET table.

    The existing row for (id_method, bucket_no) is deleted and a new one
    is inserted.

    @param id_method: value stored in the id_bsrMETHOD column
    @param bucket_no: number of the bucket
    @param bucket_data: object whose fastdump() output is stored
    @param bucket_last_value: value stored in the bucket_last_value column
    @param update_timestamp: when false, the timestamp already stored for
        this bucket is kept (if there is one) instead of the current time
    @return: True on success, False if the database raised an Error
    """
    write_message('Writing the data for bucket number %s for ' \
                  'method_id=%s to the database' \
                  %(bucket_no, id_method), verbose=5)
    write_message('Serializing data for bucket number %s' %bucket_no, verbose=5)
    serialized_bucket_data = bucket_data.fastdump()
    date = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if not update_timestamp:
        try:
            # The column is called last_updated, as in the INSERT below
            date = run_sql('SELECT last_updated from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \
                           (id_method, bucket_no))[0][0]
        except IndexError:
            pass # keep the generated date
    try:
        write_message('Deleting old data.', verbose=5)
        run_sql("DELETE FROM bsrMETHODDATABUCKET "
                "WHERE id_bsrMETHOD = %s AND bucket_no = %s",
                (id_method, bucket_no, ))
        write_message('Inserting new data.', verbose=5)
        run_sql("INSERT into bsrMETHODDATABUCKET "
                "(id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) "
                "VALUES (%s, %s, %s, %s, %s)",
                (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, ))
    except Error as err:
        write_message("The error [%s] occured when inserting new bibsort data " \
                      "into bsrMETHODATA_BUCKETS table" %err, sys.stderr)
        return False
    # Report success explicitly, mirroring the False returned on failure
    return True
def update_sorting(methods, recids):
    """Runs the updating of the sorting tables for methods and recids
    Recids is a list of integer numbers(record ids)
    but can also contain intervals

    Both arguments are parsed as comma-separated strings; an element of
    RECIDS containing '-' after its first character is expanded as an
    inclusive range min-max.

    NOTE(review): the part of the function visible here only parses the
    two arguments; the parsed lists are not used yet, so the actual
    update is presumably done further on - confirm against the full
    definition."""
    method_list = []
    if methods:
        method_list = methods.strip().split(',')

    recid_list = []
    if recids:
        cli_recid_list = recids.strip().split(',')
        for recid in cli_recid_list:
            # find() > 0: the dash must not be the first character
            if recid.find('-') > 0:
                rec_range = recid.split('-')
                try:
                    recid_min = int(rec_range[0])
                    recid_max = int(rec_range[1])
                    # recid_max is included in the range
                    for rec in range(recid_min, recid_max + 1):
                        recid_list.append(rec)
                # NOTE(review): int() raises ValueError on bad input;
                # verify that `Error` covers it, otherwise this handler
                # is never reached for a malformed range.
                except Error, err:
                    write_message("Error: [%s] occured while trying \
to parse the recids argument." %err, sys.stderr)
                    return False
            else:
                # NOTE(review): this conversion is outside the try block,
                # so a non-numeric element propagates its exception.
                recid_list.append(int(recid))
def submit_xml(xml, mode, stamp):
    """
    Write temporary xml file and submit for batchupload.
    Do nothing for empty xml.

    An unknown mode is logged and nothing is written or submitted.

    @param xml: body xml
    @param mode: mode for upload ['delete' | 'correct']
    @param stamp: additional string in filename
    """
    if not xml:
        return

    # Check the mode before creating anything on disk, so that a wrong
    # mode does not leave an orphan temporary file behind.
    if mode == 'delete':
        flag = '-d'
    elif mode == 'correct':
        flag = '-c'
    else:
        write_message("Wrong mode: %s" % mode)
        return

    xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'\
          + xml + '\n</collection>\n'
    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bst_check4template-%s_%s" % (mode, stamp),
        dir=CFG_TMPSHAREDDIR
    )
    try:
        os.write(tmp_file_fd, xml)
    finally:
        # Release the descriptor even when the write fails
        os.close(tmp_file_fd)
    os.chmod(tmp_file, 0o644)

    task = task_low_level_submission('bibupload', 'check4template', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
def query_records(params):
    """Produce record IDs from the given query parameters.

    PARAMS must provide the keys 'field', 'collection' and 'pattern'
    ('matching' is also read when no collection is given).  If none of
    the first three is set, an empty intbitset is returned.
    """
    write_message("Querying database (records query)...")
    empty_result = intbitset()
    has_criteria = params['field'] or params['collection'] or params['pattern']
    if not has_criteria:
        return empty_result

    if params['collection']:
        # use perform_request_search when '-c' argument has been
        # defined, as it is not supported by search_pattern()
        return intbitset(perform_request_search(req=None,
                                                of='id',
                                                c=params['collection'],
                                                p=params['pattern'],
                                                f=params['field']))

    # use search_pattern() whenever possible, as it can search
    # even in private collections
    return search_pattern(p=params['pattern'],
                          f=params['field'],
                          m=params['matching'])
def iterate_over_new(list, fmt):
    """Iterate over list of IDs, formatting and storing each record.

    Every record is formatted with format_record(), compressed with zlib
    and written to the bibfmt table (UPDATE when a row already exists for
    the record and format, INSERT otherwise).  Progress is reported every
    100 records.

    Returns the tuple (number of IDs, time spent formatting and storing,
    0)."""
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call (never updated here)
    # Overwritten with the current time for every record below
    start_date = task_get_task_param('task_starting_time')

    tot = len(list)
    count = 0
    for count, recID in enumerate(list, 1):
        began = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')  # Time at which the record was formatted
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        already_stored = run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt))
        if already_stored:
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        ended = os.times()[4]
        tbibformat += (ended - began)
        if not count % 100:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    # Final report, unless the last iteration already issued it
    if tot % 100:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def find_records(collection, subfields):
    """
    Find records with VOLATILE content.

    The tags are processed in sorted order; for each of their values the
    records matching exactly (m='e') and belonging to the collection are
    collected.

    @param collection: collection to be checked
    @type collection: string
    @param subfields: VOLATILE content in tagiic
    @type subfields: dict
    @return: dict {recid: array of tagiic}
    """
    recs_collection = get_collection_reclist(collection)
    recs_to_change = {}
    # sorted() and setdefault() replace keys()/sort() and the deprecated
    # dict.has_key()
    for tagiic in sorted(subfields):
        for value in subfields[tagiic]:
            result = search_pattern(p=value, f=tagiic, m='e') & recs_collection
            if result:
                write_message('Update %i records with %s:"%s" -- %s' \
                              % (len(result), tagiic, value, list(result)))
            for recid in result:
                recs_to_change.setdefault(recid, []).append(tagiic)
    return recs_to_change
def get_ancestors(self):
    """Returns list of ancestors of the current collection.

    The collection_collection table is followed from son to dad starting
    at this collection.  The list is returned with the most distant
    ancestor first and the direct parent last.

    Raises OverflowError if an ancestor is met twice or if the collection
    turns out to be its own ancestor.
    """
    ancestors = []
    ancestors_ids = intbitset()
    id_son = self.id
    while 1:
        query = (
            "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "
            "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son)
        )
        res = run_sql(query, None, 1)
        if not res:
            break
        col_ancestor = get_collection(res[0][1])
        # looking for loops: test the ancestor just found, so that a cycle
        # which does not include this collection is detected as well
        # instead of being followed forever
        if col_ancestor.id == self.id or col_ancestor.id in ancestors_ids:
            write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
            raise OverflowError("Loop found in collection %s" % self.name)
        ancestors.append(col_ancestor)
        ancestors_ids.add(col_ancestor.id)
        id_son = res[0][0]
    ancestors.reverse()
    return ancestors
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    Each record goes through format_record_1st_pass() and the result is
    stored with save_preformatted_record().  Progress is reported after
    every 100 records.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call (never updated here)

    tot = len(recIDs)
    # Count from 1 so that the number reported is the number of records
    # actually formatted so far
    for count, recID in enumerate(recIDs, 1):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(recID=recID,
                                                                  of=fmt,
                                                                  on_the_fly=True,
                                                                  save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    # Final report, unless the last iteration already issued it
    if tot % 100 != 0:
        write_message(" ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
def build_bai_knowledge():
    """Build one entry per person from the aidPERSONIDDATA table.

    Each entry is a dict holding 'personid' plus the upper-cased value of
    every identifier found for that person, under the keys 'BAI',
    'INSPIRE', 'ORCID', 'KAKEN' and 'UID'; the canonical name is also
    kept unchanged under 'ORIGINAL_BAI'.  When a person has the same kind
    of identifier more than once, an error is logged and only the first
    value is kept.

    Returns the values of the per-person dictionary.
    """
    # Database tag -> key used in the returned entries
    tag_labels = {
        'canonical_name': 'BAI',
        'extid:INSPIREID': 'INSPIRE',
        'extid:ORCID': 'ORCID',
        'extid:KAKEN': 'KAKEN',
        'uid': 'UID',
    }
    ret = {}
    rows = run_sql("SELECT personid, tag, data FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%' OR tag = 'canonical_name' or tag = 'uid'")
    for personid, tag, data in rows:
        label = tag_labels.get(tag)
        if label is None:
            # Some other kind of external id
            continue
        data = data.strip()
        entry = ret.setdefault(personid, {'personid': personid})
        if label in entry:
            write_message("ERROR: http://inspirehep.net/author/profile/{personid} has invalid IDs".format(personid=personid), stream=sys.stderr)
            continue
        entry[label] = data.upper()
        if label == 'BAI':
            entry['ORIGINAL_BAI'] = data
    return ret.values()
def filter_out_broken_ids(hepname_kb, bai_kb):
    """Drop from both knowledge bases the entries sharing a broken value.

    The broken entries are the second element returned by project_ids()
    for each of the 'BAI', 'INSPIRE', 'ORCID' and 'KAKEN' id types, on
    both knowledge bases.  An entry is dropped as soon as one of its
    (key, value) pairs also appears in a broken entry.

    Returns the tuple (filtered hepname_kb, filtered bai_kb).
    """
    broken = []
    for id_type in ('BAI', 'INSPIRE', 'ORCID', 'KAKEN'):
        for knowledge_base in (hepname_kb, bai_kb):
            broken.extend(project_ids(knowledge_base, id_type)[1])
    write_message("Broken entries: {len}".format(len=len(broken)), stream=sys.stderr)

    # key -> set of values seen in broken entries
    broken_ids = {}
    for elem in broken:
        for key, value in elem.iteritems():
            broken_ids.setdefault(key, set()).add(value)

    def _is_clean(elem):
        """Tell whether no (key, value) of ELEM is known as broken."""
        for key, value in elem.iteritems():
            if value in broken_ids.get(key, set()):
                return False
        return True

    new_hepname_kb = [elem for elem in hepname_kb if _is_clean(elem)]
    new_bai_kb = [elem for elem in bai_kb if _is_clean(elem)]
    return new_hepname_kb, new_bai_kb
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Commit to Solr when the counter is full or on a final commit.

    A commit is issued when NEXT_COMMIT_COUNTER equals the "flush" task
    option minus one, or when FINAL_COMMIT is true and the counter is
    greater than 0.  A failing commit is registered with
    register_exception() and does not interrupt the task.

    @param next_commit_counter: number of additions since the last commit
    @param final_commit: whether this is the last call of the run
    @param recid: optional record id mentioned in the status message
    @return: the new counter: 0 after a commit attempt, otherwise the
        given counter plus one
    """
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except Exception:
            # Not a bare except: SystemExit and KeyboardInterrupt must
            # still be able to stop the task
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
def convert_files(xml_files, els, prefix="", threshold_date=None):
    """Convert the list of publisher XML to MARCXML using given instance.

    Files whose publication date is before THRESHOLD_DATE (when given)
    are skipped.  For the others the returned dict maps the full path of
    the file to a tuple: (StatusCodes.OK, path of the written upload.xml),
    (StatusCodes.CONVERSION_ERROR, traceback) or
    (StatusCodes.DOCTYPE_WRONG, doctype).
    """
    results = {}

    for xml_file in xml_files:
        task_sleep_now_if_required()
        full_xml_filepath = join(prefix, xml_file)
        dom_xml = parse(full_xml_filepath)
        date = els.get_publication_information(dom_xml)[-2]
        if threshold_date and date < threshold_date:
            continue

        doctype = els.get_doctype(dom_xml).lower()
        if doctype not in INTERESTING_DOCTYPES:
            results[full_xml_filepath] = (StatusCodes.DOCTYPE_WRONG,
                                          doctype)
            write_message("Doctype not interesting: {0}".format(doctype))
            continue

        # The MARCXML is written next to the source file
        new_full_xml_filepath = join(dirname(full_xml_filepath),
                                     "upload.xml")
        try:
            converted_xml = els.get_record(
                full_xml_filepath, refextract_callback=refextract)
        except Exception as e:
            _errors_detected.append(e)
            error_trace = traceback.format_exc()
            # Some error happened, lets gracefully quit
            results[full_xml_filepath] = (StatusCodes.CONVERSION_ERROR,
                                          error_trace)
            write_message('Error converting: \n {0}'.format(error_trace))
            continue

        with open(new_full_xml_filepath, "w") as marcfile:
            marcfile.write(converted_xml)
        results[full_xml_filepath] = (StatusCodes.OK,
                                      new_full_xml_filepath)
    return results
def record_collect_oai_identifiers(record_xml):
    """
    Collects all OAI identifiers from given MARCXML.

    The values are read from the field described by
    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG (tag, two indicators and subfield
    code).

    @param record_xml: string containing MARCXML to parse

    @return: the values found (a message is logged when there are none),
        or None if the record could not be created
    """
    (record, status_code, list_of_errors) = create_record(record_xml)
    if not status_code:
        # Error happened
        write_message("Error collecting OAI identifier from record: %s" %
                      ("\n".join(list_of_errors),))
        return None

    # All OK! We can get the IDs
    oai_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    result = record_get_field_values(record,
                                     oai_tag[:3],
                                     oai_tag[3],
                                     oai_tag[4],
                                     oai_tag[5])
    if not result:
        # No IDs found..
        write_message("No OAI IDs found in record")
    return result
def solr_add_ranges(id_ranges):
    """Index the given record id ranges, split in sub-ranges.

    Every (lower, upper) range is cut in sub-ranges of at most "flush"
    (task option) ids.  The sub-ranges are handed to solr_add_range() in
    reverse order, and solr_commit_if_necessary() is called at the end
    with final_commit=True.
    """
    chunk_size = task_get_option("flush")
    chunks = []
    for id_range in id_ranges:
        start, last = id_range[0], id_range[1]
        while start <= last:
            chunks.append((start, min(start + chunk_size - 1, last)))
            start += chunk_size

    tags_to_index = get_tags()

    next_commit_counter = 0
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    for lower_recid, upper_recid in reversed(chunks):
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
def replace_cites(recid, new_cites):
    """
    Given a set of citations, replaces the citations of given recid
    in the database.

    Citers present in NEW_CITES but not in rnkCITATIONDICT are inserted,
    those present only in rnkCITATIONDICT are deleted.
    The changes are logged into rnkCITATIONLOG.

    See @replace_refs
    """
    def _timestamp():
        """Current local time as stored in the tables."""
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    def _log(citer, action, when):
        """Record the change in rnkCITATIONLOG."""
        run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
                   VALUES (%s, %s, %s, %s)""", (recid, citer, action, when))

    rows = run_sql("""SELECT citer FROM rnkCITATIONDICT
                      WHERE citee = %s""", [recid])
    old_cites = set(row[0] for row in rows)

    for cite in new_cites - old_cites:
        write_message('adding cite %s %s' % (recid, cite), verbose=1)
        now = _timestamp()
        run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated)
                   VALUES (%s, %s, %s)""", (recid, cite, now))
        _log(cite, 'added', now)

    for cite in old_cites - new_cites:
        write_message('deleting cite %s %s' % (recid, cite), verbose=1)
        now = _timestamp()
        run_sql("""DELETE FROM rnkCITATIONDICT
                   WHERE citee = %s and citer = %s""", (recid, cite))
        _log(cite, 'removed', now)
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    A new sitemap is started when, with the number of urls being a
    multiple of 100, the current writer has reached MAX_SIZE or the
    number of urls has reached MAX_RECORDS.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap files(s)
    """
    def _start_sitemap(number):
        """Create sitemap NUMBER and reference it in the index."""
        new_writer = SitemapWriter(number, output_directory, sitemap_name)
        sitemap_index_writer.add_url(new_writer.get_sitemap_url())
        return new_writer

    sitemap_id = 1
    writer = _start_sitemap(sitemap_id)
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        sitemap_is_full = writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS \
            if nb_urls % 100 == 0 else False
        if sitemap_is_full:
            sitemap_id += 1
            writer = _start_sitemap(sitemap_id)
        record_url = CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid)
        nb_urls = writer.add_url(record_url,
                                 lastmod = lastmod,
                                 changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                 priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
def align_entries(hepname_kb, bai_kb):
    """Compute the updates needed to align HepNames and BAI entries.

    For each id type ('ORCID', 'BAI', 'INSPIRE', 'KAKEN') both knowledge
    bases are projected with project_ids(); entries sharing the same id
    value are merged.  If the two entries disagree (case-insensitively)
    on a common key, an error is logged and the pair is skipped.
    Otherwise, when the merged entry has a different set of id types than
    one of the original entries, it is recorded as an update for that
    side; conflicting updates for the same recid/personid are logged and
    the first one is kept.

    @return: tuple (hepname_updates, bai_updates), the first keyed by
        recid and the second by personid, both mapping to merged entries
    """
    id_types = ('ORCID', 'BAI', 'INSPIRE', 'KAKEN')
    id_type_set = set(id_types)
    hepname_updates = {}
    bai_updates = {}
    for id_type in id_types:
        projected_hepnames = project_ids(hepname_kb, id_type)[0]
        projected_bais = project_ids(bai_kb, id_type)[0]
        for id_value in set(projected_hepnames.iterkeys()) & set(projected_bais.iterkeys()):
            hepname_entry = projected_hepnames[id_value]
            bai_entry = projected_bais[id_value]
            merged_entry = dict(hepname_entry.items())
            for key, value in bai_entry.iteritems():
                if key in merged_entry and merged_entry[key].upper() != value.upper():
                    write_message("ERROR: conflicting entries {entry1} Vs. {entry2}".format(entry1=format_entry(hepname_entry), entry2=format_entry(bai_entry)))
                    break
            else:
                # No conflicting key: the two entries can be merged
                merged_entry.update(bai_entry)
                if (set(merged_entry.keys()) ^ set(hepname_entry.keys())) & id_type_set:
                    write_message("INFO: {hepname} should be updated to {merged_entry}".format(hepname=format_entry(hepname_entry), merged_entry=format_entry(merged_entry)))
                    recid = merged_entry['recid']
                    if recid in hepname_updates:
                        if hepname_updates[recid] != merged_entry:
                            write_message("ERROR: conflict for recid {recid}: {entry1} Vs. {entry2}".format(recid=recid, entry1=format_entry(hepname_updates[recid]), entry2=format_entry(merged_entry)), stream=sys.stderr)
                    else:
                        hepname_updates[recid] = merged_entry
                if (set(merged_entry.keys()) ^ set(bai_entry.keys())) & id_type_set:
                    write_message("INFO: {bai} should be updated to {merged_entry}".format(bai=format_entry(bai_entry), merged_entry=format_entry(merged_entry)))
                    personid = merged_entry['personid']
                    if personid in bai_updates:
                        if bai_updates[personid] != merged_entry:
                            # bai_updates is keyed by personid (it used to
                            # be indexed with recid here)
                            write_message("ERROR: conflict for personid {personid}: {entry1} Vs. {entry2}".format(personid=personid, entry1=format_entry(bai_updates[personid]), entry2=format_entry(merged_entry)), stream=sys.stderr)
                    else:
                        bai_updates[personid] = merged_entry
    return hepname_updates, bai_updates
def task_run_core():
    """
    Main daemon task.

    Loads the plugins and rules, runs the batch rules and then the
    single-record rules over every concerned record, and submits tickets
    and uploads for the records that need them.

    Returns True; no code path of this function returns False, failures
    can only surface as exceptions raised by the functions it calls.
    """
    # Rules listed in the "reset_rules" option forget their last run
    rules_to_reset = task_get_option("reset_rules")
    if rules_to_reset:
        write_message("Resetting the following rules: %s" % rules_to_reset)
        for rule in rules_to_reset:
            reset_rule_last_run(rule)
    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    # The last-run dates are only stored when none of these options is set
    update_database = not (task_has_option('record_ids') or
                           task_get_option('no_upload', False) or
                           task_get_option('no_tickets', False))

    if update_database:
        # Computed before the checks run, stored once they are done
        next_starting_dates = {}
        for rule_name, rule in rules.iteritems():
            next_starting_dates[rule_name] = get_next_starting_date(rule)

    # Union of all the recids and split of the rules in batch / single
    # ones, according to the "batch" entry of their check plugin
    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    records_to_submit_tickets = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        # First the batch rules: each gets the records of the batch that
        # concern it in a single call
        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if len(records):
                check_records(rule, records)

        # Then run them through normal rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            # Queue the record according to the outcome of the checks
            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                records_to_submit_tickets.append(record)

        # Flush the queues that reached CFG_BATCH_SIZE
        if len(records_to_submit_tickets) >= CFG_BATCH_SIZE:
            Tickets(records_to_submit_tickets).submit()
            records_to_submit_tickets = []
        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_submit_tickets:
        Tickets(records_to_submit_tickets).submit()
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time each rule was ran
    if update_database:
        for rule_name, rule in rules.iteritems():
            update_rule_last_run(rule_name, next_starting_dates[rule_name])

    return True
def _get_rank_function(function_name):
    """Return the module-level callable called FUNCTION_NAME.

    Raises StandardError with an explicit message when no such name is
    defined in this module.
    """
    func_object = globals().get(function_name)
    if func_object is None:
        raise StandardError("Rank function %s is not defined" % function_name)
    return func_object


def bibrank_engine(run):
    """Run the requested command for the rank method RUN.

    Reads CFG_ETCDIR/bibrank/<run>.cfg, stores the valid record set and the
    record id range to work on in the module-level ``options`` dictionary,
    and dispatches on the task option "cmd" (del, add, stat, check,
    print-missing, repair).

    @param run: the rank method code
    @raise StandardError: when the configuration file cannot be read, the
        command is unknown, the rank function is not defined, or a
        StandardError is raised while running the command. The original
        error and traceback are logged before re-raising.

    NOTE(review): this function has no return statement, so it returns None
    on success; callers expecting "1 on success" should be checked.
    """
    try:
        options["run"] = [run]
        for rank_method_code in options["run"]:
            task_sleep_now_if_required(can_stop_too=True)
            cfg_name = getName(rank_method_code)
            write_message("Running rank method: %s." % cfg_name)

            cfg_path = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
            config = ConfigParser.ConfigParser()
            try:
                cfg_file = open(cfg_path)
                try:
                    config.readfp(cfg_file)
                finally:
                    # Always release the handle, even if parsing fails.
                    cfg_file.close()
            except StandardError:
                write_message("Cannot find configurationfile: %s" % cfg_path,
                              sys.stderr)
                raise StandardError(
                    "Cannot find configurationfile: %s" % cfg_path)

            cfg_short = rank_method_code
            function_name = config.get("rank_method", "function")
            cfg_function = function_name + "_exec"
            cfg_repair_function = function_name + "_repair_exec"
            options["validset"] = get_valid_range(rank_method_code)

            # Work out which records the command applies to.
            if task_get_option("collection"):
                l_of_colls = task_get_option("collection").split(", ")
                recIDs = perform_request_search(c=l_of_colls)
                options["recid_range"] = [[recID, recID] for recID in recIDs]
            elif task_get_option("id"):
                options["recid_range"] = task_get_option("id")
            elif task_get_option("modified"):
                options["recid_range"] = add_recIDs_by_date(
                    rank_method_code, task_get_option("modified"))
            elif task_get_option("last_updated"):
                options["recid_range"] = add_recIDs_by_date(rank_method_code)
            else:
                write_message("No records specified, updating all", verbose=2)
                min_id = run_sql("SELECT min(id) from bibrec")[0][0]
                max_id = run_sql("SELECT max(id) from bibrec")[0][0]
                options["recid_range"] = [[min_id, max_id]]

            if task_get_option("quick") == "no":
                write_message(
                    "Recalculate parameter not used, parameter ignored.",
                    verbose=9)

            command = task_get_option("cmd")
            if command == "del":
                del_recids(cfg_short, options["recid_range"])
            elif command in ("add", "print-missing"):
                # Both commands call the method's "<function>_exec".
                func_object = _get_rank_function(cfg_function)
                func_object(rank_method_code, cfg_name, config)
            elif command == "stat":
                rank_method_code_statistics(rank_method_code)
            elif command == "check":
                check_method(rank_method_code)
            elif command == "repair":
                func_object = _get_rank_function(cfg_repair_function)
                func_object()
            else:
                write_message(
                    "Invalid command found processing %s" % rank_method_code,
                    sys.stderr)
                raise StandardError(
                    "Invalid command found processing %s" % rank_method_code)
    except StandardError:
        # sys.exc_info() keeps this handler valid on every Python version
        # the file may run on.
        write_message("\nException caught: %s" % sys.exc_info()[1],
                      sys.stderr)
        write_message(traceback.format_exc()[:-1])
        register_exception()
        raise StandardError
def single_tag_rank_method_repair_exec():
    """No-op repair hook for the single tag ranking method.

    Only logs that no repair procedure exists for this method.
    """
    notice = "Repairing for this ranking method is not defined. Skipping."
    write_message(notice)
def file_similarity_by_times_downloaded_repair_exec():
    """No-op repair hook for the file-similarity-by-downloads method.

    Only logs that no repair procedure exists for this method.
    """
    notice = "Repairing for this ranking method is not defined. Skipping."
    write_message(notice)
def download_weight_total_repair_exec():
    """No-op repair hook for the download weight total ranking method.

    Only logs that no repair procedure exists for this method.
    """
    notice = "Repairing for this ranking method is not defined. Skipping."
    write_message(notice)
def download_weight_filtering_user_repair_exec():
    """No-op repair hook for the download weight filtering user method.

    Only logs that no repair procedure exists for this method.
    """
    notice = "Repairing for this ranking method is not defined. Skipping."
    write_message(notice)
def showtime(timeused):
    """Log how many seconds a method took (only at verbosity 9).

    @param timeused: elapsed time in seconds; formatted with %d.
    """
    elapsed_message = "Time used: %d second(s)." % timeused
    write_message(elapsed_message, verbose=9)
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given.

    The knowledge base file (option "kb_src") holds one "value---weight"
    entry per line; lines starting with "#" are comments. For every record
    in options["recid_range"] that carries the configured tag (and all the
    "check_mandatory_tags", when given) and belongs to
    options["validset"], the weight of its tag value is looked up.

    @param config: ConfigParser object of the rank method; the section
        named by [rank_method] function must provide kb_src, tag and
        check_mandatory_tags.
    @return: dictionary {recid: weight}. A record holding several known
        values gets the highest weight; a value missing from the knowledge
        base sets the record's weight to 0.
    """
    write_message("Loading knowledgebase file", verbose=9)
    section = config.get("rank_method", "function")
    kb_src = config.get(section, "kb_src")
    write_message("Reading knowledgebase file: %s" % kb_src)

    kb_data = {}
    kb_file = open(kb_src, 'r')
    try:
        for line in kb_file:
            if line[0:1] == "#":
                continue
            fields = line.strip().split("---")
            if len(fields) < 2:
                # Blank or malformed line: nothing to map, skip it instead
                # of failing with an IndexError.
                continue
            kb_data[fields[0].strip()] = fields[1]
    finally:
        kb_file.close()
    write_message("Number of lines read from knowledgebase file: %s" %
                  len(kb_data))

    tag = config.get(section, "tag")
    tags = config.get(section, "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = []

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql(
            "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s"
            % (tag[0:2], tag[0:2]), (tag, recids, recide))
        # Start from "every record" and narrow down per mandatory tag.
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            # Each tag lives in the bibXXx table named after its own first
            # two digits, so the mandatory tag's prefix is used here, not
            # the ranked tag's.
            newset += [
                row[0] for row in run_sql(
                    "SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s"
                    % (key[0:2], key[0:2]), (key, recids, recide))
            ]
            valid.intersection_update(newset)
        if tags:
            recs = [rec for rec in recs if rec[0] in valid]
        records.extend(recs)
        write_message("Number of records found with the necessary tags: %s" %
                      len(records))

    validset = options["validset"]
    rnkset = {}
    for recid, value in records:
        if recid not in validset:
            continue
        if value in kb_data:
            weight = float(kb_data[value])
            # Keep the highest weight seen for the record. The previous
            # test looked the stored float up as a knowledge base key, so
            # it could never succeed.
            if recid not in rnkset or weight > rnkset[recid]:
                rnkset[recid] = weight
        else:
            # NOTE(review): an unknown value resets a previously stored
            # weight to 0; kept as in the original - confirm it is wanted.
            rnkset[recid] = 0

    write_message("Number of records available in rank method: %s" %
                  len(rnkset))
    return rnkset
def guest_user_garbage_collector():
    """Session Garbage Collector.

    Program flow/tasks:
      1:  delete expired sessions
      1b: delete guest users without session
      2:  delete queries not attached to any user
      3:  delete baskets not attached to any user
      4:  delete alerts not attached to any user
      5:  delete expired mailcookies
      5b: delete expired not confirmed email address
      6:  delete expired roles memberships

    Every SQL statement is echoed through write_message at verbosity 9 and
    a summary of the number of deleted rows per table is logged at the end.
    """
    # dictionary used to keep track of number of deleted entries
    delcount = {
        'session': 0,
        'user': 0,
        'user_query': 0,
        'query': 0,
        'bskBASKET': 0,
        'user_bskBASKET': 0,
        'bskREC': 0,
        'bskRECORDCOMMENT': 0,
        'bskEXTREC': 0,
        'bskEXTFMT': 0,
        'user_query_basket': 0,
        'mail_cookie': 0,
        'email_addresses': 0,
        'role_membership': 0,
    }

    write_message("CLEANING OF GUEST SESSIONS STARTED")

    # 1 - DELETE EXPIRED SESSIONS
    write_message("- deleting expired sessions")
    timelimit = convert_datestruct_to_datetext(time.gmtime())
    write_message(" DELETE FROM session WHERE"
                  " session_expiry < %s \n" % (timelimit, ), verbose=9)
    delcount['session'] += run_sql(
        "DELETE FROM session WHERE"
        " session_expiry < %s ", (timelimit, ))

    # 1b - DELETE GUEST USERS WITHOUT SESSION
    write_message("- deleting guest users without session")
    # get uids
    write_message(" SELECT u.id\n"
                  " FROM user AS u LEFT JOIN session AS s\n"
                  " ON u.id = s.uid\n"
                  " WHERE s.uid IS NULL AND u.email = ''", verbose=9)
    result = run_sql("SELECT u.id FROM user AS u LEFT JOIN session AS s"
                     " ON u.id = s.uid"
                     " WHERE s.uid IS NULL AND u.email = ''")
    write_message(result, verbose=9)
    if result:
        # work on slices of result list in case of big result
        for i in range(0, len(result), CFG_MYSQL_ARGUMENTLIST_SIZE):
            # create string of uids
            uidstr = ','.join([
                "%s" % (id_user, )
                for (id_user, ) in result[i:i + CFG_MYSQL_ARGUMENTLIST_SIZE]
            ])
            # delete users
            write_message(
                " DELETE FROM user WHERE"
                " id IN (TRAVERSE LAST RESULT) AND email = '' \n",
                verbose=9)
            delcount['user'] += run_sql("DELETE FROM user WHERE"
                                        " id IN (%s) AND email = ''" %
                                        (uidstr, ))

    # 2 - DELETE QUERIES NOT ATTACHED TO ANY USER
    # first step, delete from user_query
    write_message("- deleting user_queries referencing"
                  " non-existent users")
    # find user_queries referencing non-existent users
    write_message(" SELECT DISTINCT uq.id_user\n"
                  " FROM user_query AS uq LEFT JOIN user AS u\n"
                  " ON uq.id_user = u.id\n WHERE u.id IS NULL", verbose=9)
    result = run_sql("SELECT DISTINCT uq.id_user"
                     " FROM user_query AS uq LEFT JOIN user AS u"
                     " ON uq.id_user = u.id WHERE u.id IS NULL")
    write_message(result, verbose=9)
    # delete in user_query one by one
    write_message(" DELETE FROM user_query WHERE"
                  " id_user = 'TRAVERSE LAST RESULT' \n", verbose=9)
    for (id_user, ) in result:
        # Parameterized like every other DELETE in this function.
        delcount['user_query'] += run_sql(
            "DELETE FROM user_query WHERE id_user = %s", (id_user, ))

    # delete the actual queries
    write_message("- deleting queries not attached to any user")
    # select queries that must be deleted
    write_message(" SELECT DISTINCT q.id\n"
                  " FROM query AS q LEFT JOIN user_query AS uq\n"
                  " ON uq.id_query = q.id\n"
                  " WHERE uq.id_query IS NULL AND\n"
                  " q.type <> 'p' ", verbose=9)
    result = run_sql("SELECT DISTINCT q.id"
                     " FROM query AS q LEFT JOIN user_query AS uq"
                     " ON uq.id_query = q.id"
                     " WHERE uq.id_query IS NULL AND q.type <> 'p'")
    write_message(result, verbose=9)
    # delete queries one by one
    write_message(" DELETE FROM query WHERE id = 'TRAVERSE LAST RESULT' \n",
                  verbose=9)
    for (id_query, ) in result:
        delcount['query'] += run_sql("DELETE FROM query WHERE id = %s",
                                     (id_query, ))

    # 3 - DELETE BASKETS NOT OWNED BY ANY USER
    write_message("- deleting baskets not owned by any user")
    # select basket ids
    write_message(" SELECT ub.id_bskBASKET\n"
                  " FROM user_bskBASKET AS ub LEFT JOIN user AS u\n"
                  " ON u.id = ub.id_user\n"
                  " WHERE u.id IS NULL", verbose=9)
    try:
        result = run_sql("SELECT ub.id_bskBASKET"
                         " FROM user_bskBASKET AS ub LEFT JOIN user AS u"
                         " ON u.id = ub.id_user WHERE u.id IS NULL")
    except Exception:
        # Best effort: a failing basket query must not abort the whole
        # collection, but SystemExit/KeyboardInterrupt are let through.
        result = []
    write_message(result, verbose=9)
    # delete from user_basket and basket one by one
    write_message(
        " DELETE FROM user_bskBASKET WHERE"
        " id_bskBASKET = 'TRAVERSE LAST RESULT' ", verbose=9)
    write_message(
        " DELETE FROM bskBASKET WHERE id = 'TRAVERSE LAST RESULT' ",
        verbose=9)
    write_message(
        " DELETE FROM bskREC WHERE id_bskBASKET = 'TRAVERSE LAST RESULT'",
        verbose=9)
    write_message(
        " DELETE FROM bskRECORDCOMMENT WHERE"
        " id_bskBASKET = 'TRAVERSE LAST RESULT' \n", verbose=9)
    for (id_basket, ) in result:
        delcount['user_bskBASKET'] += run_sql(
            "DELETE FROM user_bskBASKET WHERE id_bskBASKET = %s",
            (id_basket, ))
        delcount['bskBASKET'] += run_sql(
            "DELETE FROM bskBASKET WHERE id = %s", (id_basket, ))
        delcount['bskREC'] += run_sql(
            "DELETE FROM bskREC WHERE id_bskBASKET = %s", (id_basket, ))
        delcount['bskRECORDCOMMENT'] += run_sql(
            "DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET = %s",
            (id_basket, ))

    # external basket records no longer referenced by any basket
    write_message(
        " SELECT DISTINCT ext.id, rec.id_bibrec_or_bskEXTREC"
        " FROM bskEXTREC AS ext \n"
        "LEFT JOIN bskREC AS rec ON ext.id=-rec.id_bibrec_or_bskEXTREC"
        " WHERE id_bibrec_or_bskEXTREC is NULL", verbose=9)
    try:
        result = run_sql(
            "SELECT DISTINCT ext.id FROM bskEXTREC AS ext"
            " LEFT JOIN bskREC AS rec"
            " ON ext.id=-rec.id_bibrec_or_bskEXTREC"
            " WHERE id_bibrec_or_bskEXTREC is NULL")
    except Exception:
        result = []
    write_message(result, verbose=9)
    write_message(
        " DELETE FROM bskEXTREC WHERE id = 'TRAVERSE LAST RESULT' ",
        verbose=9)
    write_message(
        " DELETE FROM bskEXTFMT WHERE"
        " id_bskEXTREC = 'TRAVERSE LAST RESULT' \n", verbose=9)
    for (id_extrec, ) in result:
        delcount['bskEXTREC'] += run_sql(
            "DELETE FROM bskEXTREC WHERE id=%s", (id_extrec, ))
        delcount['bskEXTFMT'] += run_sql(
            "DELETE FROM bskEXTFMT WHERE id_bskEXTREC=%s", (id_extrec, ))

    # 4 - DELETE ALERTS NOT OWNED BY ANY USER
    write_message('- deleting alerts not owned by any user')
    # select user ids in uqb that reference non-existent users
    orphan_alerts_query = (
        "SELECT DISTINCT uqb.id_user FROM user_query_basket AS uqb"
        " LEFT JOIN user AS u ON uqb.id_user = u.id WHERE u.id IS NULL")
    write_message(orphan_alerts_query, verbose=9)
    result = run_sql(orphan_alerts_query)
    write_message(result, verbose=9)
    # delete all these entries
    for (id_user, ) in result:
        write_message(
            "DELETE FROM user_query_basket WHERE"
            " id_user = 'TRAVERSE LAST RESULT' ", verbose=9)
        delcount['user_query_basket'] += run_sql(
            "DELETE FROM user_query_basket WHERE id_user = %s ",
            (id_user, ))

    # 5 - delete expired mailcookies
    write_message("mail_cookie_gc()", verbose=9)
    delcount['mail_cookie'] = mail_cookie_gc()

    ## 5b - delete expired not confirmed email address
    write_message(
        "DELETE FROM user WHERE note='2' AND"
        " NOW()>ADDTIME(last_login, '%s 0:0:0')" %
        CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS,
        verbose=9)
    delcount['email_addresses'] = run_sql(
        "DELETE FROM user WHERE note='2' AND"
        " NOW()>ADDTIME(last_login, '%s 0:0:0')",
        (CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS, ))

    # 6 - delete expired roles memberships
    write_message("DELETE FROM user_accROLE WHERE expiration<NOW()",
                  verbose=9)
    delcount['role_membership'] = run_sql(
        "DELETE FROM user_accROLE WHERE expiration<NOW()")

    # print STATISTICS
    write_message("- statistics about deleted data: ")
    summary = (
        ('session', 'sessions'),
        ('user', 'users'),
        ('user_query', 'user_queries'),
        ('query', 'queries'),
        ('bskBASKET', 'baskets'),
        ('user_bskBASKET', 'user_baskets'),
        ('bskREC', 'basket_records'),
        ('bskEXTREC', 'basket_external_records'),
        ('bskEXTFMT', 'basket_external_formats'),
        ('bskRECORDCOMMENT', 'basket_comments'),
        ('user_query_basket', 'user_query_baskets'),
        ('mail_cookie', 'mail_cookies'),
        ('email_addresses', 'non confirmed email addresses'),
        ('role_membership', 'role_memberships'),
    )
    for key, label in summary:
        write_message(" %7s %s." % (delcount[key], label))
    write_message("CLEANING OF GUEST SESSIONS FINISHED")
def rank_method_code_statistics(rank_method_code):
    """Print statistics about the rank method RANK_METHOD_CODE.

    Logs the method name, last update, number of ranked records, the
    lowest strictly positive value and the highest value (each with the
    number of records holding it), followed by the number of records and
    of distinct values in each of 10 equally wide value ranges.

    @param rank_method_code: the short name of the rank method
    """
    method = fromDB(rank_method_code)
    values = list(method.values())

    # Plain numbers (or None when there is nothing to report) instead of
    # tuple sentinels: comparing a number with a tuple never updated the
    # maximum and later broke "max + 1".
    lowest = None
    highest = None
    positive_values = [value for value in values if value > 0]
    if positive_values:
        lowest = min(positive_values)
    if values:
        highest = max(values)

    mincount = 0
    maxcount = 0
    for value in values:
        if lowest is not None and value == lowest:
            mincount += 1
        if highest is not None and value == highest:
            maxcount += 1

    write_message("Showing statistic for selected method")
    write_message("Method name: %s" % getName(rank_method_code))
    write_message("Short name: %s" % rank_method_code)
    write_message("Last run: %s" % get_lastupdated(rank_method_code))
    write_message("Number of records: %s" % len(method))
    write_message("Lowest value: %s - Number of records: %s" %
                  (lowest, mincount))
    write_message("Highest value: %s - Number of records: %s" %
                  (highest, maxcount))

    write_message("Divided into 10 sets:")
    if highest is None:
        # No ranked records: there is no value range to split.
        return
    step = float(highest + 1) / 10
    for i in range(1, 11):
        lower = -1.0 + step * (i - 1)
        upper = -1.0 + step * i
        setcount = 0
        distinct_values = {}
        for value in values:
            if value >= lower and value <= upper:
                setcount += 1
                distinct_values[value] = 1
        # Record count first, then the distinct-value count announced by
        # the label (the two were swapped).
        write_message("Set %s (%s-%s) %s Distinct values: %s" %
                      (i, lower, upper, setcount, len(distinct_values)))
filenames = os.listdir(rss_cache_dir) except OSError: filenames = [] count = 0 for filename in filenames: filename = os.path.join(rss_cache_dir, filename) last_update_time = datetime.datetime.fromtimestamp( os.stat(os.path.abspath(filename)).st_mtime) if not (datetime.datetime.now() < last_update_time + datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL)): try: os.remove(filename) count += 1 except OSError, e: write_message("Error: %s" % e) write_message("""%s rss cache file pruned out of %s.""" % (count, len(filenames))) write_message("""CLEANING OF OLD CACHED RSS REQUEST FINISHED""") write_message("""CLEANING OF OLD CACHED WEBJOURNAL FILES STARTED""") webjournal_cache_dir = "%s/webjournal/" % CFG_CACHEDIR filenames = [] try: for root, dummy, files in os.walk(webjournal_cache_dir): filenames.extend( os.path.join(root, filename) for filename in files) except OSError: pass count = 0 for filename in filenames: filename = os.path.join(webjournal_cache_dir, filename) last_update_time = datetime.datetime.fromtimestamp(
def gc_exec_command(command):
    """Exec the command logging in appropriate way its output.

    The command line is logged at verbosity 9, then everything the command
    wrote to stderr is logged, followed by everything it wrote to stdout.

    @param command: the command to run; a string is run through the shell,
        a list/tuple is executed directly (as os.popen3 used to do).
    """
    # Local import: only this function needs the module.
    import subprocess

    write_message(' %s' % command, verbose=9)
    process = subprocess.Popen(command,
                               shell=not isinstance(command, (list, tuple)),
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True,
                               universal_newlines=True)
    # communicate() drains both pipes concurrently, closes them and reaps
    # the child; reading stderr to the end before stdout could block
    # forever once the command filled the stdout pipe.
    output, errors = process.communicate()
    write_messages(errors)
    write_messages(output)
def perform_fulltext_harvest(record_list,
                             add_metadata,
                             attach_fulltext,
                             hidden_fulltext,
                             out_folder,
                             threshold_date=None,
                             journal_mappings=None):
    """
    For every record in given list APSRecord(record ID, DOI, date
    last updated), download the fulltext/metadata zip package from APS
    into OUT_FOLDER as "<doi with '/' replaced by '_'>.zip".

    A package that already exists locally is not downloaded again when
    compare_datetime_to_iso8601_date() on the local file date and the
    record's last_modified date returns a false value.

    If a record has no DOI or a download is unsuccessful, an error message
    is logged and yielded.

    @param record_list: the APSRecord objects to harvest
    @param out_folder: directory the zip packages are saved to

    The parameters add_metadata, attach_fulltext, hidden_fulltext,
    threshold_date and journal_mappings are not used by the code of this
    function.

    @return: generator of tuples (APSRecord, error_message); the code of
        this function only yields for records that failed.

    NOTE(review): request_end is never assigned in this function, so the
    throttling sleep below cannot trigger as written - confirm.
    """
    count = 0
    request_end = None
    request_start = None
    for record in record_list:
        task_sleep_now_if_required(can_stop_too=False)
        # Unless this is the first request, lets sleep a bit
        if request_end and request_start:
            request_dt = request_end - request_start
            write_message("Checking request time (%d)" % (request_dt, ),
                          verbose=3)
            if count and 0 < request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                write_message("Initiating sleep for %.1f seconds" %
                              (request_dt, ), verbose=3)
                time.sleep(request_dt)

        count += 1
        task_update_progress("Harvesting record (%d/%d)" %
                             (count, len(record_list)))

        if not record.doi:
            # %s, not %d: the fallback for a missing recid is an empty
            # string, which %d cannot format.
            msg = "No DOI found for record %s" % (record.recid or "", )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
        result_file = os.path.join(out_folder, "%s.zip" %
                                   (record.doi.replace('/', '_')))
        try:
            request_start = time.time()
            if os.path.exists(result_file):
                # File already downloaded recently, lets see if it is the same
                file_last_modified = get_file_modified_date(result_file)
                if not compare_datetime_to_iso8601_date(
                        file_last_modified, record.last_modified):
                    # File is not older than APS version, we should not download.
                    raise APSHarvesterFileExits

            write_message("Trying to save to %s" % (result_file, ),
                          verbose=5)
            result_file = download_url(url=url,
                                       download_to_file=result_file,
                                       content_type="zip",
                                       retry_count=5,
                                       timeout=60.0)
            write_message("Downloaded %s to %s" % (url, result_file),
                          verbose=2)
        except InvenioFileDownloadError:
            msg = "URL could not be opened: %s" % (url, )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue
        except APSHarvesterFileExits:
            write_message("File exists at %s" % (result_file, ), verbose=2)
def clean_tempfiles():
    """ Clean old temporary files.

    Runs a series of "find ... -exec rm/gzip" shell commands through
    gc_exec_command() over CFG_TMPDIR and CFG_TMPSHAREDDIR (plus the
    CKEditor and BibEdit cache directories). The age limits come from the
    CFG_MAX_ATIME_* settings. When the task option "verbose" is greater
    than 1, "-v" is passed to rm/gzip so that their output is logged.
    """
    write_message("CLEANING OF TMP FILES STARTED")

    vstr = ''
    if task_get_option('verbose') > 1:
        vstr = '-v'
    tmp_dirs = (CFG_TMPDIR, CFG_TMPSHAREDDIR)

    write_message("- deleting/gzipping temporary empty/old "
                  "BibReformat xml files")
    gc_exec_command('find %s %s -name "rec_fmt_*"'
                    ' -size 0c -exec rm %s -f {} \\;'
                    % (tmp_dirs + (vstr, )))
    gc_exec_command('find %s %s -name "rec_fmt_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_FMT, vstr)))
    gc_exec_command('find %s %s -name "rec_fmt_*"'
                    ' -atime +%s -exec gzip %s -9 {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_ZIP_FMT, vstr)))

    write_message("- deleting/gzipping temporary old "
                  "OAIHarvest xml files")
    gc_exec_command('find %s %s -name "oaiharvestadmin.*"'
                    ' -exec rm %s -f {} \\;'
                    % (tmp_dirs + (vstr, )))
    gc_exec_command('find %s %s -name "bibconvertrun.*"'
                    ' -exec rm %s -f {} \\;'
                    % (tmp_dirs + (vstr, )))
    # Using mtime and -r here to include directories.
    gc_exec_command('find %s %s -name "oaiharvest*"'
                    ' -mtime +%s -exec gzip %s -9 {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_ZIP_OAI, vstr)))
    gc_exec_command('find %s %s -name "oaiharvest*"'
                    ' -mtime +%s -exec rm %s -rf {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_OAI, vstr)))
    gc_exec_command('find %s %s -name "oai_archive*"'
                    ' -mtime +%s -exec rm %s -rf {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_OAI, vstr)))

    write_message("- deleting/gzipping temporary old "
                  "BibSword files")
    gc_exec_command('find %s %s -name "bibsword_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_BIBSWORD, vstr)))
    gc_exec_command('find %s %s -name "bibsword_*"'
                    ' -atime +%s -exec gzip %s -9 {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_ZIP_BIBSWORD, vstr)))

    # DELETE ALL FILES CREATED DURING VIDEO SUBMISSION
    write_message("- deleting old video submissions")
    # The pattern is quoted so that the shell hands it to find untouched
    # instead of expanding it against the current working directory.
    gc_exec_command('find %s -name "%s*" -atime +%s -exec rm %s -f {} \\;'
                    % (CFG_TMPSHAREDDIR, CFG_WEBSUBMIT_TMP_VIDEO_PREFIX,
                       CFG_MAX_ATIME_WEBSUBMIT_TMP_VIDEO, vstr))

    write_message("- deleting temporary old "
                  "RefExtract files")
    gc_exec_command('find %s %s -name "refextract*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_REFEXTRACT, vstr)))

    write_message("- deleting temporary old bibdocfiles")
    gc_exec_command('find %s %s -name "bibdocfile_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_BIBDOC, vstr)))

    write_message("- deleting old temporary WebSubmit icons")
    gc_exec_command('find %s %s -name "websubmit_icon_creator_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_ICON, vstr)))

    write_message("- deleting old temporary WebSubmit stamps")
    gc_exec_command('find %s %s -name "websubmit_file_stamper_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_STAMP, vstr)))

    write_message("- deleting old temporary WebJournal XML files")
    gc_exec_command('find %s %s -name "webjournal_publish_*"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (tmp_dirs + (CFG_MAX_ATIME_RM_WEBJOURNAL_XML, vstr)))

    write_message("- deleting old temporary files attached with CKEditor")
    gc_exec_command('find %s/var/tmp/attachfile/ '
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (CFG_PREFIX, CFG_MAX_ATIME_RM_WEBSUBMIT_CKEDITOR_FILE,
                       vstr))

    bibedit_cache = CFG_TMPSHAREDDIR + '/bibedit-cache/'
    write_message("- deleting old temporary files attached with BibEdit")
    gc_exec_command('find %s -name "bibedit*.tmp"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (bibedit_cache, CFG_MAX_ATIME_BIBEDIT_TMP, vstr))

    write_message("- deleting old XML files submitted via BibEdit")
    gc_exec_command('find %s -name "bibedit*.xml"'
                    ' -atime +%s -exec rm %s -f {} \\;'
                    % (bibedit_cache, CFG_MAX_ATIME_BIBEDIT_XML, vstr))

    write_message("CLEANING OF TMP FILES FINISHED")
def download_one(recid, version):
    """Download given version of the PDF from arxiv

    Only the first arXiv id of the record is handled; a warning is logged
    for every further one. The PDF is fetched into a temporary file in
    CFG_TMPSHAREDDIR and compared (md5) with the record's current arXiv
    PDF, then attached as a new version or as a new file.

    @raise PdfNotAvailable: the download is smaller than 25000 bytes and
        contains the text 'PDF unavailable'
    @raise FoundExistingPdf: the record already holds an identical PDF
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        safe_arxiv_id = arxiv_id.replace('/', '_')
        # Keep a reference to the temporary file object while it is in use.
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % safe_arxiv_id)
        write_message('downloading pdf from %s' % pdf_url)
        path = download_external_url(pdf_url,
                                     temp_file.name,
                                     content_type='pdf')

        # Check if it is not an html not found page
        if os.path.getsize(path) < 25000:
            handle = open(path)
            try:
                for line in handle:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()
            finally:
                handle.close()

        docs = BibRecDocs(recid)
        candidates = docs.list_latest_files(doctype="arXiv")
        if not candidates:
            # Maybe that this is one of those INSPIRE-PUBLIC with
            # still an arXiv file in it
            for name, bibdoc in docs.list_bibdocs_by_names().items():
                if name.startswith('arXiv:'):
                    candidates = bibdoc.list_latest_files()
        pdf_files = [
            entry for entry in candidates
            if entry.get_superformat() == '.pdf'
        ]

        if not pdf_files:
            # Nothing attached yet: add the PDF as a brand new document.
            write_message('adding as new file')
            docs.add_new_file(path,
                              doctype="arXiv",
                              docname="arXiv:%s" % safe_arxiv_id)
            continue

        current = pdf_files[0]
        existing_md5 = calculate_md5(current.fullpath)
        new_md5 = calculate_md5(path.encode('utf-8'))
        if new_md5 == existing_md5:
            write_message('md5 matches existing pdf, skipping')
            raise FoundExistingPdf()

        write_message('md5 differs updating')
        write_message('adding as new version')
        docs.add_new_version(path, docname=current.name)
def create_MARCXML(figures, id_fulltext, code, extracted, write_file=True):
    """
    Function that creates a file MARCXML from the vector of figures

    One <record> is produced per figure. Every file of the figure whose
    path does not end with "context" becomes an FFT datafield; when CODE is
    not the "no pdf" value (2), a BDR datafield linking the figure to the
    fulltext (or to its parent figure, for subfigures) is added after the
    last file. The BDR subfield "m" carries a pickled, base64-encoded
    dictionary with the caption, locations and text references.

    @param figures: the list of all figures
    @param id_fulltext: the id of the fulltext
    @param code: The code for Latex, PDF or both
    @param extracted: where the file will be generated
    @param write_file: it's True when the user wants to write the data into file
    @return: the path to the MARCXML file
    """
    # Values the CODE argument is compared against.
    both_doc = 0
    no_latex = 1
    no_pdf = 2
    # Accumulates the lines of the MARCXML document.
    list = []
    list.append('<?xml version="1.0" encoding="UTF-8"?>')
    list.append('<collection>')
    # Running number used in the temporary ids; subfigures do not consume
    # a number (it is decremented again in the BDR section below).
    figure_number = 1
    # Number of the most recent figure flagged as parent; -1 until one is seen.
    parent_id = -1
    for figure in figures:
        if figure.subfigure != None:
            if 'is subfigure of' in figure.subfigure:
                # Debug output written to stdout.
                print 'ok'
                print figure.identifier
        list.append('<record>')
        for i in range(len(figure.files)):
            text_references = ""
            if not figure.files[i].path.endswith("context"):
                list.append(' <datafield tag="FFT" ind1=" " ind2=" ">')
                list.append(' <subfield code="a">' + figure.files[i].path +
                            '</subfield>')
                list.append(
                    ' <subfield code="r">restricted_pict</subfield>')
                list.append(' <subfield code="n">' + figure.identifier +
                            '</subfield>')
                list.append(' <subfield code="d">' + figure.caption +
                            '</subfield>')
                # Only the first file of a figure carries the temporary
                # identifier/version subfields.
                if i == 0:
                    if (figure.subfigure != None):
                        if 'is subfigure of' in figure.subfigure:
                            list.append(
                                ' <subfield code="i">TMP:SUBFIGURE:' +
                                str(id_fulltext) + ':' +
                                str(figure.subfigure_id) + '</subfield>')
                            list.append(
                                ' <subfield code="v">TMP:SUBFIGURE:' +
                                str(id_fulltext) + ':v' +
                                str(figure.subfigure_id) + '</subfield>')
                    else:
                        list.append(' <subfield code="i">TMP:' +
                                    str(id_fulltext) + ':' +
                                    str(figure_number) + '</subfield>')
                        list.append(' <subfield code="v">TMP:' +
                                    str(id_fulltext) + ':v' +
                                    str(figure_number) + '</subfield>')
                        if figure.is_parent != None:
                            if 'is parent' in figure.is_parent:
                                # Remembered so that later subfigures can
                                # point at this figure.
                                parent_id = figure_number
                list.append(' </datafield>')
            else:
                # A "context" file is not attached; the figure's text
                # references are picked up instead.
                text_references = figure.text_references
            if i < len(figure.files) - 1:
                list.append('\n')
            # if i == len(figure.files) - 1:
            #     list.append(' </record>')
            # if we have the fulltext pdf we add the BDR tag after the FFT tag
            if code != no_pdf and i == len(figure.files) - 1:
                list.append(' <datafield tag="BDR" ind1=" " ind2=" ">')
                # list.append(' <subfield code="i">TMP:OAI:' + str(figure_number) + '</subfield>')
                # list.append(' <subfield code="v">TMP:OAI:' + str(figure_number) + 'v' + '</subfield>')
                # id, v1, id, v
                if (figure.subfigure != None):
                    if 'is subfigure of' in figure.subfigure:
                        list.append(' <subfield code="i">TMP:SUBFIGURE:' +
                                    str(id_fulltext) + ':' +
                                    str(figure.subfigure_id) + '</subfield>')
                        list.append(' <subfield code="v">TMP:SUBFIGURE:' +
                                    str(id_fulltext) + ':v' +
                                    str(figure.subfigure_id) + '</subfield>')
                        list.append(' <subfield code="j">TMP:' +
                                    str(id_fulltext) + ':' + str(parent_id) +
                                    '</subfield>')
                        list.append(' <subfield code="w">TMP:' +
                                    str(id_fulltext) + ':v' + str(parent_id) +
                                    '</subfield>')
                        list.append(
                            ' <subfield code="t">is_subfigure_of</subfield>'
                        )
                        # Cancels the increment done at the end of the
                        # figure loop.
                        figure_number = figure_number - 1
                else:
                    list.append(' <subfield code="i">TMP:' +
                                str(id_fulltext) + ':' + str(figure_number) +
                                '</subfield>')
                    list.append(' <subfield code="v">TMP:' +
                                str(id_fulltext) + ':v' + str(figure_number) +
                                '</subfield>')
                    list.append(' <subfield code="j">TMP:' +
                                str(id_fulltext) + '</subfield>')
                    list.append(' <subfield code="w">TMP:' +
                                str(id_fulltext) + ':v' + '</subfield>')
                    list.append(
                        ' <subfield code="t">is_extracted_from</subfield>'
                    )
                # Extra information about the figure, serialised into the
                # "m" subfield below.
                dict = {}
                dict["figures"] = {}
                dict["figures"]["caption"] = figure.caption
                write_message("adding field figure.caption")
                # get_location(0) is stored under "location" and
                # get_location(1) under "caption_location".
                v = ["location", "caption_location"]
                # NOTE: this loop reuses the name "i" of the enclosing file
                # loop; that loop is on its last iteration here.
                for i, item in enumerate(v):
                    if (figure.get_location(i) != None):
                        dict["figures"][item] = {}
                        dict["figures"][item][
                            "page_num"] = figure.get_location(i).page_num
                        write_message("adding figure.get_location.page_num")
                        if (figure.get_location(i).page_resolution != None):
                            dict["figures"][item]["page_resolution"] = {}
                            dict["figures"][item]["page_resolution"][
                                "width"] = figure.get_location(
                                    i).page_resolution.width
                            write_message(
                                "adding figure.get_location.page_resolution.page_num_width"
                            )
                            dict["figures"][item]["page_resolution"][
                                "height"] = figure.get_location(
                                    i).page_resolution.height
                            write_message(
                                "adding figure.get_location.page_resolution.height"
                            )
                        if (figure.get_location(i).boundary != None):
                            dict["figures"][item]["boundary"] = {}
                            dict["figures"][item]["boundary"][
                                "width"] = figure.get_location(
                                    i).boundary.width
                            write_message(
                                "adding figure.get_location.boundary.width")
                            dict["figures"][item]["boundary"][
                                "height"] = figure.get_location(
                                    i).boundary.height
                            write_message(
                                "adding figure.get_location.boundary.height")
                            dict["figures"][item]["boundary"][
                                "x"] = figure.get_location(i).boundary.x
                            write_message(
                                "adding figure.get_location.boundary.x")
                            dict["figures"][item]["boundary"][
                                "y"] = figure.get_location(i).boundary.y
                            write_message(
                                "adding figure.get_location.boundary.y")
                        # The page scale is only recorded for the figure
                        # location, not for the caption location.
                        if i == 0:
                            dict["figures"][item][
                                "page_scale"] = figure.get_location(
                                    i).page_scale
                            write_message(
                                "adding figure.get_location.page_scale")
                dict["figures"]["text_references"] = text_references
                write_message("adding figure.text_references")
                # Pickle, then base64-encode so the data fits in a subfield.
                d = cPickle.dumps(dict)
                info = base64.encodestring(d)
                list.append(' <subfield code="m">' + info + '</subfield>')
                list.append(' </datafield>')
        figure_number = figure_number + 1
        list.append('</record>')
    list.append('</collection>')
    marc = '\n'.join(list)
    if write_file:
        marc_path = str(extracted) + "/extracted.xml"
        # Opened in append mode: an existing extracted.xml is extended,
        # not overwritten.
        f = codecs.open(marc_path, encoding="utf-8", mode="a")
        #f = open(marc_path, 'a')
        f.write(marc)
        f.close()
    # NOTE(review): marc_path is only assigned when write_file is true -
    # confirm what callers passing write_file=False expect here.
    return marc_path
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date is given, in addition when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system,
           the task only harvests the fulltexts themselves and attaches
           them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file
                The short forms "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file
                The short forms "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the meta-data be attached? "yes" or "no"
                     (anything other than "no" counts as "yes")
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @return: 1 when threshold_date cannot be parsed.
    @raise Exception: when new_mode or update_mode is not a known upload mode.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # The yes/no string flags are turned into booleans below. Flags that are
    # on by default are only switched off by an explicit "no"; flags that are
    # off by default are only switched on by an explicit "yes".

    # We hide fulltext by default
    hidden = hidden.lower() != "no"

    # We attach fulltext by default
    fulltext = fulltext.lower() != "no"

    # We attach meta-data by default
    metadata = metadata.lower() != "no"

    # We do not match records by default
    match = match.lower() == "yes"

    # We do not run in devmode by default; devmode forces full verbosity
    devmode = devmode.lower() == "yes"
    if devmode:
        task_set_task_param('verbose', 9)

    # We do not reportonly by default
    reportonly = reportonly.lower() == "yes"

    if threshold_date:
        # Input from user. Validate date
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError as e:
            # The value being validated here is threshold_date, so name it
            # correctly in the message shown to the user.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
def submit_records(records_filename, records_list, mode, directory,
                   taskid=0, silent=False, devmode=False, subject=None):
    """
    Performs the logic to submit given file (filepath) of records
    either by e-mail or using BibUpload with given mode.

    Taskid is given to indicate if the task submission should wait for any
    previously submitted tasks.

    The submission can also be made "silent" in the sense of not
    updating the modification date of the records.

    @param records_filename: filepath to XML file containing records.
    @type records_filename: string

    @param records_list: list of APSRecord objects for records
    @type records_list: list

    @param mode: which submission mode is it? "email" sends a mail, any
                 other value is handed to BibUpload as the upload mode.
    @type mode: string

    @param directory: where the records file is moved to in "email" mode.
    @type directory: string

    @param taskid: bibsched taskid, wait for task to complete before submission
    @type taskid: int

    @param silent: do not update the modification date of the records
    @type silent: bool

    @param devmode: when true nothing is submitted and None is returned.
    @type devmode: bool

    @param subject: e-mail subject; a timestamped default is used when empty.
    @type subject: string

    @return: returns the given taskid upon submission, or True/False from email.
    """
    if devmode:
        return None

    # The timestamp is needed for the e-mail body as well, so take it
    # unconditionally: previously it was only bound when no subject was
    # given, which made "email" mode fail for caller-supplied subjects.
    now = datetime.datetime.now()
    if not subject:
        subject = "APS harvest results: %s" % (
            now.strftime("%Y-%m-%d %H:%M:%S"), )

    # Check if we should create bibupload or e-mail
    if mode == "email":
        # Lets parse the records and find our IDs.
        # We strip away the first part of the DOI for readability.
        list_of_dois = ['/'.join(record.doi.split('/')[1:])
                        for record in records_list]

        # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
        body = "Harvested new records: %s" % (records_filename, )
        try:
            try:
                shutil.move(records_filename, directory)
                records_filename = os.path.join(
                    directory, os.path.basename(records_filename))
                body = "Harvested new records on %s. They are located here:\n %s" % \
                       (now.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
            except IOError as e:
                # Some IOError
                body = "Error while harvesting records: \nError saving %s - %s" % \
                       (records_filename, str(e))
                raise e
        finally:
            # The mail goes out whether or not the move succeeded; on failure
            # the body carries the error text instead of the new location.
            body = "%s\nRecords harvested (%s total):\n%s\n" % (
                body, str(len(list_of_dois)), "\n".join(list_of_dois))
            res = submit_records_via_mail(subject, body)
            write_message("Sent e-mail to %s with path to %s" %
                          (CFG_APSHARVEST_EMAIL, records_filename))
            # NOTE(review): returning from ``finally`` discards the IOError
            # re-raised above, so callers only see the mail result -- confirm
            # this is the intended contract.
            return res
    else:
        # We submit a BibUpload task and wait for it to finish
        task_update_progress("Waiting for task to finish")

        if taskid != 0:
            write_message("Going to wait for %d to finish" % (taskid, ))

        while not can_launch_bibupload(taskid):
            # Lets wait until the previously launched task exits.
            task_sleep_now_if_required(can_stop_too=False)
            time.sleep(5.0)

        taskid = submit_bibupload_for_records(mode, records_filename, silent)
        write_message("Submitted BibUpload task #%s with mode %s" %
                      (str(taskid), mode))
        return taskid
def _task_submit_check_options():
    """
    Required by bibtask. Validates the combination of command line options.

    Exactly one of the commands update_personid, disambiguate and merge
    must be set, and each command only tolerates certain extra options.

    @return: True when the options are consistent, False otherwise (an
             error message is written before returning False).
    """
    def reject(message):
        # Report an option conflict and signal failure to the caller.
        bibtask.write_message(message, stream=sys.stdout, verbose=0)
        return False

    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")
    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    commands = sum(bool(flag)
                   for flag in (update_personid, disambiguate, merge))

    if commands == 0:
        return reject("ERROR: At least one command should be specified!")

    if commands > 1:
        return reject("ERROR: The options --update-personid, --disambiguate "
                      "and --merge are mutually exclusive.")

    assert commands == 1

    # From here on exactly one of the three commands is active.
    if update_personid:
        if from_scratch:
            return reject("ERROR: The only options which can be specified "
                          "with --update-personid are --record-ids and "
                          "--all-records")

        if record_ids and all_records:
            return reject("ERROR: conflicting options: --record-ids and "
                          "--all-records are mutually exclusive.")

        if record_ids:
            for iden in record_ids:
                if not iden.isdigit():
                    # This message goes through the default stream/verbosity.
                    bibtask.write_message("ERROR: Record_ids expects numbers. "
                                          "Provided: %s." % iden)
                    return False

    elif disambiguate:
        if record_ids or all_records:
            return reject("ERROR: The only option which can be specified "
                          "with --disambiguate is from-scratch")

    elif merge:
        if record_ids or all_records or from_scratch:
            return reject("ERROR: There are no options which can be "
                          "specified along with --merge")

    return True
perpage = 100 # Are we harvesting from last time or a specific date? if from_date == "last": dummy, harvest_from_date = fetch_last_updated( name="apsharvest_api_download") # Keeping current time until completed harvest. new_harvest_date = datetime.datetime.now() else: # Input from user. Validate date try: harvest_from_date = validate_date(from_date) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e), ), stream=sys.stderr) return 1 # Turn harvest_from_date back into a string (away from datetime object) harvest_from_date = harvest_from_date.strftime("%Y-%m-%d") status_message = "Checking for new records from APS from %s" % \ (harvest_from_date,) if until_date: # Input from user. Validate date try: validate_date(until_date) except ValueError, e: write_message( "Error parsing until_date, use (YYYY-MM-DD): %s" %
def task_submit_check_options():
    """
    NOTE: Depending on the parameters, either "BibSched mode" or plain
          straightforward execution mode is entered.

    The options create_event_with_id, destroy_event_with_id, list_events,
    dump_config and load_config are served immediately and end the process
    with sys.exit(0). Only cache_events continues into BibSched mode.

    @return: True when the events to cache were resolved and stored in the
             "keyevents" / "customevents" task options; False when the
             --help text should be displayed instead.
    """
    if task_has_option("create_event_with_id"):
        print(webstat.create_customevent(
            task_get_option("create_event_with_id"),
            task_get_option("event_name", None),
            task_get_option("column_headers", [])))
        sys.exit(0)

    elif task_has_option("destroy_event_with_id"):
        print(webstat.destroy_customevent(
            task_get_option("destroy_event_with_id")))
        sys.exit(0)

    elif task_has_option("list_events"):
        events = webstat._get_customevents()
        if len(events) == 0:
            print("There are no custom events available.")
        else:
            print("Available custom events are:\n")
            print('\n'.join([
                x[0] + ": " +
                ("No descriptive name" if x[1] is None else str(x[1]))
                for x in events
            ]))
        sys.exit(0)

    elif task_has_option("cache_events"):
        events = task_get_option("cache_events")

        write_message(str(events), verbose=9)

        # Start from "nothing selected" so that an empty selector falls
        # through to the help text instead of hitting unbound names.
        keyevents_to_cache = []
        customevents_to_cache = []
        selector = events[0] if events else ''

        if selector == 'ALL':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()
            customevents_to_cache = [x[0]
                                     for x in webstat._get_customevents()]

        elif selector == 'KEYEVENTS':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()

        elif selector == 'CUSTOMEVENTS':
            customevents_to_cache = [x[0]
                                     for x in webstat._get_customevents()]

        elif selector != '':
            keyevents_to_cache = [
                x for x in webstat.KEYEVENT_REPOSITORY.keys() if x in events
            ]
            # Custom events come back as rows; compare the event id (first
            # column) with the requested names, not the whole row.
            customevents_to_cache = [
                x[0] for x in webstat._get_customevents() if x[0] in events
            ]

        # Control so that we have valid event names
        if len(list(keyevents_to_cache) + customevents_to_cache) == 0:
            # Oops, no events. Abort and display help.
            return False
        else:
            task_set_option("keyevents", keyevents_to_cache)
            task_set_option("customevents", customevents_to_cache)

        return True

    elif task_has_option("dump_config"):
        print("""\
[general]
visitors_box = True
search_box = True
record_box = True
bibsched_box = True
basket_box = True
apache_box = True
uptime_box = True
[webstat_custom_event_1]
name = baskets
param1 = action
param2 = basket
param3 = user
[apache_log_analyzer]
profile = nil
nb-histogram-items-to-print = 20
exclude-ip-list = ("137.138.249.162")
home-collection = "Atlantis Institute of Fictive Science"
search-interface-url = "/?"
detailed-record-url = "/record/"
search-engine-url = "/search?"
search-engine-url-old-style = "/search.py?"
basket-url = "/yourbaskets/"
add-to-basket-url = "/yourbaskets/add"
display-basket-url = "/yourbaskets/display"
display-public-basket-url = "/yourbaskets/display_public"
alert-url = "/youralerts/"
display-your-alerts-url = "/youralerts/list"
display-your-searches-url = "/youralerts/display"
""")
        sys.exit(0)

    elif task_has_option("load_config"):
        from ConfigParser import ConfigParser
        conf = ConfigParser()
        conf.read(CFG_WEBSTAT_CONFIG_PATH)
        for section in conf.sections():
            if section.startswith("webstat_custom_event_"):
                cols = []
                name = ""
                for option, value in conf.items(section):
                    if option == "name":
                        name = value
                    if option.startswith("param"):
                        # add the column name in its position; the whole
                        # numeric suffix is used so param10+ work as well
                        index = int(option[5:]) - 1
                        while len(cols) <= index:
                            cols.append("")
                        cols[index] = value
                if name:
                    res = run_sql(
                        "SELECT COUNT(id) FROM staEVENT WHERE id = %s",
                        (name, ))
                    if res[0][0] == 0:
                        # name does not exist, create customevent
                        webstat.create_customevent(name, name, cols)
                    else:
                        # name already exists, update customevent
                        webstat.modify_customevent(name, cols=cols)

        sys.exit(0)

    else:
        # False means that the --help should be displayed
        return False
def bibreformat_task(fmt, recids, without_fmt, process):
    """
    BibReformat main task

    @param fmt: output format to use
    @param recids: the record IDs to reformat
    @param without_fmt: records without an existing cache for this format;
                        only its size is reported here
    @param process: when true, the selected records are actually reformatted
    @return: None
    """
    write_message("Processing format %s" % fmt)

    cpu_start = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            ])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        # Feed the records through related_records() in chunks of 5000.
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" %
                      len(without_fmt))

    ### Initialize main loop
    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        done, spent_format, spent_upload = iterate_over_new(recIDs, fmt)
        total_rec += done
        tbibformat += spent_format
        tbibupload += spent_upload

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    elapsed = os.times()[4] - cpu_start

    write_message("total records processed: %d" % total_rec)
    write_message("total processing time: %2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %2f sec" % tbibformat)
    write_message(" bibupload: %2f sec" % tbibupload)
def add_other_id(other_id=None, doi="", eprint="", recid=None, system_number=None, reportnumbers=None, all_recids=None):
    """Search and match using given identifiers.

    Tries to find the local record that corresponds to a record of the
    other site, using in turn: the recid the other site supplied, the
    arXiv eprint, the DOI, the report numbers and (only when CFG_CERN_SITE
    is set) the system number. The first identifier that yields exactly one
    local record wins.

    @param other_id: identifier of the record on the other site
    @param doi: DOI of the record
    @param eprint: arXiv identifier (searched as oai:arXiv.org:<eprint>)
    @param recid: local record id as believed by the other site
    @param system_number: system number, searched together with 035:SPIRES
    @param reportnumbers: list of report numbers (searched in 037__a)
    @param all_recids: set of valid local recids; fetched with
        get_all_recids() when not given

    @return: one of
        - a list ``[other_id, recid1, recid2, ...]`` when an identifier
          matches more than one local record;
        - None when no record matches, or when the matched record already
          carries an identifier of the other site (see the 035/595 checks);
        - otherwise the output of record_xml_output() for a record holding
          the 001 field, the retained 035/595 fields and the new identifier.
    """
    # ``query`` remembers which identifier produced the match; it is only
    # used in the messages written below.
    query = ""
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None:
        query = "existing recid"
        try:
            recid = int(recid)
        except ValueError:
            recid = None
        # A recid that does not exist locally is discarded so that the
        # other identifiers below get a chance.
        if recid and recid not in all_recids:
            write_message(
                "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
                % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
                stream=sys.stderr)
            recid = None
    if recid is None and eprint:
        query = 'oai:arXiv.org:%s' % (eprint, )
        arxiv_ids = search_pattern(p=query, f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, arxiv_ids),
                stream=sys.stderr)
            return [other_id] + list(arxiv_ids)
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        query = 'doi:"%s"' % doi
        doi_ids = search_pattern(p=query) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, doi_ids),
                stream=sys.stderr)
            return [other_id] + list(doi_ids)
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        # ``query`` is built for the log message only; the search itself is
        # done per report number and the hits are united.
        query = "037__a:" + " OR 037__a:".join(reportnumbers)
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, reportnumbers_ids),
                stream=sys.stderr)
            return [other_id] + list(reportnumbers_ids)
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid is None and system_number and CFG_CERN_SITE:
        query = "035:%s 035:SPIRES" % (system_number, )
        system_number_ids = search_pattern(p=query)
        system_number_ids &= all_recids
        if len(system_number_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, system_number_ids),
                stream=sys.stderr)
            return [other_id] + list(system_number_ids)
        elif len(system_number_ids) == 1:
            recid = system_number_ids[0]

    if recid:
        recid = int(recid)
        record = get_record(recid)
        # Check whether the record already holds an identifier of the other
        # site in 035.
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                stored_recid = subfields.get('a', 0)
                try:
                    stored_recid = int(stored_recid)
                except ValueError:
                    # Not an integer, we move on and add the new ID.
                    continue
                if stored_recid and int(stored_recid) != int(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                # NOTE(review): the layout of this if/else was reconstructed
                # from whitespace-collapsed source -- confirm it is a sibling
                # of the check above rather than nested inside it.
                if CFG_INSPIRE_SITE and int(other_id) not in CERN_IDS:
                    write_message(
                        "INFO: ID was found in 035 but the record is not core CERN hence it should be moved into 595"
                    )
                else:
                    # Identifier already stored where it belongs: nothing to do.
                    return

        if CFG_INSPIRE_SITE:
            # Same check for identifiers stored as CDS-<id> in 595.
            fields = record_get_field_instances(record, '595')
            for field in fields:
                subfields = dict(field_get_subfield_instances(field))
                if "CDS" in subfields.get('a', '').upper():
                    stored_recid = subfields.get('a', 0).split("-")[-1]
                    try:
                        stored_recid = int(stored_recid)
                    except ValueError:
                        # Not an integer, we move on and add the new ID.
                        continue
                    if stored_recid and int(stored_recid) != int(other_id):
                        write_message(
                            "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                            % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE),
                            stream=sys.stderr)
                    if int(other_id) in CERN_IDS:
                        write_message(
                            "INFO: ID was found in 595 but the record is core CERN hence it should be moved into 035"
                        )
                    else:
                        # Identifier already stored where it belongs: nothing to do.
                        return

        write_message("Matched {1}/{0} to {3}/{2} with {4}".format(
            other_id, CFG_OTHER_URL, recid, CFG_THIS_URL, query))
        # Build the correction record: 001 plus the 035/595 fields to keep
        # plus the new identifier.
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)

        # Let's filter out previous values in 035/595
        for field in record_get_field_instances(record, '035'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != str(other_id) or subfields_dict.get('9') != CFG_OTHER_SITE:
                record_add_field(rec, '035', subfields=subfields)
        for field in record_get_field_instances(record, '595'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != "CDS-{0}".format(other_id) or subfields_dict.get('9') != 'CERN':
                record_add_field(rec, '595', subfields=subfields)

        if CFG_INSPIRE_SITE:
            # Records listed in CERN_IDS get the identifier in 035, all
            # others get it as CDS-<id> in 595.
            if int(other_id) in CERN_IDS:
                write_message("CERN relevant paper: adding 035")
                record_add_field(rec, '035', ind1=' ', ind2=' ', subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
            else:
                write_message("Non-CERN relevant paper: adding 595")
                record_add_field(rec, '595', ind1=' ', ind2=' ', subfields=(('9', "CERN"), ('a', "CDS-{0}".format(other_id))))
        else:
            record_add_field(rec, '035', ind1=' ', ind2=' ', subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
        return record_xml_output(rec)
def import_recid_list(input_stream=sys.stdin, batch_limit=500, automatic_upload=False):
    """Import identifiers from file, match and generate output files.

    Every input row is a '|'-separated list holding other_id, doi, eprint,
    recid and system_number, optionally followed by report numbers. Matched
    records are written out in batches of ``batch_limit`` and, when
    ``automatic_upload`` is set, each batch file is scheduled for bibupload.
    Rows matching several records are collected in a duplicates file.

    @return: list of the batch files that were written.
    """
    all_recids = get_all_recids()
    output_files = []
    pending = []
    duplicates = []
    matched = 0

    def flush(batch):
        # Write one batch to disk and optionally schedule its upload.
        output_file = write_results(batch)
        output_files.append(output_file)
        if automatic_upload:
            task_low_level_submission('bibupload', 'bst_inspire_cds_synchro',
                                      '-c', output_file, '-n')
            write_message("Scheduled bibupload --correct %s" % output_file)

    for line in input_stream:
        if line.endswith('\n'):
            line = line[:-1]
        columns = line.split('|')
        if len(columns) < 5:
            # Something is up
            write_message("WARNING: {0} is invalid".format(columns),
                          stream=sys.stderr)
            continue
        other_id, doi, eprint, recid, system_number = columns[:5]
        reportnumbers = columns[5:] or None
        other_id = other_id or None
        recid = recid or None

        result = add_other_id(other_id, doi, eprint, recid, system_number,
                              reportnumbers, all_recids)
        if not result:
            continue
        if isinstance(result, list):
            # Duplications found
            duplicates.append(result)
            continue

        pending.append(result)
        matched += 1
        if matched % batch_limit == 0:
            flush(pending)
            task_sleep_now_if_required()
            pending = []

    if pending:
        flush(pending)

    write_message("Matched in total {0} records.".format(matched))

    if duplicates:
        # We have duplications
        dupes_output_file = get_temporary_file("cds_duplicates_", ".txt")
        report_lines = ["{0}: {1}".format(dupe[0], dupe[1:])
                        for dupe in duplicates]
        with open(dupes_output_file, "w") as fd:
            fd.write("\n".join(report_lines))
        write_message(
            "Found {0} possible duplicates which are available here: {1}".
            format(len(duplicates), dupes_output_file))
    return output_files
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff: read the task options, dump the database
    and prune old dump files.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.

    @return: True
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    dump_dir = task_get_option('output', CFG_LOGDIR)
    dumps_to_keep = task_get_option('number', 5)
    dump_prefix = CFG_DATABASE_NAME + '-dbdump-'
    # the task starting time, with blanks replaced, makes the name unique
    started_at = task_get_task_param('task_starting_time').replace(' ', '_')
    dump_file = dump_prefix + started_at + '.sql'
    write_message("Reading parameters ended")

    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(dump_dir, dump_file)
    write_message("Database dump ended")

    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(dump_dir, dump_prefix, dumps_to_keep)
    write_message("Pruning old dump files ended")

    # we are done:
    task_update_progress("Done.")
    return True
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task

    Processes either the recids given as task option or, when none are
    given, the arXiv records updated since the last run (plus records
    missing their arXiv fulltext). Records that were updated are handed to
    the fixmarc and refextract tasks afterwards.

    @return: True
    """
    # First gather recids to process
    start_date = None
    recids = task_get_option('recids')
    if not recids:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)
        if task_get_option('missing'):
            extra = fetch_records_missing_arxiv_fulltext()
        else:
            extra = fetch_records_missing_arxiv_fulltext() & \
                fetch_records_modified_since(last_date)
        recids |= extra

    updated_recids = set()

    try:

        for count, recid in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            # Each outcome sets how long to pause before the next record;
            # any other exception propagates without pausing.
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                pause = 6
            except AlreadyHarvested:
                write_message('already harvested successfully')
                pause = 6
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                pause = 6
            except PdfNotAvailable:
                write_message("no pdf available")
                pause = 20
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                pause = 20
            time.sleep(pause)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given

    Reads the knowledge base file named by the ``kb_src`` option of the
    rank method (lines of the form ``key---value``, lines starting with
    ``#`` are skipped), fetches the values of the configured ``tag`` for all
    records in ``options["recid_range"]`` and maps every record to the
    numeric knowledge base value of its tag value.

    @param config: parsed rank method configuration; the section to use is
        named by the ``function`` option of the ``rank_method`` section.
    @return: dictionary mapping recid to its rank (a float taken from the
        knowledge base, or 0 when a value of the record is not in it).
    """
    write_message("Loading knowledgebase file", verbose=9)
    method = config.get("rank_method", "function")
    kb_src = config.get(method, "kb_src")

    write_message("Reading knowledgebase file: %s" % kb_src)
    kb_data = {}
    with open(kb_src, 'r') as f:
        for line in f:
            if not line[0:1] == "#":
                key, value = line.strip().split("---")
                kb_data[key.strip()] = value.strip()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(method, "tag")
    tags = config.get(method, "check_mandatory_tags").split(",")
    if tags == ['']:
        tags = ""

    records = []
    for recids, recide in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        # Start from "every record is valid" and narrow it down with each
        # mandatory tag.
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            # NOTE(review): the table is chosen from ``tag`` although the
            # lookup is for ``key`` -- confirm this is right when a
            # mandatory tag starts with different digits than ``tag``.
            newset = intbitset(run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))
            valid &= newset
        if tags:
            # Keep only the records carrying all mandatory tags.
            recs = [(recid, value) for recid, value in recs if recid in valid]
        records += list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [(recid, value) for recid, value in records if recid in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                # NOTE(review): rnkset holds floats while kb_data is keyed by
                # strings, so this membership test looks like it can never
                # succeed and the first value found for a record wins. If
                # "highest value wins" was intended this needs a rewrite.
                if rnkset[key] in kb_data and float(kb_data[value]) > float((rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
def warn(self, msg):
    """
    Record a warning for this record: it is stored as an Issue of type
    'warning' under the current rule's name and also written to the log.
    """
    rule_name = self.rule['name']
    self.issues.append(Issue('warning', rule_name, msg))
    log_line = "[WARN] record %s by rule %s: %s" % (self.record_id,
                                                    rule_name, msg)
    write_message(log_line)
def process_batch_job(batch_job_file):
    """Process a batch job description file.

    Decodes and sanitises the batch description, locates the master video
    (or attaches the input as master), runs every 'encode' and 'extract'
    job, rebuilds the BibDoc file list and fixes the MARC of the record,
    uploads the optional MARCXML pieces, optionally deletes the input file
    and finally sends the configured notifications.

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: always 1; failing jobs are reported through the notification
        mechanism, not through the return value
    @rtype: int
    @raise Exception: if the record does not exist or an encode job has no
        container/extension defined
    """
    global _BATCH_STEP, _BATCH_STEPS

    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_' + str(batch_job['recid']) + '_' + str(
            uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR,
                                    xml_filename)
        with open(xml_filename, 'w') as xml_file:
            xml_file.write(marcxml)
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    if record_exists(batch_job['recid']) < 1:
        raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description',
                                       description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat',
                                     subformat)
                if (comment == m_comment and
                        description == m_description and
                        subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect of the video from the record being
                    ## processed (was a hard-coded record id before)
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(
                            batch_job['recid'],
                            CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        ## No aspect stored in the record, go on without it
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found"
                                % batch_job['recid'])
            task_update_progress("Video master for record %d not found"
                                 % batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(
        batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job,
                                            'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory of the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first an then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc"
                                % bibdoc_video_docname)
            master_format = compose_format(
                bibdoc_video_extension,
                getval(batch_job, 'bibdoc_master_subformat', 'master'))
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                batch_job['input'],
                version=1,
                description=getval(batch_job, 'bibdoc_master_description'),
                comment=getval(batch_job, 'bibdoc_master_comment'),
                docformat=master_format)

    #-----------#
    # JOBS LOOP #
    #-----------#

    ## 1 as long as every job succeeded, 0 as soon as one failed
    return_code = 1

    for job in batch_job['jobs']:
        _task_write_message("----------- Job %s of %s -----------"
                            % (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(
                job['bibdoc_docname']).safe_substitute(
                    {'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(
                    job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined for the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname',
                                                bibdoc_video_docname)
            ## The subformat is incompatible with ffmpegs name convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(bibdoc_video_directory,
                                                 bibdoc_slave_video_docname,
                                                 bibdoc_video_extension)
            _task_write_message(
                "Transcoding %s to %s;%s" % (bibdoc_slave_video_docname,
                                             bibdoc_video_extension,
                                             bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                input_file=batch_job['input'],
                output_file=bibdoc_video_fullpath,
                acodec=getval(job, 'audiocodec'),
                vcodec=getval(job, 'videocodec'),
                ## audio bitrate feeds abitrate, video bitrate feeds vbitrate
                abitrate=getval(job, 'audiobitrate'),
                vbitrate=getval(job, 'videobitrate'),
                resolution=getval(job, 'resolution'),
                passes=getval(job, 'passes', 1),
                special=getval(job, 'special'),
                specialfirst=getval(job, 'specialfirst'),
                specialsecond=getval(job, 'specialsecond'),
                metadata=getval(job, 'metadata'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),  # Aspect for every job
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
                message_fnc=_task_write_message)
            return_code &= encoding_result
            ## only on success
            if encoding_result:
                ## Rename it, adding the subformat
                os.rename(
                    bibdoc_video_fullpath,
                    compose_file(bibdoc_video_directory,
                                 bibdoc_video_extension,
                                 bibdoc_video_subformat,
                                 1,
                                 bibdoc_slave_video_docname))
                bibdoc_video._build_file_list()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                             bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(
                        getval(job, 'bibdoc_description'),
                        bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            ## The temporary directory is removed even if adding the frames
            ## to the bibdocs fails halfway
            try:
                #Move this to the batch description
                bibdoc_frame_docname = getval(job, 'bibdoc_docname',
                                              bibdoc_video_docname)
                tmpfname = (
                    tmpdir + "/" + bibdoc_frame_docname + '.'
                    + getval(profile, 'extension',
                             getval(job, 'extension', 'jpg')))
                extraction_result = extract_frames(
                    input_file=batch_job['input'],
                    output_file=tmpfname,
                    size=getval(job, 'size'),
                    positions=getval(job, 'positions'),
                    numberof=getval(job, 'numberof'),
                    width=getval(job, 'width'),
                    height=getval(job, 'height'),
                    aspect=getval(batch_job, 'aspect'),
                    profile=getval(job, 'profile'),
                    update_fnc=_task_update_overall_status,
                )
                return_code &= extraction_result

                ## only on success:
                if extraction_result:
                    ## for every filename in the directorys, create a bibdoc that contains
                    ## all sizes of the frame from the two directories
                    files = os.listdir(tmpdir)
                    for filename in files:
                        ## The docname was altered by BibEncode extract through substitution
                        ## Retrieve it from the filename again
                        bibdoc_frame_docname, bibdoc_frame_extension = \
                            os.path.splitext(filename)
                        _task_write_message("Creating new bibdoc for %s"
                                            % bibdoc_frame_docname)
                        ## If the bibdoc exists, receive it
                        if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                            bibdoc_frame = recdoc.get_bibdoc(
                                bibdoc_frame_docname)
                        ## Create a new bibdoc if it does not exist
                        else:
                            bibdoc_frame = recdoc.add_bibdoc(
                                docname=bibdoc_frame_docname)

                        ## The filename including path from tmpdir
                        fname = os.path.join(tmpdir, filename)

                        bibdoc_frame_format = compose_format(
                            bibdoc_frame_extension, bibdoc_frame_subformat)
                        ## Same as with the master, if the format already exists,
                        ## override it, because something went wrong before
                        if bibdoc_frame.format_already_exists_p(
                                bibdoc_frame_format):
                            bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                        _task_write_message(
                            "Adding %s jpg;%s to BibDoc"
                            % (bibdoc_frame_docname,
                               getval(job, 'bibdoc_subformat')))
                        bibdoc_frame.add_file_new_format(
                            fname,
                            version=1,
                            description=getval(job, 'bibdoc_description'),
                            comment=getval(job, 'bibdoc_comment'),
                            docformat=bibdoc_frame_format)
            finally:
                ## Remove the temporary folders
                _task_write_message("Removing temporary directory")
                shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file=getval(batch_job, 'input'),
                                 pbcoreIdentifier=batch_job['recid'],
                                 aspect_override=getval(batch_job, 'aspect'))
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        with open(getval(batch_job, 'marc_snippet')) as marc_snippet:
            marcxml = marc_snippet.read()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        # only if successful: return_code stays 1 when no job failed
        if return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern', '') in getval(
                    batch_job, 'input'):
                _task_write_message("Deleting input file")
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError as err:
                    # Best effort: a leftover input file must not fail the job
                    _task_write_message("Could not delete input file: %s"
                                        % err)

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            notify_admin = getval(batch_job, 'notify_admin')
            ## An explicit address is a (byte or unicode) string; any other
            ## truthy value means "use the default admin address"
            if isinstance(notify_admin, (str, type(u""))):
                _notify_error_admin(batch_job, notify_admin)
            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
    return 1
def bibrank_engine(run):
    """Execute the task command for the rank method ``run``.

    Determines the record ranges to work on (from the ``query``,
    ``collection``, ``id``, ``modified`` or ``last_updated`` task options,
    falling back to every record), stores them in ``options`` and dispatches
    on the ``cmd`` task option.

    Returns 1 when the command was carried out; raises StandardError when the
    ``cmd`` task option is not a known command.
    """
    started_at = time.time()

    options["run"] = [run]
    for method_code in options["run"]:
        task_sleep_now_if_required(can_stop_too=True)
        method_name = getName(method_code)
        write_message("Running rank method: %s." % method_name)
        config = load_config(method_code)
        function_name = config.get("rank_method", "function")
        exec_name = "%s_exec" % function_name
        repair_name = "%s_repair_exec" % function_name
        method_name = getName(method_code)
        options["validset"] = get_valid_range(method_code)

        # Work out which record id ranges the command applies to.
        query = task_get_option("query")
        collection = task_get_option("collection")
        if query is not None:
            search_args = {"of": "id", "p": query}
            if collection:
                search_args["c"] = collection.split(",")
            found = perform_request_search(**search_args)
            id_ranges = [(recid, recid) for recid in found]
            task_set_option("id", task_get_option("id", []) + id_ranges)
            options["recid_range"] = id_ranges
        elif collection:
            found = perform_request_search(c=collection.split(","))
            options["recid_range"] = [(recid, recid) for recid in found]
        elif task_get_option("id"):
            options["recid_range"] = task_get_option("id")
        elif task_get_option("modified"):
            options["recid_range"] = add_recIDs_by_date(
                method_code, task_get_option("modified"))
        elif task_get_option("last_updated"):
            options["recid_range"] = add_recIDs_by_date(method_code)
        else:
            write_message("No records specified, updating all", verbose=2)
            lowest = run_sql("SELECT min(id) from bibrec")[0][0]
            highest = run_sql("SELECT max(id) from bibrec")[0][0]
            options["recid_range"] = [(lowest, highest)]

        if task_get_option("quick") == "yes":
            write_message("Recalculate parameter used", verbose=9)

        # Dispatch on the requested command.
        command = task_get_option("cmd")
        if command == "del":
            del_recids(method_code, options["recid_range"])
        elif command in ("add", "print-missing"):
            runner = globals().get(exec_name)
            runner(method_code, method_name, config)
        elif command == "stat":
            rank_method_code_statistics(method_code)
        elif command == "check":
            check_method(method_code)
        elif command == "repair":
            repairer = globals().get(repair_name)
            repairer()
        else:
            msg = "Invalid command found processing %s" % method_code
            write_message(msg, sys.stderr)
            raise StandardError(msg)

    if task_get_option("verbose"):
        showtime(time.time() - started_at)
    return 1