def process_affiliations(record_ids=None, all_records=False):
    """Process affiliations for the given record ids, for all records,
    or (by default) for records modified since the last run."""
    name = 'affiliations'

    if all_records:
        records = intbitset(run_sql("SELECT id FROM bibrec"))
        start_time = datetime.now()
    elif record_ids:
        records = intbitset(record_ids)
        start_time = None
    else:
        dummy_last_recid, last_updated = fetch_last_updated(name)
        start_time = datetime.now()
        sql = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        records = intbitset(run_sql(sql, [last_updated.isoformat(),
                                          start_time.isoformat()]))

    records_iter = iter(records)
    processed_records_count = 0
    while True:
        task_sleep_now_if_required()
        # Work in fixed-size chunks so progress can be reported and the
        # task can sleep between chunks.
        chunk = list(islice(records_iter, CHUNK_SIZE))
        if not chunk:
            break
        process_and_store(chunk)
        processed_records_count += len(chunk)
        task_update_progress('processed %s out of %s records'
                             % (processed_records_count, len(records)))

    # Only advance the stored timestamp when the run covered the
    # modified-records window (not for explicit record ids).
    if start_time:
        store_last_updated(None, start_time, name)
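# The while/islice loop in process_affiliations() is a generic fixed-size
# chunking idiom. A minimal standalone sketch of the same pattern
# (illustration only; iter_chunks is not part of this module):
from itertools import islice


def iter_chunks(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            break
        yield chunk

# e.g. list(iter_chunks(xrange(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]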
def test_latest_records(self):
    name = 'affiliations'
    dummy_last_recid, last_date = fetch_last_updated(name)
    # Rewind the stored timestamp so that all records count as modified.
    run_sql('UPDATE xtrJOB SET last_updated = %s WHERE name = %s',
            (datetime(year=1900, month=1, day=1), name))
    process_affiliations()
    pids = intbitset(run_sql(
        "SELECT personid FROM aidPERSONIDDATA WHERE tag = 'canonical_name'"))
    aff = get_current_aff(pids)
    for value in aff.itervalues():
        del value['last_occurence']
    compare_aff_dicts(self, aff, EXPECTED_AFF)
    run_sql("TRUNCATE aidAFFILIATIONS")
    self.assertEqual(get_current_aff(pids), {})
    # Restore the timestamp rewound above.
    store_last_updated(None, last_date, name)
def get_recids_to_load():
    """
    Generates the final list of record IDs to load.

    Returns a list of tuples like: (recid, date)
    """
    recids_given = task_get_option("recids", default=[])
    query_given = task_get_option("query")
    reportnumbers_given = task_get_option("reportnumbers")
    if query_given:
        write_message("Performing given search query: %s" % (query_given,))
        result = perform_request_search(p=query_given, of='id', rg=0, wl=0)
        recids_given.extend(result)

    if reportnumbers_given:
        write_message("Searching for records referring to given reportnumbers")
        for reportnumber in reportnumbers_given:
            result = perform_request_search(p='reportnumber:%s' % (reportnumber,),
                                            of='id', rg=0, wl=0)
            recids_given.extend(result)

    recids_given = [(recid, None) for recid in recids_given]

    last_id, last_date = fetch_last_updated(name="bibcatalog")
    records_found = []
    if task_get_option("new", default=False):
        records_found.extend(get_all_new_records(since=last_date,
                                                 last_id=last_id))
    if task_get_option("modified", default=False):
        records_found.extend(get_all_modified_records(since=last_date,
                                                      last_id=last_id))

    for recid, date in records_found:
        recids_given.append((recid, date))
    return recids_given
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""
    # First gather recids to process
    recids = task_get_option('recids')
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:
        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                time.sleep(6)
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError, e:
                write_message("failed to download: %s" % e)
                time.sleep(20)
    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
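# The pattern above -- fetch the last-run timestamp, process, then store the
# run's start time -- recurs across these daemons. A minimal sketch of the
# protocol, assuming the existing fetch_last_updated/store_last_updated
# helpers; the two callback arguments are illustrative stand-ins:
def run_incrementally(name, fetch_modified_since, process_record):
    """Process records modified since the last run of `name`, then
    advance the stored timestamp so the next run resumes from here."""
    start_date = datetime.now()
    dummy, last_date = fetch_last_updated(name)
    for recid in fetch_modified_since(last_date):
        process_record(recid)
    # Persist start_date rather than datetime.now(): records modified
    # while this run was in progress will be picked up again next time.
    store_last_updated(0, start_date, name)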
def task_run_core(name=NAME):
    """
    Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks
    """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(
            perform_request_search(p='-035:spirestex -035:inspiretex',
                                   cc='HEP'))
        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s already has texkey %s"
                              % (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message("WARNING: Record %s has no first author or "
                              "collaboration" % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d"
                          % (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # Sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
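# The 035 scan in task_run_core above can be read as a small predicate.
# Sketch only (record_has_texkey is not part of this module), assuming the
# same bibrecord helpers used above:
def record_has_texkey(recstruct):
    """Return True if any 035 field carries a SPIRESTeX/INSPIRETeX $9
    provenance together with a non-empty $a value."""
    for instance in record_get_field_instances(recstruct, tag="035",
                                               ind1="", ind2=""):
        provenances = field_get_subfield_values(instance, "9")
        values = field_get_subfield_values(instance, "a")
        if (provenances and values
                and provenances[0] in ("SPIRESTeX", "INSPIRETeX")
                and values[0]):
            return True
    return False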