Exemplo n.º 1
0
def process_affiliations(record_ids=None, all_records=False):
    """Process affiliations for a selection of records, chunk by chunk.

    The records to process are chosen as follows:

    * ``all_records`` is true: every id found in ``bibrec``;
    * ``record_ids`` is given: exactly those ids, and no last-updated date
      is stored afterwards;
    * otherwise: the records whose modification date lies between the
      stored last-updated date and the moment this function started.

    Records are handed to ``process_and_store`` in lists of at most
    ``CHUNK_SIZE`` ids. When a start time was taken, it is saved as the new
    last-updated date once every chunk has been processed.
    """
    name = 'affiliations'

    if all_records:
        recids = intbitset(run_sql("SELECT id FROM bibrec"))
        started_at = datetime.now()
    elif record_ids:
        recids = intbitset(record_ids)
        # Explicit ids: do not move the last-updated marker.
        started_at = None
    else:
        dummy_last_recid, previous_run = fetch_last_updated(name)
        started_at = datetime.now()
        query = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        window = [previous_run.isoformat(), started_at.isoformat()]
        recids = intbitset(run_sql(query, window))

    pending = iter(recids)
    done = 0

    # Give the scheduler a chance to pause us before each chunk is fetched.
    task_sleep_now_if_required()
    batch = list(islice(pending, CHUNK_SIZE))
    while batch:
        process_and_store(batch)
        done += len(batch)
        progress = 'processed %s out of %s records' % (done, len(recids))
        task_update_progress(progress)

        task_sleep_now_if_required()
        batch = list(islice(pending, CHUNK_SIZE))

    if started_at:
        store_last_updated(None, started_at, name)
Exemplo n.º 2
0
def task_run_core():
    """
    Main daemon task.

    Loads the records to process, runs every configured ticket plugin
    against each record and submits the tickets the plugins generate.
    After a record has been handled, its ``last_date`` (when set) is stored
    as the last-updated marker for "bibcatalog".

    Returns True when run successfully.

    Raises BibCatalogPluginException when a configured plugin is empty.
    Any exception raised while checking, generating or submitting a ticket
    is logged and re-raised.
    """
    # Dictionary of "plugin_name" -> plugin (mapping with the
    # 'check_record' and 'generate_ticket' callables used below)
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply), ),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    total_records = len(records_concerned)
    write_message("%i record(s) found" % (total_records, ))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress(
            "Processing records %s/%s (%i%%)" %
            (records_processed, total_records,
             int(float(records_processed) / total_records * 100)))
        task_sleep_now_if_required(can_stop_too=True)

        for ticket_name, plugin in tickets_to_apply.items():
            if not plugin:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name, ))

            write_message("Running template %s for %s" %
                          (ticket_name, recid),
                          verbose=5)
            try:
                ticket = BibCatalogTicket(recid=int(recid))
                if plugin['check_record'](ticket, record):
                    ticket = plugin['generate_ticket'](ticket, record)
                    write_message("Ticket to be generated: %s" %
                                  (ticket, ),
                                  verbose=5)
                    if ticket.submit():
                        write_message("Ticket #%s created for %s" %
                                      (ticket.ticketid, recid))
                    else:
                        write_message("Ticket already exists for %s" %
                                      (recid, ))
                else:
                    # The message must be formatted with '%'; passing the
                    # tuple as a second argument leaves it unformatted.
                    write_message("Skipping record %s" % (recid, ))
            except Exception:
                write_message("Error submitting ticket for record %s:" %
                              (recid, ))
                write_message(traceback.format_exc())
                # Bare raise keeps the original traceback intact.
                raise

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")

    return True
Exemplo n.º 3
0
def task_run_core():
    """
    Main daemon task.

    Loads the records to process, runs every configured ticket plugin
    against each record and submits the tickets the plugins generate.
    After a record has been handled, its ``last_date`` (when set) is stored
    as the last-updated marker for "bibcatalog".

    Returns True when run successfully.

    Raises BibCatalogPluginException when a configured plugin is empty.
    Any exception raised while checking, generating or submitting a ticket
    is logged and re-raised.
    """
    # Dictionary of "plugin_name" -> plugin (mapping with the
    # 'check_record' and 'generate_ticket' callables used below)
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" %
                  (str(tickets_to_apply),), verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    total_records = len(records_concerned)
    write_message("%i record(s) found" % (total_records,))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        percent_done = int(float(records_processed) / total_records * 100)
        task_update_progress("Processing records %s/%s (%i%%)"
                             % (records_processed, total_records,
                                percent_done))
        task_sleep_now_if_required(can_stop_too=True)

        for ticket_name, plugin in tickets_to_apply.items():
            if not plugin:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name,))

            write_message("Running template %s for %s" % (ticket_name, recid),
                          verbose=5)
            try:
                ticket = BibCatalogTicket(recid=int(recid))
                if plugin['check_record'](ticket, record):
                    ticket = plugin['generate_ticket'](ticket, record)
                    write_message("Ticket to be generated: %s" % (ticket,),
                                  verbose=5)
                    if ticket.submit():
                        write_message("Ticket #%s created for %s" %
                                      (ticket.ticketid, recid))
                    else:
                        write_message("Ticket already exists for %s" %
                                      (recid,))
                else:
                    # The message must be formatted with '%'; passing the
                    # tuple as a second argument leaves it unformatted.
                    write_message("Skipping record %s" % (recid,))
            except Exception:
                write_message("Error submitting ticket for record %s:" %
                              (recid,))
                write_message(traceback.format_exc())
                # Bare raise keeps the original traceback intact.
                raise

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")

    return True
 def test_latest_records(self):
     """Check that process_affiliations() picks up recently modified records.

     The stored last-updated date for 'affiliations' is forced back to
     1900-01-01 so that a default run of process_affiliations() considers
     records modified since then. The resulting affiliations (minus their
     'last_occurence' field) are compared with EXPECTED_AFF, the
     aidAFFILIATIONS table is truncated, and the original last-updated
     date is restored.
     """
     name = 'affiliations'
     dummy_last_recid, last_date = fetch_last_updated(name)
     try:
         run_sql('UPDATE xtrJOB SET last_updated = %s WHERE name = %s',
                 (datetime(year=1900, month=1, day=1), name))
         process_affiliations()
         pids = intbitset(
             run_sql(
                 "SELECT personid FROM aidPERSONIDDATA where tag = 'canonical_name'"
             ))
         aff = get_current_aff(pids)
         # 'last_occurence' is dropped before comparing with EXPECTED_AFF.
         for value in aff.values():
             del value['last_occurence']
         compare_aff_dicts(self, aff, EXPECTED_AFF)
         run_sql("TRUNCATE aidAFFILIATIONS")
         self.assertEqual(get_current_aff(pids), {})
     finally:
         # Restore the marker even when an assertion above fails, so a
         # failing test does not leave the 1900 date behind.
         store_last_updated(None, last_date, name)
Exemplo n.º 5
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task.

    Processes either the recids given through the 'recids' task option or,
    when none are given, the arXiv records updated since the last stored
    run date. Every record is passed to ``process_one``; the records it
    reports as updated are then submitted to the fixmarc and refextract
    tasks, even if the loop is interrupted by an exception.

    The run date is stored only when the task ran on modified records, not
    on explicitly requested recids. Returns True.
    """
    # First gather recids to process
    recids = task_get_option('recids')
    if not recids:
        start_date = datetime.now()
        _unused, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)
    else:
        start_date = None
        recids = [(recid, None) for recid in recids]

    updated_recids = set()

    try:
        for position, (recid, _unused) in enumerate(recids):
            # Report progress once every 50 records
            if not position % 50:
                progress = 'Done %s of %s' % (position, len(recids))
                write_message(progress)
                task_update_progress(progress)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            # Pause for 6 seconds after a normal outcome and for 20 seconds
            # when the pdf was unavailable or its download failed.
            try:
                if process_one(recid):
                    updated_recids.add(recid)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                pause = 6
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                pause = 6
            except PdfNotAvailable:
                write_message("no pdf available")
                pause = 20
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                pause = 20
            else:
                pause = 6
            time.sleep(pause)
    finally:
        # We want to process updated records even in case we are interrupted
        summary = 'Updated %s records' % len(updated_recids)
        write_message(summary)
        task_update_progress(summary)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
Exemplo n.º 6
0
def task_run_core(name=NAME):
    """Generate texkeys for records that lack one and upload the changes.

    The records come from the 'recids' task parameter when it is set.
    Otherwise a search for HEP records without a SPIRES/INSPIRE texkey is
    performed, restricted to the records modified since the last stored
    run unless the 'all' task parameter is set.

    For each record that really has no texkey, a new one is generated and
    the corresponding XML is collected; all collected XML is then handed to
    ``process_chunk`` in one go. Records for which no texkey can be
    generated (no author/collaboration, or no year) are logged and skipped.

    The run date is stored only when the recids were not given explicitly.
    Returns True.
    """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset(
            perform_request_search(p='-035:spirestex -035:inspiretex',
                                   cc='HEP'))

        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    total = len(recids)
    write_message("Found %s records to assign texkeys" % total)

    texkey_provenances = ("SPIRESTeX", "INSPIRETeX")
    processed_recids = []
    xml_to_process = []
    # Count from 1 so the progress message reports the number of records
    # actually done (and reaches the total on the last one).
    for count, recid in enumerate(recids, 1):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct,
                                                   tag="035",
                                                   ind1="",
                                                   ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            if provenance in texkey_provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration" %
                    recid)
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
            else:
                write_message("Created texkey %s for record %d" %
                              (new_texkey, recid))
                processed_recids.append(recid)
                xml_to_process.append(create_xml(recid, new_texkey))

        # Runs for every record, including those skipped above, so the
        # progress stays accurate and the sleep check is never bypassed.
        task_update_progress("Done %d out of %d." % (count, total))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
Exemplo n.º 7
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task.

    Works on the recids passed via the 'recids' task option, or, when that
    option is empty, on the arXiv records updated since the last stored run
    date. Each record goes through ``process_one``; records it reports as
    updated are afterwards submitted to the fixmarc and refextract tasks,
    even when the loop is cut short by an exception.

    The run date is stored only for runs over modified records, never for
    runs over explicitly requested recids. Returns True.
    """
    # First gather recids to process
    requested = task_get_option('recids')
    if requested:
        start_date = None
        recids = [(recid, None) for recid in requested]
    else:
        start_date = datetime.now()
        _ignored, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:
        for index, (recid, _ignored) in enumerate(recids):
            # Report progress once every 50 records
            if index % 50 == 0:
                status = 'Done %s of %s' % (index, len(recids))
                write_message(status)
                task_update_progress(status)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            # Wait 6 seconds after a normal outcome and 20 seconds when the
            # pdf was unavailable or its download failed.
            delay = 6
            try:
                if process_one(recid):
                    updated_recids.add(recid)
            except AlreadyHarvested:
                write_message('already harvested successfully')
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
            except PdfNotAvailable:
                write_message("no pdf available")
                delay = 20
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                delay = 20
            time.sleep(delay)
    finally:
        # We want to process updated records even in case we are interrupted
        report = 'Updated %s records' % len(updated_recids)
        write_message(report)
        task_update_progress(report)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True