Example #1
def check_slave_is_in_consistent_state(connection=None):
    """
    Check if the slave is already aware that the dbdump task is running.
    dbdump being a monotask guarantees that no other task is currently
    running, so it is safe to detach the slave and start the actual
    dump.
    """
    if connection is None:
        connection = get_connection_for_dump_on_slave()
    i = 0
    ## Let's take the current status of dbdump (e.g. RUNNING, ABOUT TO STOP, etc.)...
    current_status = run_sql("""SELECT status FROM "schTASK" WHERE id=%s""",
                             (task_get_task_param('task_id'), ))[0][0]
    while True:
        if i == 10:
            ## Timeout!!
            raise StandardError(
                "The slave does not seem to be catching up with the master")
        ## ...and let's see if it matches with what the slave sees.
        if run_sql(
                """SELECT status FROM "schTASK" WHERE id=%s AND status=%s""",
            (task_get_task_param('task_id'), current_status),
                connection=connection):
            ## Bingo!
            return
        time.sleep(3)
        i += 1
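The loop above is a generic poll-with-timeout pattern (10 attempts, 3 seconds apart). A minimal standalone sketch of the same pattern follows; poll_until and slave_status_matches are hypothetical helpers, not part of Invenio:

import time

def poll_until(predicate, attempts=10, delay=3):
    """Call predicate() up to `attempts` times, `delay` seconds apart;
    return True as soon as it succeeds, False on timeout."""
    for _ in range(attempts):
        if predicate():
            return True
        time.sleep(delay)
    return False

# Hypothetical usage mirroring the check above:
# if not poll_until(lambda: slave_status_matches(current_status)):
#     raise StandardError("The slave does not seem to be catching up")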
Example #2
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function should be called
    after a rule has been run.
    """

    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return   # We don't want to update the database in this case

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (task_get_task_param('task_starting_time'), rule_name,))
    if not updated: # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, task_get_task_param('task_starting_time')))
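The UPDATE-then-INSERT above can race if two tasks record the same rule at once. On MySQL the same effect can be achieved atomically; a sketch, assuming bibcheck_rules.name carries a primary or unique key:

run_sql("""INSERT INTO bibcheck_rules (name, last_run)
           VALUES (%s, %s)
           ON DUPLICATE KEY UPDATE last_run=VALUES(last_run)""",
        (rule_name, task_get_task_param('task_starting_time')))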
Example #3
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## We need the job description for the batch engine
                ## So we need to use the new path inside the oldjobs dir
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
Example #4
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(' ', '_') + '.sql.gz'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
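For illustration, the dump file name assembled above, assuming CFG_DATABASE_NAME is 'invenio' and the task started at '2014-01-02 03:04:05':

>>> 'invenio' + '-dbdump-' + '2014-01-02 03:04:05'.replace(' ', '_') + '.sql.gz'
'invenio-dbdump-2014-01-02_03:04:05.sql.gz'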
Example #5
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("""UPDATE "expJOB" SET lastrun=%s WHERE jobname=%s""", (
        task_get_task_param('task_starting_time'),
        jobname,
    ))
Example #6
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # default; overwritten per record below

    tot = len(recIDs)
    count = 0
    for recID in recIDs:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
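A note on the timing calls above: os.times()[4] is the elapsed wall-clock time since an arbitrary fixed point (on Unix; it may be 0 on other platforms), so the difference between two samples measures real duration. A minimal sketch:

import os
import time

t1 = os.times()[4]
time.sleep(0.1)
t2 = os.times()[4]
print("elapsed: %.2f sec" % (t2 - t1))  # ~0.10 on Unix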
Example #7
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])), stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                                  (onto_rec['ontology'],
                                   ', '.join([str(recid) for recid in
                                              onto_rec['recIDs']])),
                                  stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message(
            "WARNING: No keywords found, recids: %s" % onto_recids,
            stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
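The temporary file written above is a MARCXML collection; schematically its contents are:

<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
  <!-- one <record> block per analyzed document, as returned by _analyze_documents -->
</collection>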
Example #8
def task_run_core():
    """Core task of oaiharvest.

    This function runs all the operations needed
    to execute an oaiharvest task within bibsched.

    :return: False if the task queue must stop, True otherwise
    :raise InvenioOAIHarvestWarning:
    """
    workflow_id_preservation = 0
    workflow = None
    start_time = time.time()
    list_of_workflow_without_repository = []
    list_of_repository_per_workflow = {}

    repository = task_get_option("repository")
    if not repository:
        workflow_option = task_get_option("workflow")

        if isinstance(workflow_option, list):
            for name in workflow_option:
                if name not in list_of_workflow_without_repository:
                    list_of_workflow_without_repository.append(name)

        else:
            list_of_workflow_without_repository.append(workflow_option)
    else:
        if task_get_option("workflow"):

            workflow_option = task_get_option("workflow")
            if isinstance(workflow_option, list):
                for name in workflow_option:
                    if name not in list_of_repository_per_workflow:
                        list_of_repository_per_workflow[name] = repository

            else:
                list_of_repository_per_workflow[workflow_option] = repository

        elif isinstance(repository, list):

            for name_repository in repository:
                name_workflow = OaiHARVEST.get(
                    OaiHARVEST.name == name_repository).one().workflows
                if name_workflow not in list_of_repository_per_workflow:
                    list_of_repository_per_workflow[name_workflow] = [
                        name_repository]
                else:
                    list_of_repository_per_workflow[name_workflow].append(
                        name_repository)

        else:
            workflow_found = OaiHARVEST.get(
                OaiHARVEST.name == repository).one().workflows
            list_of_repository_per_workflow[workflow_found] = repository
    try:
        if list_of_repository_per_workflow:
            for workflow_to_launch in list_of_repository_per_workflow:
                options = task_get_option(None)
                options["repository"] = list_of_repository_per_workflow[
                    workflow_to_launch]
                workflow = start(workflow_to_launch,
                                 data=[""],
                                 stop_on_error=True,
                                 options=options)
        else:
            for workflow_to_launch in list_of_workflow_without_repository:
                workflow = start(workflow_to_launch,
                                 data=[""],
                                 stop_on_error=True,
                                 options=task_get_option(None))
        if workflow:
            workflow_id_preservation = workflow.uuid
            workflowlog = BibWorkflowEngineLog.query.filter(
                BibWorkflowEngineLog.id_object == workflow.uuid
            ).all()
            for log in workflowlog:
                write_message(log.message)
        execution_time = round(time.time() - start_time, 2)
        write_message("Execution time :" + str(execution_time))
    except WorkflowError as e:
        write_message("ERRORS HAPPENED")
        write_message("____________Workflow log output____________")
        workflow_id_preservation = e.id_workflow
        workflowlog = BibWorkflowEngineLog.query.filter(
            BibWorkflowEngineLog.id_object == e.id_workflow
        ).filter(BibWorkflowEngineLog.log_type >= 40).all()

        for log in workflowlog:
            write_message(log.message)

        for i in e.payload:
            write_message("\n\n____________Workflow " + i + " log output____________")
            workflowlog = BibWorkflowEngineLog.query.filter(
                BibWorkflowEngineLog.id_object == i
            ).filter(BibWorkflowEngineLog.log_type >= 40).all()
            for log in workflowlog:
                write_message(log.message)

        write_message("____________Object log output____________")

        objectlog = BibWorkflowObjectLog.query.filter(
            BibWorkflowObjectLog.id_object == e.id_object
        ).filter(BibWorkflowObjectLog.log_type >= 40).all()

        for log in objectlog:
            write_message(log.message)

        execution_time = round(time.time() - start_time, 2)
        write_message("Execution time :" + str(execution_time))

    # Generate reports
    ticket_queue = task_get_option("create-ticket-in")
    notification_email = task_get_option("notify-email-to")
    workflow_main = Workflow.query.filter(
        Workflow.uuid == workflow_id_preservation
    ).one()

    if ticket_queue or notification_email:

        subject, text = generate_harvest_report(
            workflow_main,
            current_task_id=task_get_task_param("task_id")
        )
        # Create ticket for finished harvest?
        if ticket_queue:
            ticketid = create_ticket(ticket_queue, subject=subject, text=text)
            if ticketid:
                write_message("Ticket %s submitted." % (str(ticketid),))

        # Send e-mail for finished harvest?
        if notification_email:
            send_email(fromaddr=CFG_SITE_SUPPORT_EMAIL,
                       toaddr=notification_email,
                       subject=subject,
                       content=text)

    if workflow_main.counter_error:
        if CFG_OAI_FAILED_HARVESTING_STOP_QUEUE == 0 or \
           not task_get_task_param("sleeptime") or \
           workflow_main.counter_error > 1:
            # Admin wants BibSched to stop, or the task is not set to
            # run at a later date: we must stop the queue.
            write_message("An error occurred. Task is configured to stop")
            return False
        else:
            # An error happened, but it can be recovered at next run
            # (task is re-scheduled) and admin set BibSched to
            # continue even after failure.
            write_message("Error occurred, but task is configured to continue")
            if CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN:
                try:
                    raise InvenioOAIHarvestWarning(
                        "OAIHarvest (task #%s) failed at fully harvesting."
                        " BibSched has NOT been stopped, and OAIHarvest will"
                        " try to recover at next run" %
                        (task_get_task_param("task_id"),)
                    )
                except InvenioOAIHarvestWarning:
                    register_exception(stream='warning', alert_admin=True)
            return True
    else:
        return True
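The branching at the top of the function boils down to grouping repository names by their configured workflow. A standalone sketch of that grouping, where get_workflow_name stands in for the OaiHARVEST lookup and is hypothetical:

def group_by_workflow(repositories, get_workflow_name):
    """Map workflow name -> list of repository names."""
    per_workflow = {}
    for name in repositories:
        per_workflow.setdefault(get_workflow_name(name), []).append(name)
    return per_workflow

# e.g. group_by_workflow(['arxiv', 'pubmed'], lambda r: 'harvest_' + r)
# -> {'harvest_arxiv': ['arxiv'], 'harvest_pubmed': ['pubmed']}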
Example #9
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    active_queues = []
    try:
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number", str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output", str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params", str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables", str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump', task_get_task_param('user'), '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        if task_get_option('disable_workers'):
            active_queues = get_queues()
            if active_queues:
                write_message("Suspend workers and wait for any running tasks to complete")
                suspend_queues(active_queues)
                write_message("Workers suspended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        for queue in active_queues:
            enable_queue(queue)
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
Example #10
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in iteritems(references_info['hdl']):
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in iteritems(references_info['record_id']):
        step("Record ID references", thisrecid, done,
             len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)),
                          verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in iteritems(references_info['isbn']):
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(
                    p=report_pattern,
                    f=tags['refs_report_number'],
                    m='r',
                    config=config)
            else:
                recids = get_recids_matching_query(
                    p=reportcode, f=tags['refs_report_number'], config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in iteritems(records_info['hdl']):
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1

        for hdl in hdls:
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (hdl, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in iteritems(records_info['isbn']):
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (isbn, tags['refs_isbn'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in iteritems(records_info['record_id']):
        step("Record ID catchup", thisrecid, done,
             len(records_info['record_id']))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (record_id, tags['refs_record_id'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13 - t1))

    return citations, references
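The add_to_cites / add_to_refs helpers above keep citations and references as inverse mappings for pairs of freshly analyzed records. A small sanity-check sketch of that invariant (check_inverse is hypothetical, not part of the module):

def check_inverse(citations, references, updated_recids):
    """Sanity check: restricted to pairs where both records were just
    (re)analyzed, citations and references are inverse mappings."""
    recids = set(updated_recids)
    for citer in recids:
        for citee in references.get(citer, set()) & recids:
            assert citer in citations.get(citee, set())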
Example #11
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
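            # the third argument is the match mode: 'r' = regular
            # expression, 'e' = exact phrase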
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
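            # (m='a' matches all words of the publication string)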
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search for DOI strings like
            # 10.1007/BF03170733 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove entries whose citation/reference lists are empty; wrap keys
    # in list() so deletion during iteration is safe on Python 3 too
    for k in list(citations.keys()):
        if not citations[k]:
            del citations[k]

    for k in list(references.keys()):
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(iteritems(selfcites), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(iteritems(selfrefs), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(iteritems(authorcites), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites
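
The fragment above leans on an add_to_dicts(citer, citee) helper defined
earlier in the original module. A minimal sketch of what it plausibly
does, assuming the citations, references and citations_weight
dictionaries visible in the return statement (the self-citation
bookkeeping is omitted):

def add_to_dicts(citer, citee):
    """Record that CITER cites CITEE (illustrative sketch only, not the
    original implementation)."""
    if citer == citee:
        return  # skip self-references, as the newer variant below does
    cited_by = citations.setdefault(citee, [])
    if citer not in cited_by:
        cited_by.append(citer)
        # assumption: the weight is a plain cited-by counter
        citations_weight[citee] = citations_weight.get(citee, 0) + 1
    refs = references.setdefault(citer, [])
    if citee not in refs:
        refs.append(citee)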
Example #16
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s",
            (task_get_task_param('task_starting_time'), jobname,))
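
A hedged companion for reading the timestamp back, assuming the same
expJOB schema; the helper name is illustrative and not part of the
original module:

def _get_job_lastrun_time(jobname):
    """Return the lastrun time stored for JOBNAME, or None if the job
    has never been run (illustrative sketch)."""
    res = run_sql("SELECT lastrun FROM expJOB WHERE jobname=%s",
                  (jobname,))
    return res[0][0] if res else None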
Example #17
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)
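
    # Direction of the two helpers as used below: Phases 1-6 call
    # add_to_refs(thisrecid, ...) because thisrecid's own bibliography is
    # being resolved; the "catchup" Phases 7-12 call
    # add_to_cites(..., thisrecid) because other records' bibliographies
    # are searched for thisrecid's identifiers.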

    # citation_informations = (records_info, references_info); each maps an
    # info type (report-numbers, journals, doi, hdl, isbn, record_id) to a
    # dict of recid -> list of values
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g. 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info["report-numbers"]):
        step("Report numbers references", thisrecid, done, len(references_info["report-numbers"]))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = "reportnumber"
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning("multiple-matches", refnumber)
                msg = (
                    "Whoops: record '%d' report number value '%s' "
                    "matches many records; taking only the first one. %s" % (thisrecid, refnumber, repr(recids))
                )
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info["journals"]):
        step("Journal references", thisrecid, done, len(references_info["journals"]))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = "journal"

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning("not-well-formed", p)
                msg = "Whoops, record '%d' reference value '%s' " "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning("multiple-matches", p)
                msg = (
                    "Whoops: record '%d' reference value '%s' "
                    "matches many records; taking only the first one. %s" % (thisrecid, p, repr(recids))
                )
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info["doi"]):
        step("DOI references", thisrecid, done, len(references_info["doi"]))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = "doi"

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning("multiple-matches", p)
                msg = "Whoops: record '%d' DOI value '%s' " "matches many records; taking only the first one. %s" % (
                    thisrecid,
                    p,
                    repr(recids),
                )
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in iteritems(references_info["hdl"]):
        step("HDL references", thisrecid, done, len(references_info["hdl"]))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = "hdl"

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning("multiple-matches", p)
                msg = "Whoops: record '%d' HDL value '%s' " "matches many records; taking only the first one. %s" % (
                    thisrecid,
                    p,
                    repr(recids),
                )
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in iteritems(references_info["record_id"]):
        step("Record ID references", thisrecid, done, len(references_info["record_id"]))
        done += 1
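        # "001" is the MARC control field that holds the record ID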
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (recid, field, list(valid)), verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in iteritems(references_info["isbn"]):
        step("ISBN references", thisrecid, done, len(references_info["isbn"]))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = "isbn"

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning("multiple-matches", p)
                msg = "Whoops: record '%d' ISBN value '%s' " "matches many records; taking only the first one. %s" % (
                    thisrecid,
                    p,
                    repr(recids),
                )
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info["report-numbers"]):
        step("Report numbers catchup", thisrecid, done, len(records_info["report-numbers"]))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith("arXiv"):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r"^%s( *\[[a-zA-Z.-]*\])?" % re.escape(std_reportcode)
                recids = get_recids_matching_query(p=report_pattern, f=tags["refs_report_number"], m="r", config=config)
            else:
                recids = get_recids_matching_query(p=reportcode, f=tags["refs_report_number"], config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info["journals"]):
        step("Journals catchup", thisrecid, done, len(records_info["journals"]))
        done += 1

        for journal in rec_journals:
            journal = journal.replace('"', "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal, f=tags["refs_journal"], config=config)
            write_message("These records match %s in %s: %s" % (journal, tags["refs_journal"], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in iteritems(records_info["doi"]):
        step("DOI catchup", thisrecid, done, len(records_info["doi"]))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi, f=tags["refs_doi"], config=config)
            write_message("These records match %s in %s: %s" % (doi, tags["refs_doi"], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in iteritems(records_info["hdl"]):
        step("HDL catchup", thisrecid, done, len(records_info["hdl"]))
        done += 1

        for hdl in hdls:
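            # HDL references live in the same 999C5a field as DOIs (see
            # Phases 3-4 above), hence the lookup via tags["refs_doi"]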
            recids = get_recids_matching_query(p=hdl, f=tags["refs_doi"], config=config)
            write_message("These records match %s in %s: %s" % (hdl, tags["refs_doi"], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in iteritems(records_info["isbn"]):
        step("ISBN catchup", thisrecid, done, len(records_info["isbn"]))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn, f=tags["refs_isbn"], config=config)
            write_message("These records match %s in %s: %s" % (isbn, tags["refs_isbn"], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in iteritems(records_info["record_id"]):
        step("Record ID catchup", thisrecid, done, len(records_info["record_id"]))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id, f=tags["refs_record_id"], config=config)
            write_message(
                "These records match %s in %s: %s" % (record_id, tags["refs_record_id"], list(recids)), verbose=9
            )

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param("verbose") >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information " "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13 - t1))

    return citations, references
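
A sketch of how ref_analyzer might be driven. The tag values are taken
from the phase comments above (999C5r, 999C5s, 999C5a, 999C5i, 999C50),
the shape of citation_informations is inferred from the lookups inside
the function, and passing config=None is an assumption:

def _empty_info():
    # every info type maps recid -> list of extracted values
    return {'report-numbers': {}, 'journals': {}, 'doi': {},
            'hdl': {}, 'isbn': {}, 'record_id': {}}

tags = {
    'refs_report_number': '999C5r',  # Phase 1 comment
    'refs_journal': '999C5s',        # Phase 2 comment
    'refs_doi': '999C5a',            # Phases 3-4 comments
    'refs_record_id': '999C50',      # Phase 5 comment
    'refs_isbn': '999C5i',           # Phase 6 comment
}

records_info = _empty_info()
references_info = _empty_info()
# record 8 cites these two preprints (example from the Phase 1 comment)
references_info['report-numbers'][8] = ['astro-ph/9889', 'hep-ph/768']

citations, references = ref_analyzer((records_info, references_info),
                                     updated_recids=[8],
                                     tags=tags,
                                     config=None)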