Example #1
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function should be called
    after a rule has been run.
    """

    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return   # We don't want to update the database in this case

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (task_get_task_param('task_starting_time'), rule_name,))
    if not updated: # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, task_get_task_param('task_starting_time')))
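The UPDATE-then-INSERT pattern above is a portable upsert: a falsy result from the UPDATE (seemingly the number of affected rows) means the rule has no row yet, so it is inserted instead. A minimal sketch of the same idiom with Python's sqlite3 module, using a table that mirrors bibcheck_rules and made-up values:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE bibcheck_rules (name TEXT PRIMARY KEY, last_run TEXT)")

def upsert_last_run(name, last_run):
    # Try to update an existing row first.
    cur = conn.execute("UPDATE bibcheck_rules SET last_run=? WHERE name=?",
                       (last_run, name))
    if cur.rowcount == 0:
        # No row matched: the rule is new, so insert it instead.
        conn.execute("INSERT INTO bibcheck_rules (name, last_run) VALUES (?, ?)",
                     (name, last_run))
    conn.commit()

upsert_last_run("my-rule", "2024-01-01 00:00:00")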
Example #2
def iterate_over_new(list, fmt):
    "Iterate over list of IDs"
    global total_rec

    formatted_records = ''      # string of formatted records for this iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
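The t1/t2 pair above reads os.times()[4], which is the elapsed real (wall-clock) time in seconds since a fixed point in the past, so the accumulated tbibformat measures real time spent formatting. The idiom in isolation:

import os
import time

t1 = os.times()[4]    # elapsed real time, seconds since a fixed point
time.sleep(0.1)       # stand-in for format_record() plus the SQL calls
t2 = os.times()[4]
print("took %.2f sec" % (t2 - t1))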
Example #3
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(' ','_') + '.sql'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
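The dump filename is simply the database name, a '-dbdump-' marker and the task starting time with spaces replaced by underscores. A quick sketch with hypothetical values:

CFG_DATABASE_NAME = 'invenio'               # hypothetical configuration value
task_starting_time = '2024-01-01 12:00:00'  # hypothetical starting time
output_fil = (CFG_DATABASE_NAME + '-dbdump-'
              + task_starting_time.replace(' ', '_') + '.sql')
print(output_fil)  # invenio-dbdump-2024-01-01_12:00:00.sql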
Example #4
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## We need the job description for the batch engine
                ## So we need to use the new path inside the oldjobs dir
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
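The key to this polling daemon is that every processed job file is moved out of the incoming directory with shutil.move, so the next scan cannot pick it up again. The skeleton of that pattern, with the Invenio-specific parsing replaced by a stand-in:

import os
import shutil

def drain(new_dir, old_dir):
    for name in os.listdir(new_dir):
        path = os.path.join(new_dir, name)
        # ... parse and execute the job file here ...
        # Move it to the done directory so the next scan skips it.
        shutil.move(path, os.path.join(old_dir, name))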
Example #5
File: dbdump.py Project: bopopescu/invenio
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(
        ' ', '_') + '.sql.gz'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
Example #6
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # string of formatted records for this iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
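Unlike the earlier iterate_over_new, this variant collapses the SELECT-then-UPDATE-or-INSERT dance into a single MySQL REPLACE statement, which deletes any existing row with the same key before inserting the new one. A sketch of the same effect with SQLite's INSERT OR REPLACE and made-up values:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE bibfmt (id_bibrec INTEGER, format TEXT, "
             "value TEXT, PRIMARY KEY (id_bibrec, format))")

# One statement covers both the "row exists" and "row missing" cases.
conn.execute("INSERT OR REPLACE INTO bibfmt VALUES (?, ?, ?)",
             (8, 'HB', 'first blob'))
conn.execute("INSERT OR REPLACE INTO bibfmt VALUES (?, ?, ?)",
             (8, 'HB', 'newer blob'))
print(conn.execute("SELECT value FROM bibfmt").fetchall())  # [('newer blob',)]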
Example #7
File: bibexport.py Project: ppiotr/Invenio
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s", (
        task_get_task_param('task_starting_time'),
        jobname,
    ))
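Note that the values are passed to run_sql as a separate tuple rather than interpolated into the SQL string; the driver handles the quoting, so an unusual jobname cannot break the statement. The same DB-API convention with sqlite3 and made-up values:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE expJOB (jobname TEXT PRIMARY KEY, lastrun TEXT)")
conn.execute("INSERT INTO expJOB VALUES ('sitemap', '1970-01-01 00:00:00')")

# Placeholders in the SQL, values in a separate tuple.
conn.execute("UPDATE expJOB SET lastrun=? WHERE jobname=?",
             ("2024-01-01 12:00:00", "sitemap"))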
Example #8
def get_next_starting_date(rule):
    """Calculate the date the next bibcheck run should consider as initial.

    If no filter has been specified, the date is the time the task was started.
    Otherwise, it is the earliest of the task starting time, the last webcoll
    run and the last bibindex last_updated value, so that records that have
    yet to be categorized are not perpetually ignored.
    """
    def dt(t):
        return datetime.strptime(t, "%Y-%m-%d %H:%M:%S")

    # Upper limit
    task_starting_time = dt(task_get_task_param('task_starting_time'))

    for key, val in rule.iteritems():
        if key.startswith("filter_") and val:
            break
    else:
        return task_starting_time

    # Lower limit
    min_last_updated = run_sql("select min(last_updated) from idxINDEX")[0][0]
    cache_last_updated = dt(get_cache_last_updated_timestamp())

    if not min_last_updated or not cache_last_updated:
        # Some tables have never been initialized. Let's return the Epoch
        return datetime(1970, 1, 1)

    return min(min_last_updated, task_starting_time, cache_last_updated)
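Two details carry the logic here: Python's for ... else runs the else block only when the loop finishes without a break (i.e. no filter_ key was set), and min() over datetime objects picks the earliest instant. Both in isolation, with a hypothetical rule dict:

from datetime import datetime

def dt(t):
    return datetime.strptime(t, "%Y-%m-%d %H:%M:%S")

times = [dt("2024-01-02 00:00:00"), dt("2024-01-01 00:00:00")]
print(min(times))  # 2024-01-01 00:00:00 -- the earliest instant wins

rule = {"name": "demo"}  # hypothetical rule with no filter_* keys
for key, val in rule.items():
    if key.startswith("filter_") and val:
        break
else:
    print("no filter set: fall back to the task starting time")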
Example #9
def check_slave_is_in_consistent_state(connection=None):
    """
    Check whether the slave is already aware that the dbdump task is running.
    Since dbdump is a monotask, this guarantees that no other task is
    currently running, and it is hence safe to detach the slave and start
    the actual dump.
    """
    if connection is None:
        connection = get_connection_for_dump_on_slave()
    i = 0
    ## Let's take the current status of dbdump (e.g. RUNNING, ABOUT TO STOP, etc.)...
    current_status = run_sql("SELECT status FROM schTASK WHERE id=%s", (task_get_task_param('task_id'), ))[0][0]
    while True:
        if i == 10:
            ## Timeout!!
            raise StandardError("The slave seems not to pick up with the master")
        ## ...and let's see if it matches with what the slave sees.
        if run_sql("SELECT status FROM schTASK WHERE id=%s AND status=%s", (task_get_task_param('task_id'), current_status), connection=connection):
            ## Bingo!
            return
        time.sleep(3)
        i += 1
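Stripped of the database specifics, the loop above is a bounded poll: retry a check every few seconds and raise once a fixed number of attempts is exhausted. A generic sketch of that skeleton (check is a stand-in for the slave-side status query):

import time

def wait_until(check, attempts=10, delay=3):
    for _ in range(attempts):
        if check():
            return
        time.sleep(delay)
    raise RuntimeError("timed out waiting for the condition")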
Example #10
def _generate_default_xml_out():
    """Generates the default output xml file directory, corresponding
    to this refextract task id. This will be called in a user specified
    xml out file has not been provided.
    @return: (string) output xml file directory"""
    results_dir = os.path.join(CFG_TMPDIR, "refextract")
    # Write the changes to a temporary file.
    filename = "refextract_task_%d.xml" % task_get_task_param('task_id', 0)
    abs_path = os.path.join(results_dir, filename)
    ## Make the folder, if not exists
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)
    return abs_path
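One caveat: the isdir check and the mkdir are separate steps, so a concurrent task could create the folder in between and make os.mkdir raise. A defensive variant catches the "already exists" error instead of checking first:

import errno
import os

def ensure_dir(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:  # only swallow "already exists"
            raise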
Example #11
def parse_option(key, value, opts, args):
    """
    Elaborate task submission parameter.
    """
    if args:
        # There should be no standalone arguments
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-i', '--id'):
        recids = task_get_task_param('recids')
        if not recids:
            recids = set()
        task_set_task_param('recids', recids)
        recids.update(split_cli_ids_arg(value))
    elif key in ('-a', '--all'):
        task_set_task_param('all', True)

    return True
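A subtle point above: the recids set is stored with task_set_task_param before being updated, which works because the parameter store keeps a reference to the same mutable set, so later updates remain visible. The same aliasing in plain Python, with a dict standing in for the task parameter store:

params = {}  # stand-in for the task parameter store
recids = params.get('recids') or set()
params['recids'] = recids
recids.update({1, 2, 3})  # mutates the stored set in place
print(params['recids'])   # {1, 2, 3}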
Example #12
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option("recids")
    collections = bibtask.task_get_option("collections")
    taxonomy = bibtask.task_get_option("taxonomy")

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids, taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections, taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(bibtask.task_get_task_param("task_starting_time"))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, "w")

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec["recIDs"])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec["collection"] is not None:
            bibtask.write_message(
                "INFO: Applying taxonomy %s to collection %s (%s "
                "records)" % (onto_rec["ontology"], onto_rec["collection"], len(onto_rec["recIDs"])),
                stream=sys.stderr,
                verbose=3,
            )
        else:
            bibtask.write_message(
                "INFO: Applying taxonomy %s to recIDs %s. "
                % (onto_rec["ontology"], ", ".join([str(recid) for recid in onto_rec["recIDs"]])),
                stream=sys.stderr,
                verbose=3,
            )
        if onto_rec["recIDs"]:
            xml = _analyze_documents(onto_rec["recIDs"], onto_rec["ontology"], onto_rec["collection"])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write("</collection>\n")
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results", stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids, stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(bibtask.task_get_task_param("task_starting_time"))
    return 1
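Writing the MARC XML to a temporary file as collections are processed, rather than accumulating it in memory, is what lets this task cope with very large collections. A bare-bones sketch of the streaming pattern (the record chunk is a placeholder):

import tempfile

fo = tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False)
fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
for chunk in ['<record>...</record>\n']:  # one chunk per processed batch
    fo.write(chunk)                       # flushed to disk as we go
fo.write('</collection>\n')
fo.close()
print(fo.name)  # path of the finished collection file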
Example #13
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites
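The workhorse here is add_to_dicts, which keeps three structures consistent per citation edge: the cited record's list of citers, its citation weight, and the citing record's reference list, while skipping self-citations and duplicate edges. The same bookkeeping in isolation:

citations, references, weight = {}, {}, {}

def add(citer, cited):
    if citer == cited:  # never count self-citations
        return
    weight.setdefault(cited, 0)
    if citer not in citations.setdefault(cited, []):
        citations[cited].append(citer)
        weight[cited] += 1
    if cited not in references.setdefault(citer, []):
        references[citer].append(cited)

add(8, 42)
add(8, 42)  # duplicate edge: ignored
print(citations)   # {42: [8]}
print(references)  # {8: [42]}
print(weight)      # {42: 1}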
Example #14
def task_run_core():
    """Run the harvesting task.  The row argument is the oaiharvest task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    reposlist = []
    datelist = []
    dateflag = 0
    possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES]
    filepath_prefix = tmpHARVESTpath + "_" + str(
        task_get_task_param("task_id"))
    ### go ahead: build up the reposlist
    if task_get_option("repository") is not None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in task_get_option("repository"):
            row = get_row_from_reposname(reposname)
            if row == []:
                write_message("source name " + reposname + " is not valid")
                continue
            else:
                reposlist.append(get_row_from_reposname(reposname))
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()

    ### go ahead: check if user requested from-until harvesting
    if task_get_option("dates"):
        ### for each repos simply perform a from-until date harvesting...
        ### no need to update anything
        dateflag = 1
        for element in task_get_option("dates"):
            datelist.append(element)

    error_happened_p = False
    j = 0
    for repos in reposlist:
        j += 1
        task_sleep_now_if_required()
        reponame = str(repos[0][6])
        postmode = str(repos[0][9])
        setspecs = str(repos[0][10])
        harvested_files_list = []
        if postmode in possible_postmodes:
            # Harvest phase
            harvestpath = filepath_prefix + "_" + str(j) + "_" + \
                         time.strftime("%Y%m%d%H%M%S") + "_harvested"
            if dateflag == 1:
                task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \
                                     (reponame, \
                                      str(datelist[0]),
                                      str(datelist[1]),
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       fro=str(datelist[0]),
                                                       until=str(datelist[1]),
                                                       setspecs=setspecs)
                if exit_code == 1:
                    write_message("source " + reponame + \
                                  " was harvested from " + str(datelist[0]) \
                                  + " to " + str(datelist[1]))
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting "
                                  "from source " + reponame +
                                  " for the dates chosen")
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0:
                write_message("source " + reponame + \
                              " was never harvested before - harvesting whole "
                              "repository")
                task_update_progress("Harvesting %s (%i/%i)" % \
                                     (reponame,
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       setspecs=setspecs)
                if exit_code == 1:
                    update_lastrun(repos[0][0])
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting from "
                                  "source " + reponame)
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed,
                ### i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                lastrundate = re.sub(r'\.[0-9]+$', '',
                                     str(repos[0][7]))  # remove trailing .00
                timeinsec = int(repos[0][8]) * 60 * 60
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed == 0 or proceed == -1:  #update needed!
                    write_message("source " + reponame +
                                  " is going to be updated")
                    fromdate = str(repos[0][7])
                    fromdate = fromdate.split()[0]  # get rid of time
                    # of the day for the moment
                    task_update_progress("Harvesting %s (%i/%i)" % \
                                         (reponame,
                                         j, \
                                         len(reposlist)))
                    exit_code, file_list = oai_harvest_get(
                        prefix=repos[0][2],
                        baseurl=repos[0][1],
                        harvestpath=harvestpath,
                        fro=fromdate,
                        setspecs=setspecs)
                    if exit_code == 1:
                        update_lastrun(repos[0][0])
                        harvested_files_list = file_list
                    else:
                        write_message("an error occurred while harvesting "
                                      "from source " + reponame)
                        error_happened_p = True
                        continue
                else:
                    write_message("source " + reponame +
                                  " does not need updating")
                    continue

            elif dateflag != 1 and repos[0][8] == 0:
                write_message("source " + reponame + \
                    " has frequency set to 'Never' so it will not be updated")
                continue

            # Harvesting done, now convert/extract/filter/upload as requested
            if len(harvested_files_list) < 1:
                write_message("No records harvested for %s" % (reponame, ))
                continue
            active_files_list = harvested_files_list
            # Convert phase
            if 'c' in postmode:
                converted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Converting material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    converted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_converted"
                    converted_files_list.append(converted_file)
                    (exitcode,
                     err_msg) = call_bibconvert(config=str(repos[0][5]),
                                                harvestpath=active_file,
                                                convertpath=converted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully converted")
                    else:
                        write_message(
                            "an error occurred while converting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for converted_file in converted_files_list:
                    write_message("File %s contains %i records." % \
                                  (converted_file,
                                   get_nb_records_in_file(converted_file)))
                active_files_list = converted_files_list

            if 'e' in postmode:
                # Download tarball for each harvested/converted record, then run plotextractor.
                # Update converted xml files with generated xml or add it for upload
                extracted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Extracting material harvested from %s (%i/%i)" % \
                                         (reponame, i, len(active_files_list)))
                    extracted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_extracted"
                    extracted_files_list.append(extracted_file)
                    (exitcode,
                     err_msg) = call_plotextractor(active_file, extracted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully extracted")
                    else:
                        write_message(
                            "an error occurred while extracting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for extracted_file in extracted_files_list:
                    write_message("File %s contains %i records." % \
                                  (extracted_file,
                                   get_nb_records_in_file(extracted_file)))
                active_files_list = extracted_files_list

            # Filter-phase
            if 'f' in postmode:
                # first call bibfilter:
                res = 0
                uploaded = False
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Filtering material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    res += call_bibfilter(str(repos[0][11]), active_file)
                if len(active_files_list) > 0:
                    if res == 0:
                        write_message("material harvested from source " +
                                      reponame +
                                      " was successfully bibfiltered")
                    else:
                        write_message("an error occurred while bibfiltering "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
                # print stats:
                for active_file in active_files_list:
                    write_message("File %s contains %i records." % \
                        (active_file + ".insert.xml",
                        get_nb_records_in_file(active_file + ".insert.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".correct.xml",
                        get_nb_records_in_file(active_file + ".correct.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".append.xml",
                        get_nb_records_in_file(active_file + ".append.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".holdingpen.xml",
                        get_nb_records_in_file(active_file + ".holdingpen.xml")))

            # Upload files
            if "u" in postmode:
                if 'f' in postmode:
                    # upload filtered files
                    i = 0
                    for active_file in active_files_list:
                        task_sleep_now_if_required()
                        i += 1
                        if get_nb_records_in_file(active_file +
                                                  ".insert.xml") > 0:
                            task_update_progress("Uploading new records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".insert.xml", \
                                                  ["-i"], oai_src_id = repos[0][0])
                            uploaded = True
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file +
                                                  ".correct.xml") > 0:
                            task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".correct.xml", \
                                                  ["-c"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file +
                                                  ".append.xml") > 0:
                            task_update_progress("Uploading additions for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".append.xml", \
                                                  ["-a"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file +
                                                  ".holdingpen.xml") > 0:
                            task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".holdingpen.xml", \
                                                  ["-o"], oai_src_id = repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message(
                                    "material harvested from source " +
                                    reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue
                else:
                    # upload files normally
                    res = 0
                    i = 0
                    uploaded = False
                    for active_file in active_files_list:
                        i += 1
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file) > 0:
                            task_update_progress("Uploading records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file,
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message(
                                    "material harvested from source " +
                                    reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue

        else:  ### this should not happen
            write_message("invalid postprocess mode: " + postmode +
                          " skipping repository")
            error_happened_p = True
            continue

    if error_happened_p:
        return False
    else:
        return True
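The get_nb_records_in_file() helper used above is only needed to count records in the generated MARCXML files. A minimal sketch of such a helper, assuming plain MARCXML input (the body is an assumption, not necessarily Invenio's actual implementation):

def get_nb_records_in_file(filename):
    """Return the number of MARCXML <record> elements in FILENAME,
    or 0 if the file does not exist or cannot be read."""
    try:
        return open(filename).read().count('</record>')
    except IOError:
        return 0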
Example #20
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    try:
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number", str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output", str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params", str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables", str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump', task_get_task_param('user'), '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
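The pruning step relies on _delete_old_dumps() to keep only the most recent output_num dump files matching output_file_prefix. A sketch under that assumption (write_message is the logging helper used throughout; the slicing detail is ours):

import os

def _delete_old_dumps(dirname, filename_prefix, number_to_keep):
    """Delete files in DIRNAME that start with FILENAME_PREFIX, keeping
    only the NUMBER_TO_KEEP most recent ones.  The dump file names embed
    the task starting time, so alphabetical order is chronological order."""
    files = sorted(f for f in os.listdir(dirname)
                   if f.startswith(filename_prefix))
    # max(..., 0) also handles number_to_keep == 0, where a plain
    # files[:-number_to_keep] slice would silently delete nothing
    for afile in files[:max(len(files) - number_to_keep, 0)]:
        write_message("... deleting %s" % os.path.join(dirname, afile))
        os.remove(os.path.join(dirname, afile))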
Example #21
def _task_run_core():
    """Runs anayse_documents for each ontology, collection, record ids
    set."""
    automated_daemon_mode_p = True
    recids = task_get_option('recids')
    collections = task_get_option('collections')
    taxonomy = task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(task_get_task_param('task_starting_time'))
        return 1

    changes = []
    changes.append('<?xml version="1.0" encoding="UTF-8"?>')
    changes.append('<collection xmlns="http://www.loc.gov/MARC21/slim">')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    for onto_rec in onto_recids:
        task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            write_message('INFO: Applying taxonomy %s to collection %s (%s '
                          'records)' %
                          (onto_rec['ontology'], onto_rec['collection'],
                           len(onto_rec['recIDs'])),
                          stream=sys.stderr,
                          verbose=3)
        else:
            write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                          (onto_rec['ontology'], ', '.join(
                              [str(recid) for recid in onto_rec['recIDs']])),
                          stream=sys.stderr,
                          verbose=3)

        if onto_rec['recIDs']:
            changes.append(
                _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                   onto_rec['collection']))

    changes.append('</collection>')

    # Write the changes to a temporary file.
    tmp_directory = "%s/bibclassify" % CFG_TMPDIR
    filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
                                                     time.localtime())
    abs_path = os.path.join(tmp_directory, filename)

    if not os.path.isdir(tmp_directory):
        os.mkdir(tmp_directory)

    file_desc = open(abs_path, "w")
    file_desc.write('\n'.join(changes))
    file_desc.close()

    # Apply the changes.
    if changes:
        cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        errcode = 0
        try:
            errcode = os.system(cmd)
        except OSError, exc:
            write_message('ERROR: Command %s failed [%s].' % (cmd, exc),
                          stream=sys.stderr,
                          verbose=0)
        if errcode != 0:
            write_message("ERROR: %s failed, error code is %d." %
                          (cmd, errcode),
                          stream=sys.stderr,
                          verbose=0)
            return 0
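The upload step above interpolates abs_path into a shell command string for os.system(). A sketch of the same invocation through the standard subprocess module (an alternative, not what the original code uses), which sidesteps shell quoting entirely; CFG_BINDIR is the same binary directory as above:

import os
import subprocess

def run_bibupload_safely(abs_path):
    """Invoke bibupload with the same flags as the os.system() call
    above, but without a shell, so whitespace or quotes in ABS_PATH
    cannot break the command line."""
    return subprocess.call([os.path.join(CFG_BINDIR, "bibupload"),
                            "-n", "-c", abs_path])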
Example #22
        cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        errcode = 0
        try:
            errcode = os.system(cmd)
        except OSError, exc:
            write_message('ERROR: Command %s failed [%s].' % (cmd, exc),
                stream=sys.stderr, verbose=0)
        if errcode != 0:
            write_message("ERROR: %s failed, error code is %d." %
                (cmd, errcode), stream=sys.stderr, verbose=0)
            return 0

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(task_get_task_param('task_starting_time'))
    return 1

def _analyze_documents(records, ontology, collection):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding ontology."""
    global _INDEX

    if not records:
        # No records could be found.
        write_message("WARNING: No record were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
Example #23
def _task_run_core():
    """Runs anayse_documents for each ontology, collection, record ids
    set."""
    automated_daemon_mode_p = True
    recids = task_get_option('recids')
    collections = task_get_option('collections')
    taxonomy = task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
            taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
            taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(task_get_task_param('task_starting_time'))
        return 1

    changes = []
    changes.append('<?xml version="1.0" encoding="UTF-8"?>')
    changes.append('<collection xmlns="http://www.loc.gov/MARC21/slim">')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    for onto_rec in onto_recids:
        task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            write_message('INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3)
        else:
            write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'],
                ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)

        if onto_rec['recIDs']:
            changes.append(_analyze_documents(onto_rec['recIDs'],
                onto_rec['ontology'], onto_rec['collection']))

    changes.append('</collection>')

    # Write the changes to a temporary file.
    tmp_directory = "%s/bibclassify" % CFG_TMPDIR
    filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
        time.localtime())
    abs_path = os.path.join(tmp_directory, filename)

    if not os.path.isdir(tmp_directory):
        os.mkdir(tmp_directory)

    file_desc = open(abs_path, "w")
    file_desc.write('\n'.join(changes))
    file_desc.close()

    # Apply the changes.
    if changes:
        cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        errcode = 0
        try:
            errcode = os.system(cmd)
        except OSError, exc:
            write_message('ERROR: Command %s failed [%s].' % (cmd, exc),
                stream=sys.stderr, verbose=0)
        if errcode != 0:
            write_message("ERROR: %s failed, error code is %d." %
                (cmd, errcode), stream=sys.stderr, verbose=0)
            return 0
Example #24
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s"
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s"
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in references_info['hdl'].iteritems():
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'

            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s"
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in references_info['record_id'].iteritems():
        step("Record ID references", thisrecid, done, len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s"
                                 % (recid, field, list(valid)), verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in references_info['isbn'].iteritems():
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'

            recids = get_recids_matching_query(p=p,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s"
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(p=report_pattern,
                                                   f=tags['refs_report_number'],
                                                   m='r',
                                                   config=config)
            else:
                recids = get_recids_matching_query(p=reportcode,
                                                   f=tags['refs_report_number'],
                                                   config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s"
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s"
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in records_info['hdl'].iteritems():
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1

        for hdl in hdls:
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s"
                            % (hdl, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in records_info['isbn'].iteritems():
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s"
                            % (isbn, tags['refs_isbn'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in records_info['record_id'].iteritems():
        step("Record ID catchup", thisrecid, done, len(records_info['record_id']))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s"
                            % (record_id, tags['refs_record_id'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking ref HDL: %.2f sec" % (t5-t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6-t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7-t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8-t7))
    write_message("... checking rec journals: %.2f sec" % (t9-t8))
    write_message("... checking rec DOI: %.2f sec" % (t10-t9))
    write_message("... checking rec HDL: %.2f sec" % (t11-t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12-t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13-t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13-t1))

    return citations, references
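The nested add_to_cites()/add_to_refs() helpers keep the two dictionaries mutually consistent: whenever X is recorded as citing Y, Y ends up in references[X] and X in citations[Y], restricted to the records being (re)indexed. A toy illustration of that invariant, detached from any database:

updated_recids = set([1, 2])
citations = dict((r, set()) for r in updated_recids)   # recid -> set of citers
references = dict((r, set()) for r in updated_recids)  # recid -> set of citees

def add_to_refs(citer, citee):
    # mirrors the helper in ref_analyzer(): self-citations are skipped,
    # and citations[] is only maintained for records being (re)indexed
    if citer == citee:
        return
    if citee in updated_recids:
        citations[citee].add(citer)
    references[citer].add(citee)

add_to_refs(1, 2)
assert 2 in references[1] and 1 in citations[2]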
Example #25
            errcode = os.system(cmd)
        except OSError, exc:
            write_message('ERROR: Command %s failed [%s].' % (cmd, exc),
                          stream=sys.stderr,
                          verbose=0)
        if errcode != 0:
            write_message("ERROR: %s failed, error code is %d." %
                          (cmd, errcode),
                          stream=sys.stderr,
                          verbose=0)
            return 0

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(task_get_task_param('task_starting_time'))
    return 1


def _analyze_documents(records, ontology, collection):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding ontology."""
    global _INDEX

    if not records:
        # No records could be found.
        write_message("WARNING: No record were found in collection %s." %
                      collection,
                      stream=sys.stderr,
                      verbose=2)
        return False
Example #26
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'], ', '.join(
                    [str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr,
                verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
Example #27
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s",
            (task_get_task_param('task_starting_time'), jobname,))
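A hypothetical call site ('sitemap' is a made-up job name, purely for illustration): after a job's work completes, the daemon records the run so that the next invocation can restrict itself to newer material:

# 'sitemap' is a hypothetical job name, for illustration only
_update_job_lastrun_time('sitemap')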
Example #28
def get_citation_informations(recid_list, config):
    """scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a 4 list of dictionaries that contains the citation information
       of cds records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {}  #dict of recid -> institute-given-report-code
    d_references_report_numbers = {}  #dict of recid -> ['astro-ph/xyz']
    d_references_s = {}  #dict of recid -> entries of this rec's bibliography
    d_records_s = {}  #dict of recid -> this record's publication info
    citation_informations = []

    write_message("config function " + config.get("rank_method", "function"),
                  verbose=9)
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(
            prefix="cfg section [rank_method] has no attribute called function",
            alert_admin=True)
        #we cannot continue
        return [{}, {}, {}, {}]
    record_pri_number_tag = ""
    try:
        record_pri_number_tag = config.get(function, "primary_report_number")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attribute primary_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]
    record_add_number_tag = ""
    try:
        record_add_number_tag = config.get(
            config.get("rank_method", "function"), "additional_report_number")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute additional_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]

    reference_number_tag = ""
    try:
        reference_number_tag = config.get(
            config.get("rank_method", "function"),
            "reference_via_report_number")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute reference_via_report_number",
                           alert_admin=True)
        return [{}, {}, {}, {}]

    reference_tag = ""
    try:
        reference_tag = config.get(config.get("rank_method", "function"),
                                   "reference_via_pubinfo")
    except:
        register_exception(prefix="config error. cfg section " + function +
                           " has no attribute reference_via_pubinfo",
                           alert_admin=True)
        return [{}, {}, {}, {}]

    p_record_pri_number_tag = tagify(parse_tag(record_pri_number_tag))
    #037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(record_add_number_tag))
    #088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(reference_number_tag))
    #999C5r. this is in the reference list, refers to other records. Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(reference_tag))
    #999C5s. A standardized way of writing a reference in the reference list. Like: Nucl. Phys. B 710 (2000) 371
    #fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function,
                                               "pubinfo_journal_format")
    except:
        pass

    #print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag " +
                      str(p_record_pri_number_tag))
        write_message("p_reference_tag " + str(p_reference_tag))
        write_message("publication_journal_tag " +
                      str(publication_journal_tag))
        write_message("publication_format_string is " +
                      publication_format_string)
    done = 0  #for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_tag[0:2],
               (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_number_tag[0:2],
               (p_reference_number_tag,)):
        for recid in recid_list:
            if (done % 10 == 0):
                task_sleep_now_if_required()
                #in fact we can sleep any time here

            if (done % 1000 == 0):
                mesg = "get cit.inf done " + str(done) + " of " + str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done + 1

            pri_report_numbers = get_fieldvalues(recid,
                                                 p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid,
                                                 p_record_add_number_tag)
            reference_report_numbers = get_fieldvalues(recid,
                                                       p_reference_number_tag)
            references_s = get_fieldvalues(recid, p_reference_tag)

            l_report_numbers = pri_report_numbers
            l_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = l_report_numbers

            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            write_message(str(recid) + "'s " + str(p_reference_tag) +
                          " values " + str(references_s),
                          verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            #get a combination of
            #journal vol (year) pages
            if publication_pages_tag and publication_journal_tag and \
                 publication_volume_tag and publication_year_tag and publication_format_string:
                tagsvalues = {}  #we store the tags and their values here
                #like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues["p"] = ""
                tagsvalues["y"] = ""
                tagsvalues["c"] = ""
                tagsvalues["v"] = ""
                tmp = get_fieldvalues(recid, publication_journal_tag)
                if tmp:
                    tagsvalues["p"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_volume_tag)
                if tmp:
                    tagsvalues["v"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_year_tag)
                if tmp:
                    tagsvalues["y"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    #if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages
                #format the publ infostring according to the format
                publ = ""
                ok = 1
                for i in range(0, len(publication_format_string)):
                    current = publication_format_string[i]
                    #these are supported
                    if current == "p" or current == "c" or current == "v" \
                                      or current == "y":
                        if tagsvalues[current]:
                            #add the value in the string
                            publ += tagsvalues[current]
                        else:
                            ok = 0
                            break  #it was needed and not found
                    else:
                        publ += current  #just add the character in the format string
                if ok:
                    write_message("d_records_s (publication info) for " +
                                  str(recid) + " is " + publ,
                                  verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag + " or " + p_reference_tag + ". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations.append(d_reports_numbers)
    citation_informations.append(d_references_report_numbers)
    citation_informations.append(d_references_s)
    citation_informations.append(d_records_s)
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations
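The loop near the end expands publication_format_string by substituting the letters p, v, y and c with the collected tag values and copying every other character verbatim. The same logic as a standalone sketch (expand_pubinfo is a made-up name):

def expand_pubinfo(format_string, tagsvalues):
    """Expand e.g. 'p v (y) c' using TAGSVALUES such as
    {'p': 'Phys. Lett.', 'v': 'B 482', 'y': '2000', 'c': '417'};
    return None when a required component is missing."""
    publ = ""
    for current in format_string:
        if current in ('p', 'c', 'v', 'y'):
            if not tagsvalues.get(current):
                return None  # a needed value was not found
            publ += tagsvalues[current]
        else:
            publ += current  # literal character from the format string
    return publ

# expand_pubinfo("p v (y) c", {'p': 'Phys. Lett.', 'v': 'B 482',
#                              'y': '2000', 'c': '417'})
# returns 'Phys. Lett. B 482 (2000) 417'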
Example #29
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    if task_get_option("quick") == "no":
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used", verbose=3)

    last_modified_records = get_last_modified_rec(last_update_time)
    #id option forces re-indexing a certain range even if there are no new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            #construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            #make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last+1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)

        write_message("Last update "+str(last_update_time)+" records: "+ \
                       str(len(last_modified_records))+" updates: "+ \
                       str(len(updated_recid_list)))

        #write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)

        #result_intermediate should be guaranteed to exist!
        #but if the user entered a "-R" (do all) option, we need to
        #make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]
        else:
            # check indexing times of `journal' and `reportnumber`
            # indexes, since if they are not up to date yet, then we
            # should wait and not run citation indexing as of yet:
            last_timestamp_bibrec = run_sql("SELECT DATE_FORMAT(MAX(modification_date), '%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec", (), 1)[0][0]
            last_timestamp_indexes = run_sql("SELECT DATE_FORMAT(MAX(last_updated), '%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)", ('journal', 'reportnumber'), 1)[0][0]
            if not last_timestamp_indexes or \
               not last_timestamp_bibrec or \
               last_timestamp_bibrec > last_timestamp_indexes:
                write_message("Not running citation indexer since journal/reportnumber indexes are not up to date yet.")
                return {}

        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]

        # Enrich updated_recid_list so that it would contain also
        # records citing or referring to updated records, so that
        # their citation information would be updated too.  Not the
        # most efficient way to treat this problem, but the one that
        # requires least code changes until ref_analyzer() is more
        # nicely re-factored.
        updated_recid_list_set = intbitset(updated_recid_list)
        for somerecid in updated_recid_list:
            # add both citers and citees:
            updated_recid_list_set |= intbitset(citation_list_intermediate.get(somerecid, []))
            updated_recid_list_set |= intbitset(reference_list_intermediate.get(somerecid, []))
        updated_recid_list = list(updated_recid_list_set)

        #call the procedure that does the hard work by reading fields of
        #citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(updated_recid_list,
                                                          config)
        #write_message("citation_informations: "+str(citation_informations))
        #create_analysis_tables() #temporary..
                                  #test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        #call the analyser that uses the citation_informations to really
        #search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate,
                           config, updated_recid_list)
        #dic is docid -> number of references, like {1: 2, 2: 0, 3: 1}
        #write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" % (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message("No new records added since last time this rank method was executed")
    return dic
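The enrichment step above widens the set of records to re-index with everything that cites, or is cited by, an updated record, so their counts stay consistent. A toy sketch of the same set arithmetic using plain sets (intbitset behaves like a set of integers here):

updated = set([8])
citation_dict = {8: [3, 5]}   # record 8 is cited by 3 and 5
reference_dict = {8: [2]}     # record 8 cites 2

enriched = set(updated)
for recid in updated:
    enriched |= set(citation_dict.get(recid, []))
    enriched |= set(reference_dict.get(recid, []))
# enriched == set([8, 3, 5, 2]): citers and citees get re-indexed too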
Example #30
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist, config, updated_rec_list):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(
            prefix="cfg section [rank_method] has no attr function",
            alert_admin=True)
        return {}

    pubrefntag = ""
    try:
        pubrefntag = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_report_number",
                           alert_admin=True)
        return {}

    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_pubinfo",
                           alert_admin=True)
        return {}

    #pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag " + pubrefntag)
        write_message("pubreftag " + pubreftag)

    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[0]  #dict of recid -> institute_give_publ_id
    d_references_report_numbers = citation_informations[1]  #dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2]  #dict of recid -> publication_infos_in_its_bibliography
    d_records_s = citation_informations[3]  #recid -> its publication info
    t1 = os.times()[4]

    write_message("Phase 1: d_references_report_numbers")
    #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    #meaning: rec 8 contains these in bibliography

    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_report_numbers done " + str(
                done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            #write to db!
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            #it's ok to sleep too, we got something done
            task_sleep_now_if_required()
        done = done + 1

        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                #sanitise p
                p.replace("\n", '')
                #search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    #append unless this key already has the item
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        #and update result
                        result[rec_ids[0]] += 1

                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    #the reference was not found among our records; record it
                    #as "missing". A bare report number like gfhgf/1254312
                    #reads badly, so fetch the corresponding 999C5s (full
                    #reference) too when available. This should really be done
                    #in the next loop over d_references_s, but the 999C5s
                    #fields are not yet normalized.

                    #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1])
                    rectext = ""  # print_record() call disabled to speed things up
                    lines = rectext.split("\n")
                    rpart = p  #fallback: use the report number itself
                    for l in lines:
                        if l.find(p) > 0:
                            #the report number was found; extract its $s part
                            st = l.find('$s')
                            if st > 0:
                                #find('$', st) would match the '$' at st itself,
                                #so search from st + 2 for the next subfield
                                end = l.find('$', st + 2)
                                if end == -1:
                                    end = len(l)
                                rpart = l[st + 2:end]
                    insert_into_missing(thisrecid, rpart)

    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            #write to db!
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            task_sleep_now_if_required()

        done = done + 1

        for refs in refss:
            if refs:
                p = refs
                #remove the latter page number if it is like 67-74
                matches = re.compile(r"(.*)(-\d+$)").findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                rec_ids = None
                try:
                    rec_ids = list(search_unit(p, 'journal'))
                except:
                    rec_ids = None
                write_message("These match searching " + p + " in journal: " +
                              str(rec_ids),
                              verbose=9)
                if rec_ids and rec_ids[0]:
                    #the referred publication is in our collection, remove
                    #from missing
                    remove_from_missing(p)
                else:
                    #it was not found, so add it to missing
                    insert_into_missing(thisrecid, p)
                #check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    #the above should always hold
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(
                            thisrecid)  #append actual list
                        result[rec_ids[0]] += 1  #add count for this..

                    #update reference_list accordingly
                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")

    #search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1

        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []

                if rec_ids:
                    for recid in rec_ids:
                        #normal checks..
                        if not citation_list.has_key(thisrecid):
                            citation_list[thisrecid] = []
                        if not reference_list.has_key(recid):
                            reference_list[recid] = []
                        if not result.has_key(thisrecid):
                            result[thisrecid] = 0

                        #normal updates
                        if not recid in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if not thisrecid in reference_list[recid]:
                            reference_list[recid].append(thisrecid)

    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    #find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1
        p = recs.replace("\"", "")
        #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a'))
        write_message("These records match " + p + " in " + pubreftag + " : " +
                      str(rec_ids),
                      verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                #normal checks
                if not result.has_key(thisrecid):
                    result[thisrecid] = 0
                if not citation_list.has_key(thisrecid):
                    citation_list[thisrecid] = []
                if not reference_list.has_key(rec_id):
                    reference_list[rec_id] = []

                if not rec_id in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if not thisrecid in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)

    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")

    #remove empty lists in citation and reference
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]

    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    #get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    #add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message(
            "Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                     initial_self_dict, config)
    #selfdic consists of
    #key k -> list of values [v1,v2,..]
    #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2..

    #create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if selfcitedbydic.has_key(v):
                tmplist = selfcitedbydic[v]
                if not k in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist
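    #illustration with hypothetical recids: selfdic {10: [20, 30], 40: [20]}
    #inverts to selfcitedbydic {20: [10, 40], 30: [10]}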

    write_message("Getting author citations")

    #get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    acit = task_get_option("author-citations")
    if not acit:
        print "Author cites disabled. Use -A option to enable it."
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                            initial_author_dict, config)

    if task_get_task_param('verbose') >= 3:
        #print only the first few entries to prevent flooding the log
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): " + str(tmpdict))
        write_message("size: " + str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): " + str(tmpdict))
        write_message("size: " + str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): " + str(
            tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): " +
                      str(tmpdict))
        write_message("size: " + str(len(authorcitdic.keys())))
    insert_cit_ref_list_intodb(citation_list, reference_list, selfcitedbydic,
                               selfdic, authorcitdic)

    t5 = os.times()[4]

    write_message(
        "Execution time for analyzing the citation information and "
        "generating the dictionaries:")
    write_message("... checking ref number: %.2f sec" % (t2 - t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3 - t2))
    write_message("... checking rec number: %.2f sec" % (t4 - t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5 - t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5 - t1))

    return result
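
#The three dictionaries maintained above stay in sync per citation edge.
#A compact sketch of that bookkeeping (hypothetical recids, plain dicts,
#not the Invenio API):
def _record_citation_sketch(result, citation_list, reference_list, citer, cited):
    """Register 'citer cites cited' in the weight/citation/reference dicts."""
    citation_list.setdefault(cited, [])
    reference_list.setdefault(citer, [])
    result.setdefault(cited, 0)
    if citer not in citation_list[cited]:
        citation_list[cited].append(citer)
        result[cited] += 1
    if cited not in reference_list[citer]:
        reference_list[citer].append(cited)

#usage:
#    result, cites, refs = {}, {}, {}
#    _record_citation_sketch(result, cites, refs, 8, 5)
#    #result == {5: 1}, cites == {5: [8]}, refs == {8: [5]}
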
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    if task_get_option("quick") == "no":
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used", verbose=3)

    last_modified_records = get_last_modified_rec(last_update_time)
    #id option forces re-indexing a certain range even if there are no new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            #construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            #make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last+1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)

        write_message("Last update "+str(last_update_time)+" records: "+ \
                       str(len(last_modified_records))+" updates: "+ \
                       str(len(updated_recid_list)))

        #write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)

        #result_intermediate should be guaranteed to exist,
        #but if the user entered a "-R" (do all) option, we need to
        #make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]

        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]

        #call the procedure that does the hard work by reading fields of
        #citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(updated_recid_list,
                                                          config)
        #write_message("citation_informations: "+str(citation_informations))
        #create_analysis_tables() #temporary..
                                  #test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        #call the analyser that uses the citation_informations to really
        #search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate,
                           config, updated_recid_list)
        #dic is docid-numberofreferences like {1: 2, 2: 0, 3: 1}
        #write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" % (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message("No new records added since last time this rank method was executed")
    return dic
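
#A minimal sketch (hypothetical data, not the Invenio API) of the quick vs.
#thorough decision above: quick mode reindexes only records modified since
#the last run, thorough mode ("-R") reindexes everything.
def _records_to_reindex_sketch(records, last_update_time, thorough=False):
    """records: list of (recid, modified) pairs; returns recids to reindex."""
    if thorough:
        return [recid for recid, _ in records]
    return [recid for recid, modified in records if modified > last_update_time]

#usage:
#_records_to_reindex_sketch([(1, '2011-02-03'), (2, '2010-01-01')],
#                           '2010-12-31')  #-> [1]
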
Example #32
def task_run_core():
    """Run the harvesting task.  The row argument is the oaiharvest task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    reposlist = []
    datelist = []
    dateflag = 0
    possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES]
    filepath_prefix = tmpHARVESTpath + "_" + str(task_get_task_param("task_id"))
    ### go ahead: build up the reposlist
    if task_get_option("repository") is not None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in task_get_option("repository"):
            row = get_row_from_reposname(reposname)
            if row == []:
                write_message("source name " + reposname + " is not valid")
                continue
            else:
                reposlist.append(row)
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()

    ### go ahead: check if user requested from-until harvesting
    if task_get_option("dates"):
        ### for each repos simply perform a from-until date harvesting...
        ### no need to update anything
        dateflag = 1
        for element in task_get_option("dates"):
            datelist.append(element)

    error_happened_p = False
    j = 0
    for repos in reposlist:
        j += 1
        task_sleep_now_if_required()
        reponame = str(repos[0][6])
        postmode = str(repos[0][9])
        setspecs = str(repos[0][10])
        harvested_files_list = []
        if postmode in possible_postmodes:
            # Harvest phase
            harvestpath = filepath_prefix + "_" + str(j) + "_" + \
                         time.strftime("%Y%m%d%H%M%S") + "_harvested"
            if dateflag == 1:
                task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \
                                     (reponame, \
                                      str(datelist[0]),
                                      str(datelist[1]),
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix = repos[0][2],
                                      baseurl = repos[0][1],
                                      harvestpath = harvestpath,
                                      fro = str(datelist[0]),
                                      until = str(datelist[1]),
                                      setspecs = setspecs)
                if exit_code == 1:
                    write_message("source " + reponame + \
                                  " was harvested from " + str(datelist[0]) \
                                  + " to " + str(datelist[1]))
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting "
                        "from source " +
                        reponame + " for the dates chosen")
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0:
                write_message("source " + reponame + \
                              " was never harvested before - harvesting whole "
                              "repository")
                task_update_progress("Harvesting %s (%i/%i)" % \
                                     (reponame,
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix = repos[0][2],
                                      baseurl = repos[0][1],
                                      harvestpath = harvestpath,
                                      setspecs = setspecs)
                if exit_code == 1:
                    update_lastrun(repos[0][0])
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting from "
                        "source " + reponame)
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed,
                ### i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                lastrundate = re.sub(r'\.[0-9]+$', '',
                    str(repos[0][7])) # remove trailing .00
                timeinsec = int(repos[0][8]) * 60 * 60
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed == 0 or proceed == -1:  #update needed
                    write_message("source " + reponame +
                        " is going to be updated")
                    fromdate = str(repos[0][7])
                    fromdate = fromdate.split()[0] # get rid of time
                                                   # of the day for the moment
                    task_update_progress("Harvesting %s (%i/%i)" % \
                                         (reponame,
                                         j, \
                                         len(reposlist)))
                    exit_code, file_list = oai_harvest_get(prefix = repos[0][2],
                                          baseurl = repos[0][1],
                                          harvestpath = harvestpath,
                                          fro = fromdate,
                                          setspecs = setspecs)
                    if exit_code == 1:
                        update_lastrun(repos[0][0])
                        harvested_files_list = file_list
                    else:
                        write_message("an error occurred while harvesting "
                            "from source " + reponame)
                        error_happened_p = True
                        continue
                else:
                    write_message("source " + reponame +
                        " does not need updating")
                    continue

            elif dateflag != 1 and repos[0][8] == 0:
                write_message("source " + reponame + \
                    " has frequency set to 'Never' so it will not be updated")
                continue

            # Harvesting done, now convert/extract/filter/upload as requested
            if len(harvested_files_list) < 1:
                write_message("No records harvested for %s" % (reponame,))
                continue
            active_files_list = harvested_files_list
            # Convert phase
            if 'c' in postmode:
                converted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Converting material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    converted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_converted"
                    converted_files_list.append(converted_file)
                    (exitcode, err_msg) = call_bibconvert(config = str(repos[0][5]),
                                                          harvestpath = active_file,
                                                          convertpath = converted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                            reponame + " was successfully converted")
                    else:
                        write_message("an error occurred while converting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for converted_file in converted_files_list:
                    write_message("File %s contains %i records." % \
                                  (converted_file,
                                   get_nb_records_in_file(converted_file)))
                active_files_list = converted_files_list

            if 'e' in postmode:
                # Download tarball for each harvested/converted record, then run plotextractor.
                # Update converted xml files with generated xml or add it for upload
                extracted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Extracting material harvested from %s (%i/%i)" % \
                                         (reponame, i, len(active_files_list)))
                    extracted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_extracted"
                    extracted_files_list.append(extracted_file)
                    (exitcode, err_msg) = call_plotextractor(active_file,
                                                             extracted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                            reponame + " was successfully extracted")
                    else:
                        write_message("an error occurred while extracting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for extracted_file in extracted_files_list:
                    write_message("File %s contains %i records." % \
                                  (extracted_file,
                                   get_nb_records_in_file(extracted_file)))
                active_files_list = extracted_files_list

            # Filter-phase
            if 'f' in postmode:
                # first call bibfilter:
                res = 0
                uploaded = False
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Filtering material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    res += call_bibfilter(str(repos[0][11]), active_file)
                if len(active_files_list) > 0:
                    if res == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully bibfiltered")
                    else:
                        write_message("an error occurred while bibfiltering "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
                # print stats:
                for active_file in active_files_list:
                    write_message("File %s contains %i records." % \
                        (active_file + ".insert.xml",
                        get_nb_records_in_file(active_file + ".insert.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".correct.xml",
                        get_nb_records_in_file(active_file + ".correct.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".append.xml",
                        get_nb_records_in_file(active_file + ".append.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".holdingpen.xml",
                        get_nb_records_in_file(active_file + ".holdingpen.xml")))

            # Upload files
            if "u" in postmode:
                if 'f' in postmode:
                    # upload filtered files
                    i = 0
                    for active_file in active_files_list:
                        task_sleep_now_if_required()
                        i += 1
                        if get_nb_records_in_file(active_file + ".insert.xml") > 0:
                            task_update_progress("Uploading new records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".insert.xml", \
                                                  ["-i"], oai_src_id = repos[0][0])
                            uploaded = True
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file + ".correct.xml") > 0:
                            task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".correct.xml", \
                                                  ["-c"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".append.xml") > 0:
                            task_update_progress("Uploading additions for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".append.xml", \
                                                  ["-a"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".holdingpen.xml") > 0:
                            task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".holdingpen.xml", \
                                                  ["-o"], oai_src_id = repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message("material harvested from source " +
                                              reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue
                else:
                    # upload files normally
                    res = 0
                    i = 0
                    uploaded = False
                    for active_file in active_files_list:
                        i += 1
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file) > 0:
                            task_update_progress("Uploading records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file, oai_src_id = repos[0][0])
                            uploaded = True
                    #report once per repository, after the upload loop,
                    #mirroring the filtered-upload branch above
                    if res == 0:
                        if uploaded:
                            write_message("material harvested from source " +
                                          reponame + " was successfully uploaded")
                        else:
                            write_message("nothing to upload")
                    else:
                        write_message("an error occurred while uploading "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue

        else: ### this should not happen
            write_message("invalid postprocess mode: " + postmode +
                " skipping repository")
            error_happened_p = True
            continue

    if error_happened_p:
        return False
    else:
        return True
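
#A standalone sketch of the "lastrun + frequency > now" scheduling check
#performed above via add_timestamp_and_timelag and
#compare_timestamps_with_tolerance; datetime stand-ins and hypothetical
#values, not the oaiharvest helpers:
import datetime

def _harvest_is_due_sketch(lastrun, frequency_hours):
    """Return True when lastrun lies more than frequency_hours in the past."""
    if lastrun is None:
        return True  #never harvested before: harvest the whole repository
    due = lastrun + datetime.timedelta(hours=frequency_hours)
    return datetime.datetime.now() >= due

#usage:
#_harvest_is_due_sketch(datetime.datetime(2011, 1, 1), 24)
##-> True for any current date after 2011-01-02
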
Example #33
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict,
                         config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.
       The keys of this are author names. An entry like "Apollinaire"->[1,2,3] means
       Apollinaire is cited in records 1,2 and 3.
       Input: citedbydict, updated_redic_list = records to be searched, initial_author_dict:
              the dicts from the database.
       Output: authorciteddict. It is initially set to initial_author_dict
    """

    #the tag lookup below repeats what other functions in this module do
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            x = config.get(config.get("rank_method", "function"), t)
            tagvals[t] = x
        except:
            register_exception(prefix="attribute " + t + " missing in config",
                               alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag " + mainauthortag)
        write_message("coauthortag " + coauthortag)
        write_message("extauthortag " + extauthortag)

    author_cited_in = initial_author_dict
    if citedbydict:
        i = 0  #just a counter for debug
        write_message("Checking records referred to in new records")
        for u in updated_redic_list:
            if (i % 1000 == 0):
                mesg = "Author ref done " + str(i) + " of " + str(
                    len(updated_redic_list)) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            if citedbydict.has_key(u):
                these_cite_k = citedbydict[u]
                if (these_cite_k is None):
                    these_cite_k = []  #verify it is an empty list, not None
                authors = get_fieldvalues(u, mainauthortag)
                coauthl = get_fieldvalues(u, coauthortag)
                extauthl = get_fieldvalues(u, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        tmplist = author_cited_in[a]
                        for citer in these_cite_k:
                            if tmplist.count(citer) == 0:
                                tmplist.append(citer)
                        author_cited_in[a] = tmplist
                    elif a:
                        #first citing records recorded for this author;
                        #copy to avoid sharing one list between authors
                        author_cited_in[a] = list(these_cite_k)

        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        i = 0
        for k in citedbydict.keys():
            if (i % 1000 == 0):
                mesg = "Author cit done " + str(i) + " of " + str(
                    len(citedbydict.keys())) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            these_cite_k = citedbydict[k]
            if (these_cite_k is None):
                these_cite_k = []  #verify it is an empty list, not None
            #do things only if these_cite_k contains any new stuff
            intersec_list = list(set(these_cite_k) & set(updated_redic_list))
            if intersec_list:
                authors = get_fieldvalues(k, mainauthortag)
                coauthl = get_fieldvalues(k, coauthortag)
                extauthl = get_fieldvalues(k, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        tmplist = author_cited_in[a]
                        for citer in these_cite_k:
                            if tmplist.count(citer) == 0:
                                tmplist.append(citer)
                        author_cited_in[a] = tmplist
                    elif a:
                        #first citing records recorded for this author;
                        #copy to avoid sharing one list between authors
                        author_cited_in[a] = list(these_cite_k)

        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in
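
#Both passes above perform the same merge: add each citing record once per
#author. A compact sketch of that step (hypothetical data, plain dicts,
#not the Invenio API):
def _merge_author_citations_sketch(author_cited_in, authors, citing_recids):
    """Record that every author in `authors` is cited by `citing_recids`."""
    for author in authors:
        if not author:
            continue
        known = author_cited_in.setdefault(author, [])
        for recid in citing_recids:
            if recid not in known:
                known.append(recid)
    return author_cited_in

#usage:
#_merge_author_citations_sketch({'Apollinaire': [1]}, ['Apollinaire'], [2, 3])
##-> {'Apollinaire': [1, 2, 3]}
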
def get_citation_informations(recid_list, config):
    """scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a list of four dictionaries that contain the citation
       information of CDS records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {} #dict of recid -> institute-given-report-code
    d_references_report_numbers = {} #dict of recid -> ['astro-ph/xyz']
    d_references_s = {} #dict of recid -> list_of_the_entries_of_this_recs_bibliography
    d_records_s = {} #dict of recid -> this_records_publication_info
    citation_informations = []

    write_message("config function "+config.get("rank_method", "function"), verbose=9)
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(prefix="cfg section [rank_method] has no attribute called function", alert_admin=True)
        #we cannot continue
        return [ {}, {}, {}, {} ]
    record_pri_number_tag = ""
    try:
        record_pri_number_tag = config.get(function, "primary_report_number")
    except:
        register_exception(prefix="cfg section "+function+" has no attribute primary_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]
    record_add_number_tag = ""
    try:
        record_add_number_tag = config.get(function,
                                           "additional_report_number")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute additional_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]

    reference_number_tag = ""
    try:
        reference_number_tag = config.get(function,
                                          "reference_via_report_number")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute reference_via_report_number", alert_admin=True)
        return [ {}, {}, {}, {} ]

    reference_tag = ""
    try:
        reference_tag = config.get(function,
                                   "reference_via_pubinfo")
    except:
        register_exception(prefix="config error. cfg section "+function+" has no attribute reference_via_pubinfo", alert_admin=True)
        return [ {}, {}, {}, {} ]

    p_record_pri_number_tag = tagify(parse_tag(record_pri_number_tag))
    #037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(record_add_number_tag))
    #088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(reference_number_tag))
    #999C5r. this is in the reference list, refers to other records. Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(reference_tag))
    #999C5s. A standardized way of writing a reference in the reference list. Like: Nucl. Phys. B 710 (2000) 371
    #fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function, "pubinfo_journal_format")
    except:
        pass

    #print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag "+str(p_record_pri_number_tag))
        write_message("p_reference_tag "+str(p_reference_tag))
        write_message("publication_journal_tag "+str(publication_journal_tag))
        write_message("publication_format_string is "+publication_format_string)
    done = 0 #for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_tag[0:2],
               (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_number_tag[0:2],
               (p_reference_number_tag,)):
        for recid in recid_list:
            if (done % 10 == 0):
                task_sleep_now_if_required()
                #in fact we can sleep any time here

            if (done % 1000 == 0):
                mesg = "get cit.inf done "+str(done)+" of "+str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done+1

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            pri_report_numbers = get_fieldvalues(recid, p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid, p_record_add_number_tag)
            reference_report_numbers = get_fieldvalues(recid, p_reference_number_tag)
            references_s = get_fieldvalues(recid, p_reference_tag)

            l_report_numbers = pri_report_numbers
            l_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = l_report_numbers

            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            #references_s was already fetched above; just log it
            write_message(str(recid)+"'s "+str(p_reference_tag)+" values "+str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            #get a combination of
            #journal vol (year) pages
            if publication_pages_tag and publication_journal_tag and \
                 publication_volume_tag and publication_year_tag and publication_format_string:
                tagsvalues = {} #we store the tags and their values here
                                #like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues["p"] = ""
                tagsvalues["y"] = ""
                tagsvalues["c"] = ""
                tagsvalues["v"] = ""
                tmp = get_fieldvalues(recid, publication_journal_tag)
                if tmp:
                    tagsvalues["p"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_volume_tag)
                if tmp:
                    tagsvalues["v"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_year_tag)
                if tmp:
                    tagsvalues["y"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    #if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages
                #format the publ infostring according to the format
                publ = ""
                ok = 1
                for current in publication_format_string:
                    #these are the supported placeholders
                    if current in ("p", "c", "v", "y"):
                        if tagsvalues[current]:
                            #add the value in the string
                            publ += tagsvalues[current]
                        else:
                            ok = 0
                            break #it was needed and not found
                    else:
                        publ += current #just add the character in the format string
                if ok:
                    write_message("d_records_s (publication info) for "+str(recid)+" is "+publ, verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag+" or "+p_reference_tag+". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations.append(d_reports_numbers)
    citation_informations.append(d_references_report_numbers)
    citation_informations.append(d_references_s)
    citation_informations.append(d_records_s)
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations
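
#A minimal sketch of the pubinfo_journal_format expansion above, where the
#placeholders p/v/y/c stand for journal title, volume, year and page
#(hypothetical values, not the Invenio API):
def _format_pubinfo_sketch(format_string, tagsvalues):
    """Expand p/v/y/c placeholders; return None if a needed value is missing."""
    publ = ""
    for current in format_string:
        if current in ("p", "v", "y", "c"):
            if not tagsvalues.get(current):
                return None  #a required piece is missing
            publ += tagsvalues[current]
        else:
            publ += current  #literal character from the format string
    return publ

#usage:
#_format_pubinfo_sketch("p v (y) c", {"p": "Phys. Lett.,", "v": "B 482",
#                                     "y": "2000", "c": "417"})
##-> 'Phys. Lett., B 482 (2000) 417'
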
Example #35
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    if task_get_option("quick") == "no":
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used",
                      verbose=3)

    last_modified_records = get_last_modified_rec(last_update_time)
    #id option forces re-indexing a certain range even if there are no new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            #construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            #make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last + 1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)

        write_message("Last update "+str(last_update_time)+" records: "+ \
                       str(len(last_modified_records))+" updates: "+ \
                       str(len(updated_recid_list)))

        #write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)

        #result_intermediate should be guaranteed to exist,
        #but if the user entered a "-R" (do all) option, we need to
        #make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]

        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]

        #call the procedure that does the hard work by reading fields of
        #citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(
            updated_recid_list, config)
        #write_message("citation_informations: "+str(citation_informations))
        #create_analysis_tables() #temporary..
        #test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        #call the analyser that uses the citation_informations to really
        #search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate, config,
                           updated_recid_list)
        #dic is docid-numberofreferences like {1: 2, 2: 0, 3: 1}
        #write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message(
            "No new records added since last time this rank method was executed"
        )
    return dic
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict, config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.
       The keys of this are author names. An entry like "Apollinaire"->[1,2,3] means
       Apollinaire is cited in records 1,2 and 3.
       Input: citedbydict, updated_redic_list = records to be searched, initial_author_dict:
              the dicts from the database.
       Output: authorciteddict. It is initially set to initial_author_dict
    """

    #the tag lookup below repeats what other functions in this module do
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            x = config.get(config.get("rank_method", "function"), t)
            tagvals[t] = x
        except:
            register_exception(prefix="attribute "+t+" missing in config", alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag "+mainauthortag)
        write_message("coauthortag "+coauthortag)
        write_message("extauthortag "+extauthortag)

    author_cited_in = initial_author_dict
    if citedbydict:
        i = 0 #just a counter for debug
        write_message("Checking records referred to in new records")
        for u in updated_redic_list:
            if (i % 1000 == 0):
                mesg = "Author ref done "+str(i)+" of "+str(len(updated_redic_list))+" records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            if citedbydict.has_key(u):
                these_cite_k = citedbydict[u]
                if (these_cite_k is None):
                    these_cite_k = [] #verify it is an empty list, not None
                authors = get_fieldvalues(u, mainauthortag)
                coauthl = get_fieldvalues(u, coauthortag)
                extauthl = get_fieldvalues(u, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        tmplist = author_cited_in[a]
                        for citer in these_cite_k:
                            if tmplist.count(citer) == 0:
                                tmplist.append(citer)
                        author_cited_in[a] = tmplist
                    elif a:
                        #first citing records recorded for this author;
                        #copy to avoid sharing one list between authors
                        author_cited_in[a] = list(these_cite_k)

        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        i = 0
        for k in citedbydict.keys():
            if (i % 1000 == 0):
                mesg = "Author cit done "+str(i)+" of "+str(len(citedbydict.keys()))+" records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            these_cite_k = citedbydict[k]
            if (these_cite_k is None):
                these_cite_k = [] #verify it is an empty list, not None
            #do things only if these_cite_k contains any new stuff
            intersec_list = list(set(these_cite_k)&set(updated_redic_list))
            if intersec_list:
                authors = get_fieldvalues(k, mainauthortag)
                coauthl = get_fieldvalues(k, coauthortag)
                extauthl = get_fieldvalues(k, extauthortag)
                authors.extend(coauthl)
                authors.extend(extauthl)
                for a in authors:
                    if a and author_cited_in.has_key(a):
                        #add all elements in these_cite_k
                        #that are not there already
                        tmplist = author_cited_in[a]
                        for citer in these_cite_k:
                            if tmplist.count(citer) == 0:
                                tmplist.append(citer)
                        author_cited_in[a] = tmplist
                    elif a:
                        #first citing records recorded for this author;
                        #copy to avoid sharing one list between authors
                        author_cited_in[a] = list(these_cite_k)

        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in
Example #37
def task_run_core(name=NAME):
    """ Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(
            perform_request_search(p='-035:spirestex -035:inspiretex',
                                   cc='HEP'))

        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct,
                                                   tag="035",
                                                   ind1="",
                                                   ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration" %
                    recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
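
# For orientation only (not part of the original module): create_xml(recid,
# new_texkey) is assumed to produce a MARCXML correction that adds an 035
# field like the one below; the exact output of create_xml is not shown in
# this example, so treat the snippet as an illustration of the structure the
# loop above checks for (provenance in subfield $9, texkey in subfield $a).
EXAMPLE_TEXKEY_XML = """
<record>
  <controlfield tag="001">1234567</controlfield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">INSPIRETeX</subfield>
    <subfield code="a">Smith:2012abc</subfield>
  </datafield>
</record>
"""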
Example #38
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist, config, updated_rec_list):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(prefix="cfg section [rank_method] has no attr function", alert_admin=True)
        return {}

    pubrefntag = ""
    try:
        pubrefntag = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_report_number", alert_admin=True)
        return {}

    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_pubinfo", alert_admin=True)
        return {}

    #pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag "+pubrefntag)
        write_message("pubreftag "+pubreftag)

    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[0] #dict of recid -> report numbers given by the institute
    d_references_report_numbers = citation_informations[1] #dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2] #dict of recid -> publication infos in its bibliography
    d_records_s = citation_informations[3] #dict of recid -> its own publication info
    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from citation dictionaries; they will be filled later")
    for somerecid in updated_rec_list:
        try:
            del citation_list[somerecid]
        except KeyError:
            pass
        try:
            del reference_list[somerecid]
        except KeyError:
            pass

    write_message("Phase 1: d_references_report_numbers")
    #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    #meaning: rec 8 contains these in bibliography

    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg =  "d_references_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()
        done = done+1

        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                #sanitize p (str.replace returns a new string)
                p = p.replace("\n", '')
                #search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    #append unless this key already has the item
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        #and update result
                        result[rec_ids[0]] += 1

                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    #the reference we wanted was not found among our records.
                    #put the reference in the "missing" list.. however, it will look
                    #bad.. gfhgf/1254312, so get the corresponding 999C5s (full ref) too
                    #This should really be done in the next loop over d_references_s,
                    #but the 999C5s fields are not yet normalized

                    #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1])
                    rectext = "" # print_record() call disabled to speed things up
                    lines = rectext.split("\n")
                    rpart = p #to be used..
                    for l in lines:
                        if (l.find(p) > 0): #the gfhgf/1254312 was found.. get the s-part of it
                            st = l.find('$s')
                            if (st > 0):
                                end = l.find('$', st + 2) #skip the '$s' marker itself
                                if (end == -1):
                                    end = len(l)
                                rpart = l[st+2:end]
                    insert_into_missing(thisrecid, rpart)

    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()

        done = done+1

        for refs in refss:
            if refs:
                p = refs
                #remove the latter page number if it is like 67-74
                matches = re.compile(r"(.*)(-\d+$)").findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                try:
                    rec_ids = list(search_unit(p, 'journal') - INTBITSET_OF_DELETED_RECORDS)
                except:
                    rec_ids = None
                write_message("These match searching "+p+" in journal: "+str(rec_ids), verbose=9)
                if rec_ids and rec_ids[0]:
                    #the refered publication is in our collection, remove
                    #from missing
                    remove_from_missing(p)
                else:
                    #it was not found so add in missing
                    insert_into_missing(thisrecid, p)
                #check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    #the above should always hold
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid) #append actual list
                        result[rec_ids[0]] += 1 #add count for this..

                    #update reference_list accordingly
                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")

    #search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1

        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []

                if rec_ids:
                    for recid in rec_ids:
                        #normal checks..
                        if not citation_list.has_key(thisrecid):
                            citation_list[thisrecid] = []
                        if not reference_list.has_key(recid):
                            reference_list[recid] = []
                        if not result.has_key(thisrecid):
                            result[thisrecid] = 0

                        #normal updates
                        if not recid in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if not thisrecid in reference_list[recid]:
                            reference_list[recid].append(thisrecid)

    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    #find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1
        p = recs.replace("\"","")
        #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a') - INTBITSET_OF_DELETED_RECORDS)
        write_message("These records match "+p+" in "+pubreftag+" : "+str(rec_ids), verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                #normal checks
                if not result.has_key(thisrecid):
                    result[thisrecid] = 0
                if not citation_list.has_key(thisrecid):
                    citation_list[thisrecid] = []
                if not reference_list.has_key(rec_id):
                    reference_list[rec_id] = []

                if not rec_id in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if not thisrecid in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)

    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")

    #remove empty lists in citation and reference
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]

    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    selfdic = {}
    #get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    #add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message("Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                 initial_self_dict, config)
    #selfdic consists of
    #key k -> list of values [v1,v2,..]
    #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2..
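    #Illustration with made-up recids: selfdic = {10: [3, 7]} (record 10
    #self-cites records 3 and 7) is reversed below into
    #selfcitedbydic = {3: [10], 7: [10]} (3 and 7 are self-cited by 10).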

    #create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if selfcitedbydic.has_key(v):
                tmplist = selfcitedbydic[v]
                if not k in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist

    write_message("Getting author citations")

    #get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    acit = task_get_option("author-citations")
    if not acit:
        print "Author cites disabled. Use -A option to enable it."
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                        initial_author_dict, config)


    if task_get_task_param('verbose') >= 3:
        #print only X first to prevent flood
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): "+str(tmpdict))
        write_message("size: "+str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): "+str(tmpdict))
        write_message("size: "+str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): "+str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): "+str(tmpdict))
        write_message("size: "+str(len(authorcitdic.keys())))
    insert_cit_ref_list_intodb(citation_list, reference_list,
                               selfcitedbydic, selfdic, authorcitdic)

    t5 = os.times()[4]

    write_message("Execution time for analyzing the citation information generating the dictionary:")
    write_message("... checking ref number: %.2f sec" % (t2-t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3-t2))
    write_message("... checking rec number: %.2f sec" % (t4-t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5-t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5-t1))

    return result
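
# A small usage sketch (invented counts, not part of the original module): the
# dictionary returned above maps each recid to its citation weight, so a
# ranking can be derived from it like this:
result = {11: 2, 12: 0, 13: 5}
ranked = sorted(result.items(), key=lambda item: item[1], reverse=True)
assert ranked[0] == (13, 5)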
Example #39
def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    if task_get_option("quick") == "no":
        last_update_time = "0000-00-00 00:00:00"
        write_message("running thorough indexing since quick option not used", verbose=3)

    last_modified_records = get_last_modified_rec(last_update_time)
    #id option forces re-indexing a certain range even if there are no new recs
    if last_modified_records or task_get_option("id"):
        if task_get_option("id"):
            #construct a range of records to index
            taskid = task_get_option("id")
            first = taskid[0][0]
            last = taskid[0][1]
            #make range, last+1 so that e.g. -i 1-2 really means [1,2] not [1]
            updated_recid_list = range(first, last+1)
        else:
            updated_recid_list = create_recordid_list(last_modified_records)

        write_message("Last update "+str(last_update_time)+" records: "+ \
                       str(len(last_modified_records))+" updates: "+ \
                       str(len(updated_recid_list)))

        #write_message("updated_recid_list: "+str(updated_recid_list))
        result_intermediate = last_updated_result(rank_method_code)

        #result_intermediate should be guaranteed to exist,
        #but if the user entered the "-R" (do all) option, we need to
        #make an empty start set
        if task_get_option("quick") == "no":
            result_intermediate = [{}, {}, {}]
        else:
            # check indexing times of `journal' and `reportnumber`
            # indexes, since if they are not up to date yet, then we
            # should wait and not run citation indexing as of yet:
            last_timestamp_bibrec = run_sql("SELECT DATE_FORMAT(MAX(modification_date), '%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec", (), 1)[0][0]
            last_timestamp_indexes = run_sql("SELECT DATE_FORMAT(MAX(last_updated), '%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)", ('journal', 'reportnumber'), 1)[0][0]
            if not last_timestamp_indexes or \
               not last_timestamp_bibrec or \
               last_timestamp_bibrec > last_timestamp_indexes:
                write_message("Not running citation indexer since journal/reportnumber indexes are not up to date yet.")
                return {}

        citation_weight_dic_intermediate = result_intermediate[0]
        citation_list_intermediate = result_intermediate[1]
        reference_list_intermediate = result_intermediate[2]

        # Enrich updated_recid_list so that it would contain also
        # records citing or referring to updated records, so that
        # their citation information would be updated too.  Not the
        # most efficient way to treat this problem, but the one that
        # requires least code changes until ref_analyzer() is more
        # nicely re-factored.
        updated_recid_list_set = intbitset(updated_recid_list)
        for somerecid in updated_recid_list:
            # add both citers and citees:
            updated_recid_list_set |= intbitset(citation_list_intermediate.get(somerecid, []))
            updated_recid_list_set |= intbitset(reference_list_intermediate.get(somerecid, []))
        updated_recid_list = list(updated_recid_list_set)

        #call the procedure that does the hard work by reading fields of
        #citations and references in the updated_recid's (but nothing else)!
        if task_get_task_param('verbose') >= 9:
            write_message("Entering get_citation_informations")
        citation_informations = get_citation_informations(updated_recid_list,
                                                          config)
        #write_message("citation_informations: "+str(citation_informations))
        #create_analysis_tables() #temporary: test how much faster in-mem indexing is
        write_message("Entering ref_analyzer", verbose=9)
        #call the analyser that uses the citation_informations to really
        #search x-cites-y in the coll..
        dic = ref_analyzer(citation_informations,
                           citation_weight_dic_intermediate,
                           citation_list_intermediate,
                           reference_list_intermediate,
                           config, updated_recid_list)
        #dic maps docid -> number of known references, like {1: 2, 2: 0, 3: 1}
        #write_message("Docid-number of known references "+str(dic))
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" % (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        dic = {}
        write_message("No new records added since last time this rank method was executed")
    return dic
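
# A minimal sketch of the enrichment step above, on toy data (assumes
# Invenio's intbitset is importable from invenio.intbitset): the set of
# records to reprocess is widened with everything that cites, or is cited by,
# an updated record.
from invenio.intbitset import intbitset

citation_list_intermediate = {1: [5], 2: [6]}   # x is cited by y
reference_list_intermediate = {1: [9]}          # x cites y
updated = intbitset([1])
for somerecid in list(updated):                 # iterate over a snapshot
    updated |= intbitset(citation_list_intermediate.get(somerecid, []))
    updated |= intbitset(reference_list_intermediate.get(somerecid, []))
assert list(updated) == [1, 5, 9]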
Example #40
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = dict((recid, set()) for recid in updated_recids)
    references = dict((recid, set()) for recid in updated_recids)

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)
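
    # Direction convention, illustrated with made-up recids: add_to_refs(5, 9)
    # records that 5 cites 9 (references[5] gains 9, and citations[9] gains 5
    # when record 9 is among the updated records); add_to_cites(5, 9) records
    # the same edge discovered from the cited side.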

    # citation_informations is a pair (records_info, references_info), each a
    # dict mapping a reference type ('report-numbers', 'journals', 'doi', ...)
    # to a dict of recid -> values
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in references_info['hdl'].iteritems():
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in references_info['record_id'].iteritems():
        step("Record ID references", thisrecid, done,
             len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)),
                          verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in references_info['isbn'].iteritems():
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(
                    p=report_pattern,
                    f=tags['refs_report_number'],
                    m='r',
                    config=config)
            else:
                recids = get_recids_matching_query(
                    p=reportcode, f=tags['refs_report_number'], config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)
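
    # Pattern illustration (hypothetical report number): for std_reportcode
    # 'arXiv:1207.7214' the regular expression above also matches index values
    # with a trailing category, e.g. 'arXiv:1207.7214 [hep-ex]'.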

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in records_info['hdl'].iteritems():
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1

        for hdl in hdls:
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (hdl, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in records_info['isbn'].iteritems():
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (isbn, tags['refs_isbn'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in records_info['record_id'].iteritems():
        step("Record ID catchup", thisrecid, done,
             len(records_info['record_id']))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (record_id, tags['refs_record_id'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13 - t1))

    return citations, references
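
# A hedged consistency check (not part of the original module) for the pair of
# dictionaries returned by this ref_analyzer: by construction of add_to_cites
# and add_to_refs, whenever a citer that belongs to updated_recids appears in
# citations[citee], the reverse edge must be present in references[citer].
def check_symmetry(citations, references, updated_recids):
    for citee, citers in citations.items():
        for citer in citers:
            if citer in updated_recids:
                assert citee in references.get(citer, set()), \
                    "missing reverse edge %s -> %s" % (citer, citee)

check_symmetry({9: set([1])}, {1: set([9])}, set([1, 9]))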