示例#1
0
文件: task.py 项目: SCOAP3/invenio
def task_run_core(recid, records, bibcatalog_system=None, _arxiv=False):
    """Extract the references of one record and queue the result.

    The extracted record is appended to ``records`` unless overwriting is
    disabled and ``check_record_for_refextract`` reports the record as not
    safe.  When the record is kept and the 'new' or 'create-ticket' task
    option is set, ``create_ticket`` is called for it.

    :param recid: identifier of the record to process.
    :param records: container receiving the extracted record via ``append``.
    :param bibcatalog_system: value forwarded to ``create_ticket``.
    :param _arxiv: when true, overwriting is forced and the 'no-overwrite'
        task option is not consulted.
    """
    setup_loggers(None, use_bibtask=True)

    # The 'no-overwrite' option only matters for non-arXiv runs.
    overwrite = True if _arxiv else not task_get_option('no-overwrite')

    try:
        record = extract_references_from_record(recid)
        extracted_msg = "Extracted references for %s" % recid

        if overwrite:
            write_message("%s (overwrite)" % extracted_msg)
            may_store = True
        else:
            write_message(extracted_msg)
            may_store = bool(check_record_for_refextract(recid))
            if not may_store:
                write_message('Record not safe for re-extraction, skipping')

        if may_store:
            records.append(record)
            # Create a RT ticket if necessary
            wants_ticket = (task_get_option('new')
                            or task_get_option('create-ticket'))
            if wants_ticket:
                create_ticket(recid, bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
示例#2
0
def clean_bibxxx():
    """
    Clean unreferenced bibliographic values from bibXXx tables.
    This is useful to prettify browse results, as it removes
    old, no longer used values.

    The table pairs bib00x/bibrec_bib00x up to bib99x/bibrec_bib99x are
    visited in turn; rows of bibNNx that no bibrec_bibNNx row points to
    (via id_bibxxx) are deleted.  When the 'verbose' task option is 9 or
    higher, the unreferenced rows are counted before the deletion and the
    count is reported afterwards.

    WARNING: this function must be run only when no bibupload is
    running and/or sleeping.
    """
    # Shared tail of both statements; only the table names generated
    # below are interpolated into it.
    unreferenced = """FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL"""
    count_query = "SELECT COUNT(*) " + unreferenced
    delete_query = "DELETE %(bibxxx)s " + unreferenced

    write_message("""CLEANING OF UNREFERENCED bibXXx VALUES STARTED""")
    # Read the verbosity once: it is loop invariant, and a single flag
    # guarantees the counter is bound whenever it is reported.
    report_counts = task_get_option('verbose') >= 9
    for xx in range(100):
        tables = {'bibxxx': 'bib%02dx' % xx,
                  'bibrec_bibxxx': 'bibrec_bib%02dx' % xx}
        if report_counts:
            num_unref_values = run_sql(count_query % tables)[0][0]
        run_sql(delete_query % tables)
        if report_counts:
            write_message(""" - %d unreferenced %s values cleaned""" %
                          (num_unref_values, tables['bibxxx']))
    write_message("""CLEANING OF UNREFERENCED bibXXx VALUES FINISHED""")
示例#3
0
文件: task.py 项目: mhellmic/b2share
def fetch_concerned_records(name):
    """Return the ``bibrec`` rows of the records this run has to process.

    The selection depends on the task options:

    * 'new': rows ``(id, creation_date)`` whose creation date is not older
      than the last run date and whose id is above the last processed id,
      ordered by creation date;
    * 'modified': the same, based on ``modification_date``;
    * otherwise: rows ``(id, NULL)`` for the ids of the 'recids' option
      merged with the ids ``get_collection_reclist`` returns for every
      entry of the 'collections' option, ordered by id.

    :param name: passed to ``fetch_last_updated`` to obtain the
        ``(last_recid, last_date)`` pair of the previous run.
    :return: the ``run_sql`` result, or an empty list when no explicit
        record id was given.
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        # Work on a copy so the stored task option is not mutated, and
        # tolerate either option being unset.
        given_recids = set(task_get_option('recids') or ())
        for collection in task_get_option('collections') or ():
            # update(), not add(): the ids of the collection must be
            # merged one by one, not inserted as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One placeholder per id; the values stay parameterized.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                              "WHERE `id` IN (%s) ORDER BY `id`"
                              % format_strings,
                              list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
示例#4
0
文件: cli.py 项目: SCOAP3/invenio
def task_run_core():
    """Run the rank methods selected by the 'run' task option.

    When the 'run' option is empty it is first filled with every name
    found in the rnkMETHOD table.  For each method the configuration file
    registered under ``<method>.cfg`` is parsed, and the module-level
    function named by its ``[rank_method] function`` setting is called
    with the method name.

    Returns True once all methods were handled.  An error while reading a
    configuration file is reported on stderr and re-raised.
    """
    if not task_get_option("run"):
        task_set_option("run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = configuration.get(key + '.cfg', '')
        write_message("Getting configuration from file: %s" % filename,
            verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            # Close the file handle as soon as it is parsed instead of
            # leaving it to the garbage collector.
            with open(filename) as config_file:
                config.readfp(config_file)
        except StandardError:
            write_message("Cannot find configuration file: %s. "
                "The rankmethod may also not be registered using "
                "the BibRank Admin Interface." % filename, sys.stderr)
            raise

        #Using the function variable to call the function related to the
        #rank method
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if func_object:
            func_object(key)
        else:
            write_message("Cannot run method '%s', no function to call"
                % key)

    return True
示例#5
0
def parse_option(key, value, dummy, args):
    """Store one parsed command line option as a task option.

    Boolean switches set their task option to True; '-c/--collections'
    and '-r/--recids' accumulate the values parsed by
    ``split_cli_ids_arg`` in a set kept under the matching task option.
    Always returns True; raises StandardError when standalone arguments
    are present.
    """
    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def _collect_ids(option_name):
        """Merge the ids parsed from ``value`` into the option's set."""
        stored = task_get_option(option_name)
        if not stored:
            stored = set()
            task_set_option(option_name, stored)
        stored.update(split_cli_ids_arg(value))

    switches = (
        (('-a', '--new'), 'new'),
        (('-m', '--modified'), 'modified'),
        (('--rebuild',), 'rebuild'),
    )
    for aliases, option_name in switches:
        if key in aliases:
            task_set_option(option_name, True)
            return True

    if key in ('-c', '--collections'):
        _collect_ids('collections')
    elif key in ('-r', '--recids'):
        _collect_ids('recids')

    return True
示例#6
0
def fetch_concerned_records(name):
    """Return the ``bibrec`` rows of the records this run has to process.

    The selection depends on the task options:

    * 'new': rows ``(id, creation_date)`` whose creation date is not older
      than the last run date and whose id is above the last processed id,
      ordered by creation date;
    * 'modified': the same, based on ``modification_date``;
    * otherwise: rows ``(id, NULL)`` for the ids of the 'recids' option
      merged with the ids ``get_collection_reclist`` returns for every
      entry of the 'collections' option, ordered by id.

    :param name: passed to ``fetch_last_updated`` to obtain the
        ``(last_recid, last_date)`` pair of the previous run.
    :return: the ``run_sql`` result, or an empty list when no explicit
        record id was given.
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)
    since = (last_date.isoformat(), last_recid) \
        if (task_get_option('new') or task_get_option('modified')) else None

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, since)
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, since)
    else:
        # Work on a copy so the stored task option is not mutated, and
        # tolerate either option being unset.
        given_recids = set(task_get_option('recids') or ())
        for collection in task_get_option('collections') or ():
            # update(), not add(): the ids of the collection must be
            # merged one by one, not inserted as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One placeholder per id; the values stay parameterized.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                              "WHERE `id` IN (%s) ORDER BY `id`"
                              % format_strings,
                              list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
示例#7
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Handle one command line option and store it as a task option.

    Recognised keys are -i/--recid, -c/--collection, -k/--taxonomy and
    -f/--force.  Record ids and collections are accumulated in the
    'recids' and 'collections' task options.

    Returns True if the key was handled, False if the key is unknown or
    its value was rejected (an error is then written to stderr).
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                                  "valid integer, not '%s'." % value,
                                  stream=sys.stderr,
                                  verbose=0)
            # Reject the option here: the unconverted string must not be
            # looked up nor stored as a record id.
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        return False

    return True
示例#8
0
def _dbdump_run_task_core():
    """
    Dump the database and prune old dump files.

    The target directory comes from the 'output' task option (default
    CFG_LOGDIR) and the value passed on to the pruning step from the
    'number' option (default 5).  The dump file is named after
    CFG_DATABASE_NAME and the task starting time.  Returns True.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    dump_dir = task_get_option('output', CFG_LOGDIR)
    dumps_to_keep = task_get_option('number', 5)
    dump_prefix = CFG_DATABASE_NAME + '-dbdump-'
    started_at = task_get_task_param('task_starting_time')
    dump_name = dump_prefix + started_at.replace(' ', '_') + '.sql.gz'
    write_message("Reading parameters ended")

    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(dump_dir, dump_name)
    write_message("Database dump ended")

    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(dump_dir, dump_prefix, dumps_to_keep)
    write_message("Pruning old dump files ended")

    # we are done:
    task_update_progress("Done.")
    return True
示例#9
0
def task_run_core():
    """Print Fibonacci numbers as a demonstration task.

    The amount of numbers is read from the 'number' task option.  Each
    value is written with ``write_message``; sample error messages are
    sent to stderr on every 4th and 5th step, and a ZeroDivisionError is
    raised on a 5th step when the 'error' task option is set.
    Returns 1 when the loop completes.
    """
    total = int(task_get_option('number'))
    write_message("Printing %d Fibonacci numbers." % total, verbose=9)

    for index in range(total):
        if index > 0:
            # Multiples of 4 take precedence over multiples of 5.
            if index % 4 == 0:
                write_message(
                    "Error: water in the CPU.  Ignoring and continuing.",
                    sys.stderr,
                    verbose=3)
            elif index % 5 == 0:
                write_message(
                    "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                    sys.stderr)
                if task_get_option('error'):
                    # Raises ZeroDivisionError.
                    1 / 0
        write_message("fib(%d)=%d" % (index, fib(index)))
        task_update_progress("Done %d out of %d." % (index, total))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)

    task_update_progress("Done %d out of %d." % (total, total))
    return 1
示例#10
0
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.

    The 'keyevents' and 'customevents' task options are read; either may
    be unset or empty, in which case that part is skipped.  Progress is
    reported after every event.  Returns True.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents:
        total = len(keyevents)
        for done, event in enumerate(keyevents, 1):
            write_message("Caching key event 1: %s" % event)
            # NOTE(review): the whole list is passed on every iteration,
            # as before -- confirm whether one call would be enough.
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d" % (done, total))

    # Cache custom events
    customevents = task_get_option("customevents")
    # Truthiness test so that an unset option is skipped like an empty one
    # instead of failing in len().
    if customevents:
        total = len(customevents)
        for done, event in enumerate(customevents, 1):
            write_message("Caching custom event 1: %s" % event)
            # NOTE(review): same remark as for the key events above.
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d" % (done, total))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
示例#11
0
def clean_bibxxx():
    """
    Clean unreferenced bibliographic values from bibXXx tables.
    This is useful to prettify browse results, as it removes
    old, no longer used values.

    The table pairs bib00x/bibrec_bib00x up to bib99x/bibrec_bib99x are
    visited in turn; rows of bibNNx that no bibrec_bibNNx row points to
    (via id_bibxxx) are deleted.  When the 'verbose' task option is 9 or
    higher, the unreferenced rows are counted before the deletion and the
    count is reported afterwards.

    WARNING: this function must be run only when no bibupload is
    running and/or sleeping.
    """
    # Shared tail of both statements; only the table names generated
    # below are interpolated into it.
    unreferenced = """FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL"""
    count_query = "SELECT COUNT(*) " + unreferenced
    delete_query = "DELETE %(bibxxx)s " + unreferenced

    write_message("""CLEANING OF UNREFERENCED bibXXx VALUES STARTED""")
    # Read the verbosity once: it is loop invariant, and a single flag
    # guarantees the counter is bound whenever it is reported.
    report_counts = task_get_option('verbose') >= 9
    for xx in range(100):
        tables = {'bibxxx': 'bib%02dx' % xx,
                  'bibrec_bibxxx': 'bibrec_bib%02dx' % xx}
        if report_counts:
            num_unref_values = run_sql(count_query % tables)[0][0]
        run_sql(delete_query % tables)
        if report_counts:
            write_message(""" - %d unreferenced %s values cleaned""" %
                          (num_unref_values, tables['bibxxx']))
    write_message("""CLEANING OF UNREFERENCED bibXXx VALUES FINISHED""")
示例#12
0
文件: admin.py 项目: SCOAP3/invenio
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.

    The 'keyevents' and 'customevents' task options are read; either may
    be unset or empty, in which case that part is skipped.  Progress is
    reported after every event.  Returns True.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents:
        total = len(keyevents)
        for done, event in enumerate(keyevents, 1):
            write_message("Caching key event 1: %s" % event)
            # NOTE(review): the whole list is passed on every iteration,
            # as before -- confirm whether one call would be enough.
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d" % (done, total))

    # Cache custom events
    customevents = task_get_option("customevents")
    # Truthiness test so that an unset option is skipped like an empty one
    # instead of failing in len().
    if customevents:
        total = len(customevents)
        for done, event in enumerate(customevents, 1):
            write_message("Caching custom event 1: %s" % event)
            # NOTE(review): same remark as for the key events above.
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d" % (done, total))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
示例#13
0
文件: task.py 项目: osub3/invenio
def task_run_core(recid, records, bibcatalog_system=None, _arxiv=False):
    """Extract the references of one record and queue the result.

    The extracted record is appended to ``records`` unless overwriting is
    disabled and ``check_record_for_refextract`` reports the record as not
    safe.  When the record is kept and the 'new' or 'create-ticket' task
    option is set, ``create_ticket`` is called for it.

    :param recid: identifier of the record to process.
    :param records: container receiving the extracted record via ``append``.
    :param bibcatalog_system: value forwarded to ``create_ticket``.
    :param _arxiv: when true, overwriting is forced and the 'no-overwrite'
        task option is not consulted.
    """
    setup_loggers(None, use_bibtask=True)

    # The 'no-overwrite' option only matters for non-arXiv runs.
    overwrite = True if _arxiv else not task_get_option('no-overwrite')

    try:
        record = extract_references_from_record(recid)
        extracted_msg = "Extracted references for %s" % recid

        if overwrite:
            write_message("%s (overwrite)" % extracted_msg)
            may_store = True
        else:
            write_message(extracted_msg)
            may_store = bool(check_record_for_refextract(recid))
            if not may_store:
                write_message('Record not safe for re-extraction, skipping')

        if may_store:
            records.append(record)
            # Create a RT ticket if necessary
            wants_ticket = (task_get_option('new')
                            or task_get_option('create-ticket'))
            if wants_ticket:
                create_ticket(recid, bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
示例#14
0
文件: task.py 项目: osub3/invenio
def cb_parse_option(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    Stores one parsed command line option as a task option.  Switches
    that were removed or renamed (--inspire, -r/--recids, -f) raise
    StandardError with an explanatory message, as do standalone
    arguments.  Always returns True otherwise.
    """
    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key == '--inspire':
        msg = """The --inspire option does not exist anymore.
Please set the config variable CFG_INSPIRE_SITE instead."""
        raise StandardError(msg)
    elif key in ('--kb-reports', '--kb-journals', '--kb-journals-re',
                 '--kb-authors', '--kb-books', '--kb-conferences'):
        # The task option is named after the switch without its dashes.
        task_set_option(key[2:], value)
    elif key == '--create-ticket':
        task_set_option('create-ticket', True)
    elif key == '--no-overwrite':
        task_set_option('no-overwrite', True)
    elif key == '--arxiv':
        # Exact comparison: the former "key in ('--arxiv')" was a
        # substring test against a plain string.
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-i', '--id'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))
    elif key in ('-r', '--recids'):
        msg = """The --recids has been renamed.
please use --id for specifying recids."""
        raise StandardError(msg)
    elif key == '-f':
        msg = """refextract is now used to run in daemon mode only.
If you would like to run reference extraction on a standalone PDF file,
please use "docextract file.pdf\""""
        raise StandardError(msg)

    return True
示例#15
0
文件: task.py 项目: SCOAP3/invenio
def cb_parse_option(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    Stores one parsed command line option as a task option.  Switches
    that were removed or renamed (--inspire, -r/--recids, -f) raise
    StandardError with an explanatory message, as do standalone
    arguments.  Always returns True otherwise.
    """
    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key == '--inspire':
        msg = """The --inspire option does not exist anymore.
Please set the config variable CFG_INSPIRE_SITE instead."""
        raise StandardError(msg)
    elif key in ('--kb-reports', '--kb-journals', '--kb-journals-re',
                 '--kb-authors', '--kb-books', '--kb-conferences'):
        # The task option is named after the switch without its dashes.
        task_set_option(key[2:], value)
    elif key == '--create-ticket':
        task_set_option('create-ticket', True)
    elif key == '--no-overwrite':
        task_set_option('no-overwrite', True)
    elif key == '--arxiv':
        # Exact comparison: the former "key in ('--arxiv')" was a
        # substring test against a plain string.
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-i', '--id'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))
    elif key in ('-r', '--recids'):
        msg = """The --recids has been renamed.
please use --id for specifying recids."""
        raise StandardError(msg)
    elif key == '-f':
        msg = """refextract is now used to run in daemon mode only.
If you would like to run reference extraction on a standalone PDF file,
please use "docextract file.pdf\""""
        raise StandardError(msg)

    return True
示例#16
0
文件: daemon.py 项目: dset0x/invenio
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Handle one command line option and store it as a task option.

    Recognised keys are -i/--recid, -c/--collection, -k/--taxonomy and
    -f/--force.  Record ids and collections are accumulated in the
    'recids' and 'collections' task options.

    Returns True if the key was handled, False if the key is unknown or
    its value was rejected (an error is then written to stderr).
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                                  "valid integer, not '%s'." % value,
                                  stream=sys.stderr,
                                  verbose=0)
            # Reject the option here: the unconverted string must not be
            # looked up nor stored as a record id.
            return False
        if not _recid_exists(value):
            bibtask.write_message(
                "ERROR: '%s' is not a valid record ID." % value,
                stream=sys.stderr, verbose=0)
            return False
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message(
                "ERROR: '%s' is not a valid collection." % value,
                stream=sys.stderr, verbose=0)
            return False
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message(
                "ERROR: '%s' is not a valid taxonomy name." % value,
                stream=sys.stderr, verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        return False

    return True
示例#17
0
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """Process the records whose citation information has to be refreshed.

    The records are the ranges given in the 'id' task option or, without
    it, those returned by ``get_modified_recs`` for the last bibrank and
    bibindex update times.  When the 'quick' option is "no", the bibrank
    time is reset so that the lookup starts from the beginning.

    :param rank_method_code: rank method whose last update time is used.
    :param config: parsed rank method configuration; a missing
        'collections' option in the section named by
        ``[rank_method] function`` is set to None.
    :param chunk_size: forwarded to ``process_and_store``.
    :return: tuple ``(weights, index_update_time)``; ``weights`` is the
        result of ``process_and_store`` or None when there was nothing to
        update, ``index_update_time`` is None when the 'id' option was
        used.
    """
    def _preview(recids):
        """Render recids, keeping only both ends of very long sequences."""
        if len(recids) > 10000:
            return str(recids[:10]) + ' ... ' + str(recids[-10:])
        return str(recids)

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids.extend(range(first, last + 1))
        write_message('Records to process: %s' % _preview(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # An index time lying in the future is discarded.
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update" % _preview(updated_recids))

    if updated_recids:
        begin_time = time.time()
        # Resolved outside the try block so that `function` is always
        # bound when the handler below runs.
        function = config.get("rank_method", "function")
        try:
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
示例#18
0
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """Process the records whose citation information has to be refreshed.

    The records are the ranges given in the 'id' task option or, without
    it, those returned by ``get_modified_recs`` for the last bibrank and
    bibindex update times.  When the 'quick' option is "no", the bibrank
    time is reset so that the lookup starts from the beginning.

    :param rank_method_code: rank method whose last update time is used.
    :param config: parsed rank method configuration; a missing
        'collections' option in the section named by
        ``[rank_method] function`` is set to None.
    :param chunk_size: forwarded to ``process_and_store``.
    :return: tuple ``(weights, index_update_time)``; ``weights`` is the
        result of ``process_and_store`` or None when there was nothing to
        update, ``index_update_time`` is None when the 'id' option was
        used.
    """
    def _preview(recids):
        """Render recids, keeping only both ends of very long sequences."""
        if len(recids) > 10000:
            return str(recids[:10]) + ' ... ' + str(recids[-10:])
        return str(recids)

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids.extend(range(first, last + 1))
        write_message('Records to process: %s' % _preview(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # An index time lying in the future is discarded.
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update" % _preview(updated_recids))

    if updated_recids:
        begin_time = time.time()
        # Resolved outside the try block so that `function` is always
        # bound when the handler below runs.
        function = config.get("rank_method", "function")
        try:
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
示例#19
0
def check_options():
    """Check that at least one record selection option was given.

    Returns True when any of the 'new', 'modified', 'recids',
    'collections' or 'rebuild' task options is set; otherwise prints an
    error on stderr and returns False.
    """
    selectors = ('new', 'modified', 'recids', 'collections', 'rebuild')
    # Options are consulted in order and the scan stops at the first hit.
    if any(task_get_option(option) for option in selectors):
        return True

    print('Error: No input file specified, you need to specify '
          'which files to run on', file=sys.stderr)
    return False
示例#20
0
def task_run_core():
    """Run the tasklet named by the 'tasklet' task option.

    The tasklet is looked up in ``_TASKLETS`` and called with the
    'arguments' task option (default: no arguments) as keyword
    arguments.  Its return value is passed on, except that None is
    turned into True.
    """
    tasklet = task_get_option('tasklet')
    arguments = task_get_option('arguments', {})

    started_msg = 'Starting tasklet "%s" (with arguments %s)' % (
        tasklet, arguments)
    write_message(started_msg)
    task_update_progress('%s started' % tasklet)

    outcome = _TASKLETS[tasklet](**arguments)

    task_update_progress('%s finished' % tasklet)
    # Formatted after the call, like the start message was before it.
    finished_msg = 'Finished tasklet "%s" (with arguments %s)' % (
        tasklet, arguments)
    write_message(finished_msg)

    return True if outcome is None else outcome
示例#21
0
def task_run_core():
    """Run the bibsort action selected by the 'cmd' task option.

    Supported commands are 'load', 'dump', 'print', 'sort' and
    'rebalance'; 'sort' is used when no command is given.  The 'methods'
    and 'recids' task options are forwarded to the sorting functions.

    Returns the result of the executed action, or False for an unknown
    command (an error is then written to stderr).
    """
    write_message("bibsort starting..")

    cmd = task_get_option('cmd')
    methods = task_get_option('methods')
    recids = task_get_option('recids')
    write_message("Task parameters: command=%s ; methods=%s ; recids=%s"
                  % (cmd, methods, recids), verbose=2)

    executed_correctly = False

    # if no command is defined, run sorting
    if not cmd:
        cmd = 'sort'

    # Messages are built by implicit concatenation: a backslash
    # continuation inside the literal would embed the indentation of the
    # next source line in the emitted text.
    if cmd == 'load':
        write_message('Starting loading the configuration '
                      'from the cfg file to the db.',
                      verbose=5)
        executed_correctly = load_configuration()
        if executed_correctly:
            write_message('Loading completed.', verbose=5)
    elif cmd == 'dump':
        write_message('Starting dumping the configuration '
                      'from the db into the cfg file.',
                      verbose=5)
        executed_correctly = dump_configuration()
        if executed_correctly:
            write_message('Dumping completed.', verbose=5)
    elif cmd == 'print':
        executed_correctly = print_sorting_methods()
    elif cmd == 'sort':
        write_message('Starting sorting.', verbose=5)
        executed_correctly = update_sorting(methods, recids)
        if executed_correctly:
            write_message('Sorting completed.', verbose=5)
    elif cmd == 'rebalance':
        write_message('Starting rebalancing the sorting buckets.', verbose=5)
        executed_correctly = rebalance(methods)
        if executed_correctly:
            write_message('Rebalancing completed.', verbose=5)
    else:
        write_message(
            "This action is not possible. "
            "See the --help for available actions.", sys.stderr)

    write_message('bibsort exiting..')
    return executed_correctly
示例#22
0
文件: cli.py 项目: SCOAP3/invenio
def task_submit_elaborate_specific_parameter(key, value, opts, dummy):
    """Elaborate a specific parameter of CLI bibrank.

    Translates one command line option into the corresponding task
    option(s).

    @param key: the option name as given on the command line
    @param value: the option value (string)
    @param opts: list of all (option, value) pairs given
    @param dummy: unused
    @return: True when the option was recognised, False otherwise
    @raise StandardError: when --add is combined with --del, or when the
        --maxmem value is below base_process_size + 1000
    """
    if key in ("-a", "--add"):
        task_set_option("cmd", "add")
        if ("-x", "") in opts or ("--del", "") in opts:
            raise StandardError("--add incompatible with --del")
    elif key in ("--run", "-w"):
        task_set_option("run", [])
        task_get_option('run').extend(value.split(","))
    elif key in ("-r", "--repair"):
        task_set_option("cmd", "repair")
    elif key in ("-E", "--print-extcites"):
        try:
            extcites = int(value)
        except (TypeError, ValueError):
            extcites = 10  # default fallback value
        task_set_option("print-extcites", extcites)
        task_set_option("cmd", "print-missing")
    elif key in ("-A", "--author-citations"):
        task_set_option("author-citations", "1")
    elif key in ("-d", "--del"):
        task_set_option("cmd", "del")
    elif key in ("-k", "--check"):
        task_set_option("cmd", "check")
    elif key in ("-S", "--stat"):
        task_set_option("cmd", "stat")
    elif key in ("-i", "--id"):
        task_set_option("id", task_get_option("id") + split_ranges(value))
        task_set_option("last_updated", "")
    elif key in ("-c", "--collection"):
        task_set_option("collection", value)
    elif key in ("-R", "--rebalance"):
        task_set_option("quick", "no")
    elif key in ("-f", "--flush"):
        task_set_option("flush", int(value))
    elif key in ("-M", "--maxmem"):
        task_set_option("maxmem", int(value))
        if task_get_option("maxmem") < base_process_size + 1000:
            raise StandardError(
                "Memory usage should be higher than %d kB" %
                (base_process_size + 1000))
    elif key in ("-m", "--modified"):
        # value looks like e.g. "2002-10-27 13:57:26"
        task_set_option("modified", get_date_range(value))
        task_set_option("last_updated", "")
    elif key in ("-l", "--lastupdate"):
        task_set_option("last_updated", "last_updated")
    else:
        return False
    return True
示例#23
0
文件: task.py 项目: mhellmic/b2share
def check_options():
    """Validate the task options before the task is submitted.

    At least one of the 'new', 'modified', 'recids', 'collections' or
    'arxiv' options has to be set.

    Returns True when one of them is set; otherwise prints an error on
    stderr and returns False.
    """
    selectors = ('new', 'modified', 'recids', 'collections', 'arxiv')
    if any(task_get_option(selector) for selector in selectors):
        return True

    print('Error: No records specified, you need'
          ' to specify which files to run on', file=sys.stderr)
    return False
示例#24
0
def update_rule_last_run(rule_name):
    """
    Store the task starting time as the last run time of a rule. Meant to
    be called once a rule has been run.

    Nothing is written when the 'record_ids' option is present or when
    either of the 'no_upload' / 'no_tickets' options is truthy.
    """
    skip_update = (task_has_option('record_ids')
                   or task_get_option('no_upload', False)
                   or task_get_option('no_tickets', False))
    if skip_update:
        # We don't want to update the database in this case
        return

    rows_updated = run_sql(
        "UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
        (task_get_task_param('task_starting_time'), rule_name,))
    if rows_updated:
        return

    # rule not in the database yet, insert it
    run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
            (rule_name, task_get_task_param('task_starting_time')))
示例#25
0
def task_submit_check_options():
    """Set the 'sessions' option to True when no other action is selected.

    The 'sessions' option is enabled only if none of the options listed
    below is truthy. Always returns True.
    """
    other_actions = (
        'logs',
        'tempfiles',
        'guests',
        'bibxxx',
        'documents',
        'cache',
        'tasks',
        'check-tables',
        'optimise-tables',
    )
    if not any(task_get_option(action) for action in other_actions):
        task_set_option('sessions', True)
    return True
示例#26
0
文件: task.py 项目: mhellmic/b2share
def cb_parse_option(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    Translates one command line option into task options.

    @param key: the option name as given on the command line
    @param value: the option value (string)
    @param opts: all parsed options (unused here)
    @param args: standalone arguments; must be empty
    @return: always True
    @raise StandardError: when standalone arguments are present
    """
    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key in ('-i', '--inspire', ):
        task_set_option('inspire', True)
    elif key in ('--kb-reports', ):
        task_set_option('kb-reports', value)
    elif key in ('--kb-journals', ):
        task_set_option('kb-journals', value)
    elif key in ('--kb-journals-re', ):
        task_set_option('kb-journals-re', value)
    elif key in ('--kb-authors', ):
        task_set_option('kb-authors', value)
    elif key in ('--kb-books', ):
        task_set_option('kb-books', value)
    elif key in ('--kb-conferences', ):
        task_set_option('kb-conferences', value)
    elif key in ('--create-ticket', ):
        task_set_option('create-ticket', True)
    elif key in ('--no-overwrite', ):
        task_set_option('no-overwrite', True)
    elif key in ('--arxiv', ):
        # One-element tuple: without the trailing comma this would be a
        # substring test against the string '--arxiv'.
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-r', '--recids'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))

    return True
示例#27
0
def upload_amendments(records, holdingpen):
    """ Upload modified records by submitting a bibupload task.

    The records are serialised into one MARCXML collection, written to a
    temporary file under CFG_TMPSHAREDDIR and handed to bibupload.

    @param records: the records to upload; nothing happens when empty
    @param holdingpen: when truthy bibupload is called with "-o",
        otherwise with "-r"

    Does nothing when the 'no_upload' task option is truthy.
    """

    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml_parts = ['<collection xmlns="http://www.loc.gov/MARC21/slim">']
    xml_parts.extend(record_xml_output(record) for record in records)
    xml_parts.append("</collection>")
    xml = "".join(xml_parts)

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    try:
        os.write(tmp_file_fd, xml)
    finally:
        # Always release the descriptor, even if the write fails
        os.close(tmp_file_fd)
    os.chmod(tmp_file, 0o644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
示例#28
0
文件: daemon.py 项目: SCOAP3/invenio
def task_run_core():
    """
    Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.

    For every job detected from the 'wjob' option, the export method module
    is imported and its run_export_method() is called with the job name.

    Returns True when every job ran without error, False otherwise.
    """
    errors_encountered_p = False
    jobnames = _detect_jobs_to_run(task_get_option('wjob'))
    for jobname in jobnames:
        jobname_export_method = _detect_export_method(jobname)
        if not jobname_export_method:
            write_message("ERROR: cannot detect export method for job %s." % jobname, sys.stderr)
            errors_encountered_p = True
        else:
            try:
                # every bibexport method module must define
                # run_export_method() that will do the job; import it by
                # name instead of exec'ing a generated import statement
                export_module = __import__(
                    "invenio.bibexport_method_%s" % jobname_export_method,
                    fromlist=["run_export_method"])
                run_export_method = export_module.run_export_method
                write_message("started export job " + jobname, verbose=3)
                run_export_method(jobname)
                _update_job_lastrun_time(jobname)
                write_message("finished export job " + jobname, verbose=3)
            except Exception as msg:
                write_message("ERROR: cannot run export job %s: %s." % (jobname, msg), sys.stderr)
                errors_encountered_p = True
    return not errors_encountered_p
def solr_add_ranges(id_ranges):
    """Index the given record id ranges, split into 'flush'-sized chunks.

    Each (lower, upper) range is cut into sub-ranges whose length is the
    value of the 'flush' task option; the sub-ranges are then processed
    in reverse order through solr_add_range(), followed by a final commit.
    """
    chunk_size = task_get_option("flush")
    chunks = []
    for id_range in id_ranges:
        range_end = id_range[1]
        chunk_start = id_range[0]
        while chunk_start <= range_end:
            chunk_end = min(chunk_start + chunk_size - 1, range_end)
            chunks.append((chunk_start, chunk_end))
            chunk_start += chunk_size

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    commit_counter = 0
    for chunk_low, chunk_high in reversed(chunks):
        progress = "Solr ranking indexer called for %s-%s" % (chunk_low, chunk_high)
        write_message(progress)
        task_update_progress(progress)
        commit_counter = solr_add_range(chunk_low, chunk_high, tags_to_index, commit_counter)

    solr_commit_if_necessary(commit_counter, final_commit=True)
示例#30
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options.

    Returns False (after logging an error) when 'recids' or 'collections'
    is set while 'taxonomy' is not; True otherwise.
    """
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    # A taxonomy is only mandatory when a recid or a collection is given.
    selection_given = recids is not None or collections is not None
    if not selection_given or taxonomy is not None:
        return True

    bibtask.write_message("ERROR: When specifying a record ID or a collection, "
                          "you have to precise which\ntaxonomy to use.",
                          stream=sys.stderr,
                          verbose=0)
    return False
示例#31
0
def print_missing(num):
    """
    Print the contents of rnkCITATIONDATAEXT table containing external
    records that were cited by NUM or more internal records.

    NUM is by default taken from the -E command line option (the
    'print-extcites' task option) when a falsy value is passed.
    """
    if not num:
        num = task_get_option("print-extcites")

    # Adjacent literals keep the source indentation out of the message
    write_message(
        "Listing external papers cited by %i or more "
        "internal records:" % num
    )

    res = run_sql(
        """SELECT COUNT(id_bibrec), extcitepubinfo
                     FROM rnkCITATIONDATAEXT
                     GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s
                     ORDER BY COUNT(id_bibrec) DESC""",
        (num,),
    )
    for cnt, brec in res:
        print(str(cnt), "\t", brec)

    write_message("Listing done.")
示例#32
0
def task_run_core():
    """
    Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.

    Each job detected from the 'wjob' option is run through the
    run_export_method() function of its export method module.

    Returns True when no error was encountered, False otherwise.
    """
    errors_encountered_p = False
    jobnames = _detect_jobs_to_run(task_get_option('wjob'))
    for jobname in jobnames:
        jobname_export_method = _detect_export_method(jobname)
        if not jobname_export_method:
            write_message(
                "ERROR: cannot detect export method for job %s." % jobname,
                sys.stderr)
            errors_encountered_p = True
        else:
            try:
                # every bibexport method module must define
                # run_export_method() that will do the job.  Importing the
                # module by name (rather than exec'ing an import
                # statement) binds the function to a real local name.
                method_module = __import__(
                    "invenio.bibexport_method_%s" % jobname_export_method,
                    fromlist=["run_export_method"])
                run_export_method = method_module.run_export_method
                write_message("started export job " + jobname, verbose=3)
                run_export_method(jobname)
                _update_job_lastrun_time(jobname)
                write_message("finished export job " + jobname, verbose=3)
            except Exception as msg:
                write_message(
                    "ERROR: cannot run export job %s: %s." % (jobname, msg),
                    sys.stderr)
                errors_encountered_p = True
    return not errors_encountered_p
def solr_commit_if_necessary(next_commit_counter,
                             final_commit=False,
                             recid=None):
    """Commit to Solr when the counter is full or on a final commit.

    A commit is attempted when the counter equals the 'flush' option
    minus one, or when final_commit is set and the counter is positive.

    Returns 0 after a commit attempt, otherwise the counter plus one.
    """
    # Counter full or final commit if counter set
    commit_due = next_commit_counter == task_get_option("flush") - 1 or (
        final_commit and next_commit_counter > 0)
    if not commit_due:
        return next_commit_counter + 1

    suffix = ' for recid=%s' % recid if recid else ''
    status_msg = 'Solr ranking indexer COMMITTING' + suffix
    write_message(status_msg)
    task_update_progress(status_msg)

    try:
        # Commits might cause an exception, most likely a
        # timeout while hitting a background merge
        # Changes will then be committed later by the
        # calling (periodical) task
        # Also, autocommits can be used in the solrconfig
        SOLR_CONNECTION.commit()
    except:
        register_exception(alert_admin=True)

    task_sleep_now_if_required(can_stop_too=True)
    return 0
示例#34
0
文件: arxiv.py 项目: Theer108/invenio
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the given date.

    Returns a list of (recid, modification_date) pairs, ordered by
    modification date, restricted to records having "arXiv" among their
    037__9 field values.
    """

    def is_arxiv(recid):
        """Tell whether one of the 037__9 values of recid is "arXiv"."""
        return any(value == "arXiv"
                   for value in get_fieldvalues(recid, "037__9"))

    # Fetch all records modified since the given date
    query = (
        "SELECT `id`, `modification_date` FROM `bibrec` "
        "WHERE `modification_date` >= %s "
        "ORDER BY `modification_date`"
    )
    records = []
    for recid, mod_date in run_sql(query, [date.isoformat()]):
        if is_arxiv(recid):
            records.append((recid, mod_date))

    # Show all records for debugging purposes
    if task_get_option("verbose") >= 9:
        write_message("recids:", verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
示例#35
0
文件: task.py 项目: osub3/invenio
def check_options():
    """Check the options before the task is submitted.

    Returns True as soon as one of the 'new', 'modified', 'recids',
    'collections' or 'arxiv' options is found to be set.  When none is
    set an error is printed on stderr and False is returned.
    """
    for option_name in ("new", "modified", "recids", "collections", "arxiv"):
        if task_get_option(option_name):
            return True

    error_text = "Error: No records specified, you need" " to specify which files to run on"
    print(error_text, file=sys.stderr)
    return False
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Trigger a Solr commit when due and return the updated counter.

    The commit is due when the counter has reached the 'flush' option
    minus one, or when final_commit is set and the counter is positive.
    After a commit attempt the returned counter is 0; otherwise it is the
    given counter incremented by one.
    """
    # Counter full or final commit if counter set
    if not (next_commit_counter == task_get_option("flush") - 1
            or (final_commit and next_commit_counter > 0)):
        return next_commit_counter + 1

    if recid:
        status_msg = 'Solr ranking indexer COMMITTING' + ' for recid=%s' % recid
    else:
        status_msg = 'Solr ranking indexer COMMITTING' + ''
    write_message(status_msg)
    task_update_progress(status_msg)

    try:
        # Commits might cause an exception, most likely a
        # timeout while hitting a background merge
        # Changes will then be committed later by the
        # calling (periodical) task
        # Also, autocommits can be used in the solrconfig
        SOLR_CONNECTION.commit()
    except:
        register_exception(alert_admin=True)

    task_sleep_now_if_required(can_stop_too=True)
    return 0
def solr_add_ranges(id_ranges):
    """Feed record id ranges to solr_add_range() in 'flush'-sized pieces.

    Every range of id_ranges is split into sub-ranges of at most 'flush'
    (task option) ids.  The sub-ranges are handled last-built first, and a
    final commit is requested once all of them have been processed.
    """
    piece_length = task_get_option("flush")
    pieces = []
    for id_range in id_ranges:
        low, high = id_range[0], id_range[1]
        cursor = low
        while cursor <= high:
            pieces.append((cursor, min(cursor + piece_length - 1, high)))
            cursor += piece_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    pieces = pieces[::-1]
    pending = 0
    for piece in pieces:
        piece_low, piece_high = piece
        status_msg = "Solr ranking indexer called for %s-%s" % (piece_low,
                                                                piece_high)
        write_message(status_msg)
        task_update_progress(status_msg)
        pending = solr_add_range(piece_low, piece_high, tags_to_index,
                                 pending)

    solr_commit_if_necessary(pending, final_commit=True)
示例#38
0
文件: daemon.py 项目: k3njiy/invenio
def task_run_core():
    """Execute the bibsort command stored in the 'cmd' task option.

    The 'methods' and 'recids' task options are passed on to the helpers
    that need them.  A missing command means 'sort'.

    Returns the result of the helper that was run; False for an unknown
    command.
    """
    write_message("bibsort starting..")

    cmd = task_get_option('cmd')
    methods = task_get_option('methods')
    recids = task_get_option('recids')
    write_message("Task parameters: command=%s ; methods=%s ; recids=%s"
                  % (cmd, methods, recids), verbose=2)

    executed_correctly = False

    # if no command is defined, run sorting
    if not cmd:
        cmd = 'sort'

    # The multi-line messages below use implicit literal concatenation so
    # that no source indentation leaks into the logged text.
    if cmd == 'load':
        write_message('Starting loading the configuration '
                      'from the cfg file to the db.', verbose=5)
        executed_correctly = load_configuration()
        if executed_correctly:
            write_message('Loading completed.', verbose=5)
    elif cmd == 'dump':
        write_message('Starting dumping the configuration '
                      'from the db into the cfg file.', verbose=5)
        executed_correctly = dump_configuration()
        if executed_correctly:
            write_message('Dumping completed.', verbose=5)
    elif cmd == 'print':
        executed_correctly = print_sorting_methods()
    elif cmd == 'sort':
        write_message('Starting sorting.', verbose=5)
        executed_correctly = update_sorting(methods, recids)
        if executed_correctly:
            write_message('Sorting completed.', verbose=5)
    elif cmd == 'rebalance':
        write_message('Starting rebalancing the sorting buckets.', verbose=5)
        executed_correctly = rebalance(methods)
        if executed_correctly:
            write_message('Rebalancing completed.', verbose=5)
    else:
        write_message("This action is not possible. "
                      "See the --help for available actions.", sys.stderr)

    write_message('bibsort exiting..')
    return executed_correctly
示例#39
0
文件: task.py 项目: osub3/invenio
def check_options():
    """Verify that the task was told which records to work on.

    Returns False, after printing an error on stderr, when none of the
    'new', 'modified', 'recids', 'collections' and 'arxiv' options is
    set; returns True otherwise.
    """
    record_selectors = ['new', 'modified', 'recids', 'collections', 'arxiv']
    nothing_selected = all(
        not task_get_option(selector) for selector in record_selectors)
    if nothing_selected:
        print('Error: No records specified, you need'
              ' to specify which files to run on',
              file=sys.stderr)
        return False

    return True
示例#40
0
文件: task.py 项目: osub3/invenio
def task_parse_options(key, value, opts, args):   # pylint: disable-msg=W0613
    """ Must be defined for bibtask to create a task

    Stores one command line option into the task options.  Options that
    accumulate values keep them in a set kept under the option name.
    Always returns True; raises StandardError on standalone arguments.
    """
    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def option_set(name):
        """Return the set stored under name, creating it when missing."""
        current = task_get_option(name)
        if not current:
            current = set()
            task_set_option(name, current)
        return current

    if key in ('-a', '--new'):
        task_set_option('new', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
    elif key in ('-c', '--collections'):
        collections = option_set('collections')
        for collection_name in value.split(","):
            collections.update(get_collection_reclist(collection_name))
    elif key in ('-i', '--recids'):
        option_set('recids').update(split_ids(value))
    elif key in ('--tickets',):
        tickets = option_set('tickets')
        for ticket_name in value.split(','):
            tickets.add(ticket_name.strip())
    elif key in ('--all-tickets',):
        task_set_option('all-tickets', True)
    elif key in ('-q', '--query'):
        option_set('query').add(value)
    elif key in ('-r', '--reportnumbers'):
        option_set('reportnumbers').add(value)
    return True
示例#41
0
def task_parse_options(key, value, opts, args):   # pylint: disable-msg=W0613
    """ Must be defined for bibtask to create a task

    Records the given command line option in the task options and
    returns True.  Standalone arguments are rejected with StandardError.
    """
    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def ensure_set(option_name):
        """Fetch the set kept under option_name, storing a new one if unset."""
        stored = task_get_option(option_name)
        if stored:
            return stored
        stored = set()
        task_set_option(option_name, stored)
        return stored

    if key in ('-a', '--new'):
        task_set_option('new', True)
        return True
    if key in ('-m', '--modified'):
        task_set_option('modified', True)
        return True
    if key in ('-c', '--collections'):
        reclist = ensure_set('collections')
        for name in value.split(","):
            reclist.update(get_collection_reclist(name))
        return True
    if key in ('-i', '--recids'):
        ensure_set('recids').update(split_ids(value))
        return True
    if key in ('--tickets',):
        wanted = ensure_set('tickets')
        for item in value.split(','):
            wanted.add(item.strip())
        return True
    if key in ('--all-tickets',):
        task_set_option('all-tickets', True)
        return True
    if key in ('-q', '--query'):
        ensure_set('query').add(value)
        return True
    if key in ('-r', '--reportnumbers'):
        ensure_set('reportnumbers').add(value)
    return True
示例#42
0
def citation_exec(rank_method_code, name, config):
    """Rank method for citation analysis

    When the 'cmd' option is "print-missing" only print_missing() is run.
    Otherwise the citation weights are computed and, when non-empty,
    stored through intoDB().
    """
    # first check if this is a specific task
    if task_get_option("cmd") == "print-missing":
        print_missing(task_get_option("num"))
        return

    dic, index_update_time = get_citation_weight(rank_method_code, config)
    if not dic:
        write_message("No need to update the indexes for citations.")
        return

    specific_records = (task_get_option("id")
                        or task_get_option("collection")
                        or task_get_option("modified"))
    if specific_records:
        # user have asked to citation-index specific records
        # only, so we should not update citation indexer's
        # last run time stamp information
        index_update_time = None
    intoDB(dic, index_update_time, rank_method_code)
示例#43
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options.

    The 'taxonomy' option is mandatory as soon as 'recids' or
    'collections' is given; an error is logged and False returned when it
    is missing in that case.  Returns True otherwise.
    """
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids is None and collections is None:
        return True
    if taxonomy is not None:
        return True

    message = ("ERROR: When specifying a record ID or a collection, "
               "you have to precise which\ntaxonomy to use.")
    bibtask.write_message(message, stream=sys.stderr, verbose=0)
    return False
示例#44
0
def check_records(rule, records):
    """
    Run the batch checker named by rule["check"] over a set of records.

    The rule is attached to every record first; the plugin's
    "check_records" callable is then invoked with the records and the
    rule's "checker_params" as keyword arguments, and its result returned.
    """
    available_plugins = task_get_option("plugins")
    for rec in records:
        rec.set_rule(rule)
    checker = available_plugins[rule["check"]]["check_records"]
    return checker(records, **rule["checker_params"])
示例#45
0
def citation_exec(rank_method_code, name, config):
    """Rank method for citation analysis

    Handles the "print-missing" command, or computes the citation weights
    and writes them with intoDB() when there is something to write.
    """
    print_only = task_get_option("cmd") == "print-missing"
    if print_only:
        # this is a specific task: just list the missing records
        num = task_get_option("num")
        print_missing(num)
    else:
        dic, index_update_time = get_citation_weight(rank_method_code, config)
        if not dic:
            write_message("No need to update the indexes for citations.")
        else:
            # user have asked to citation-index specific records
            # only, so we should not update citation indexer's
            # last run time stamp information
            if task_get_option("id") or task_get_option("collection") or \
                    task_get_option("modified"):
                index_update_time = None
            intoDB(dic, index_update_time, rank_method_code)
示例#46
0
def check_record(rule, record):
    """
    Run the checker named by rule["check"] on a single record.

    The rule is attached to the record; for a dummy record nothing else
    happens and None is returned, otherwise the result of the plugin's
    "check_record" callable is returned.
    """
    available_plugins = task_get_option("plugins")
    record.set_rule(rule)
    checker_plugin = available_plugins[rule["check"]]
    if record.is_dummy():
        return None
    return checker_plugin["check_record"](record, **rule["checker_params"])
示例#47
0
def task_run_core():
    """
    Main daemon task.

    Loads the records to process and, for each of them, runs every ticket
    plugin found in the 'tickets' task option: when the plugin's
    'check_record' accepts the record, a ticket is generated and submitted.

    Returns True when run successfully.  An error raised while handling a
    ticket is logged and re-raised; an invalid (falsy) plugin raises
    BibCatalogPluginException.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply), ),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    total_records = len(records_concerned)
    write_message("%i record(s) found" % (total_records, ))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress(
            "Processing records %s/%s (%i%%)" %
            (records_processed, total_records,
             int(float(records_processed) / total_records * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if not plugin:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name, ))
            write_message("Running template %s for %s" %
                          (ticket_name, recid),
                          verbose=5)
            try:
                ticket = BibCatalogTicket(recid=int(recid))
                if plugin['check_record'](ticket, record):
                    ticket = plugin['generate_ticket'](ticket, record)
                    write_message("Ticket to be generated: %s" %
                                  (ticket, ),
                                  verbose=5)
                    res = ticket.submit()
                    if res:
                        write_message("Ticket #%s created for %s" %
                                      (ticket.ticketid, recid))
                    else:
                        write_message("Ticket already exists for %s" %
                                      (recid, ))
                else:
                    # Format the message here: a second positional
                    # argument would not be interpolated into it.
                    write_message("Skipping record %s" % (recid, ))
            except Exception:
                write_message("Error submitting ticket for record %s:" %
                              (recid, ))
                write_message(traceback.format_exc())
                # bare raise keeps the original traceback
                raise

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")

    return True
示例#48
0
def submit_ticket(record, record_id):
    """ Submit the problems found in a record to bibcatalog.

    A ticket is opened in the queue given by the 'queue' task option
    (default "Bibcheck") with the subject as its text; the detailed report
    (errors, amendments, warnings and an edit link) is then added as a
    comment when a ticket id was returned.

    Does nothing when the 'no_tickets' task option is truthy.
    """

    if task_get_option("no_tickets", False):
        return

    template = """
Bibcheck found some problems with the record with id %s:

Errors:
%s

Amendments:
%s

Warnings:
%s

Edit this record: %s
"""
    report_fields = (
        record_id,
        "\n".join(record.errors),
        "\n".join(record.amendments),
        "\n".join(record.warnings),
        "%s/record/%s/edit" % (CFG_SITE_URL, record_id),
    )
    report = template % report_fields
    if isinstance(report, unicode):
        report = report.encode("utf-8")

    subject = "Bibcheck rule failed in record %s" % record_id

    ticket_id = BIBCATALOG_SYSTEM.ticket_submit(
        subject=subject,
        recordid=record_id,
        text=subject,
        queue=task_get_option("queue", "Bibcheck")
    )
    write_message("Bibcatalog returned %s" % ticket_id)
    if not ticket_id:
        return
    BIBCATALOG_SYSTEM.ticket_comment(None, ticket_id, report)
示例#49
0
def process_updates(rank_method_code):
    """
    Entry point executed when the task is started.

    When the 'quick' task option equals "no" the tables are rebuilt and
    the result of rebuild_tables() is returned.  Otherwise every concerned
    record goes through process_one(), the weights are stored and True is
    returned.
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    selfcites_config = read_configuration(rank_method_code)
    algorithm = selfcites_config.get(rank_method_code, "algorithm")
    friends_threshold = selfcites_config.get(rank_method_code,
                                             "friends_threshold")
    config = {
        'algorithm': algorithm,
        'friends_threshold': friends_threshold,
    }

    if task_get_option("quick") == "no":
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for position, recid in enumerate(recids, 1):
        task_sleep_now_if_required(can_stop_too=True)
        progress = "Extracting for %s (%d/%d)" % (recid, position, total)
        task_update_progress(progress)
        write_message(progress)

        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
示例#50
0
def task_submit_check_options():
    """Validate the 'tasklet' and 'arguments' task options.

    Returns True when a known tasklet was specified and
    check_arguments_compatibility() accepts its arguments. Otherwise an
    error is printed on stderr and False is returned.
    """
    tasklet = task_get_option('tasklet', None)
    arguments = task_get_option('arguments', {})

    if not tasklet:
        print('ERROR: no tasklet specified', file=sys.stderr)
        return False

    if tasklet not in _TASKLETS:
        unknown_msg = ('ERROR: "%s" is not a valid tasklet. Use '
                       '--list-tasklets to obtain a list of the working '
                       'tasklets.' % tasklet)
        print(unknown_msg, file=sys.stderr)
        return False

    try:
        check_arguments_compatibility(_TASKLETS[tasklet], arguments)
    except ValueError as err:
        wrong_args_msg = ('ERROR: wrong arguments (%s) specified for '
                          'tasklet "%s": %s' % (arguments, tasklet, err))
        print(wrong_args_msg, file=sys.stderr)
        return False

    return True
示例#51
0
def load_rules(plugins):
    """
    Load the rules and return a dict with the rules

    The configuration file name is taken from the "config" task option
    (default "rules.cfg") and resolved inside the "bibcheck/" directory
    of CFG_ETCDIR. Every section of that file is one rule. When the
    "enabled_rules" task option is set, only the sections it also
    contains are loaded.

    @param plugins: passed unchanged to load_rule() for every rule
    @return: dict {rule_name: rule} with every rule that could be
        loaded; rules raising RulesParseError are reported and skipped
    """
    config_name = task_get_option("config", "rules.cfg")
    filename = os.path.join(CFG_ETCDIR, "bibcheck/", config_name)

    config = RawConfigParser()
    # Use a context manager so the file handle is released as soon as
    # the parser has consumed it, instead of leaking until collection.
    with open(filename) as rules_file:
        config.readfp(rules_file)

    rule_names = config.sections()
    enabled = task_get_option("enabled_rules", None)
    if enabled is not None:
        rule_names = enabled.intersection(rule_names)

    rules = {}
    for rule_name in rule_names:
        try:
            rules[rule_name] = load_rule(config, plugins, rule_name)
        except RulesParseError as ex:
            # One broken rule must not prevent the others from loading;
            # report it both on stdout and in the task log.
            print(ex)
            write_message(ex)

    # The visible original fell off the end here and returned None.
    return rules
示例#52
0
def get_recids_for_rules(rules):
    """
    Work out, rule by rule, which record IDs have to be loaded.

    For every rule the candidate set comes from its "filter_pattern"
    search (if any). It is then narrowed either to the "record_ids" task
    option, when that option is set, or to the records modified since
    the rule last ran.

    @param rules dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return dict {rule_name: array of record IDs}
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}

    for rule_name, rule in rules.iteritems():
        if "filter_pattern" not in rule:
            # No filter configured: start from an intbitset built with
            # trailing_bits=True (presumably "every record" -- confirm
            # against the intbitset documentation).
            result = intbitset(trailing_bits=True)
        else:
            query = rule["filter_pattern"]
            collections = None
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            write_message("Performing given search query: '%s'" % query)

            limit = rule.get('filter_limit', 0)
            field = rule.get('filter_field', None)
            if collections:
                result = perform_request_search(p=query,
                                                of='intbitset',
                                                wl=limit,
                                                f=field,
                                                c=collections)
            else:
                result = search_pattern(p=query, wl=limit, f=field)

        if override_record_ids is None:
            # No explicit record list: restrict the rule to what changed
            # since its last run.
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if "consider_deleted_records" not in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             type='e')
            result.intersection_update(modified_recids)
        else:
            result.intersection_update(override_record_ids)

        recids[rule_name] = result

    return recids
示例#53
0
def get_recids_to_load():
    """
    Generates the final list of record IDs to load.

    Record IDs are collected from the "recids" task option, from the
    search given by the "query" option and from one reportnumber search
    per entry of the "reportnumbers" option; these are paired with a
    date of None. When the "new" and/or "modified" options are set, the
    records returned by get_all_new_records() /
    get_all_modified_records() since the last "bibcatalog" update are
    appended with the date those helpers supply.

    Returns a list of tuples like: (recid, date)
    """
    # Work on a private copy. The object returned by task_get_option()
    # may be the one stored as the task option, and extending it in
    # place would leak search results into that option (and pile them up
    # on repeated calls). list() also accepts a set or a None value.
    recids_given = list(task_get_option("recids", default=[]) or [])
    query_given = task_get_option("query")
    reportnumbers_given = task_get_option("reportnumbers")

    if query_given:
        write_message("Performing given search query: %s" % (query_given,))
        result = perform_request_search(p=query_given,
                                        of='id',
                                        rg=0,
                                        wl=0)
        recids_given.extend(result)

    if reportnumbers_given:
        write_message("Searching for records referring to given reportnumbers")
        for reportnumber in reportnumbers_given:
            result = perform_request_search(p='reportnumber:%s' % (reportnumber,),
                                            of='id',
                                            rg=0,
                                            wl=0)
            recids_given.extend(result)

    # Explicitly requested records carry no date.
    recids_to_load = [(recid, None) for recid in recids_given]

    last_id, last_date = fetch_last_updated(name="bibcatalog")
    records_found = []
    if task_get_option("new", default=False):
        records_found.extend(get_all_new_records(since=last_date, last_id=last_id))
    if task_get_option("modified", default=False):
        records_found.extend(get_all_modified_records(since=last_date, last_id=last_id))

    # Normalise every found row to a (recid, date) tuple.
    recids_to_load.extend((recid, date) for recid, date in records_found)
    return recids_to_load
示例#54
0
def cb_parse_option(key, value, opts, args):
    """Parse one command line option.

    Only -i / --id is handled here: the IDs produced by
    split_cli_ids_arg(value) are added to the set kept in the "recids"
    task option. Any standalone argument is rejected with StandardError.
    Returns True in every other case.
    """
    if args:
        # There should be no standalone arguments
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key not in ('-i', '--id'):
        return True

    recids = task_get_option('recids')
    if not recids:
        # First -i/--id seen: register the set before filling it.
        recids = set()
        task_set_option('recids', recids)
    recids.update(split_cli_ids_arg(value))

    return True
示例#55
0
def task_run_core(name, func, extra_vars=None, post_process=None):
    """Run func over the records concerned by the task called name.

    The records returned by fetch_concerned_records() are handed to
    process_records() together with func and extra_vars. When the
    "arxiv" task option is set, a second pass is made over
    fetch_concerned_arxiv_records() under the name "<name>:arxiv", with
    extra_vars['_arxiv'] set to True. Finally post_process, if given, is
    called with extra_vars as keyword arguments.

    When the "task_specific_name" option is set it is appended to name
    as "<name>:<task_specific_name>". Always returns True.
    """
    specific_name = task_get_option('task_specific_name')
    if specific_name:
        name = "%s:%s" % (name, specific_name)
    write_message("Starting %s" % name)

    extra_vars = {} if extra_vars is None else extra_vars

    process_records(name, fetch_concerned_records(name), func, extra_vars)

    if task_get_option('arxiv'):
        # NOTE(review): this writes into the dict supplied by the
        # caller, and the flag is still set when post_process runs.
        extra_vars['_arxiv'] = True
        arxiv_name = "%s:arxiv" % name
        arxiv_records = fetch_concerned_arxiv_records(arxiv_name)
        process_records(arxiv_name, arxiv_records, func, extra_vars)

    if post_process:
        post_process(**extra_vars)

    write_message("Complete")
    return True