def _run_update_authornames_tables_from_paper(record_ids=None, all_records=False):
    '''
    Runs the update on the papers that have been modified since the last run.

    @note: This should be run as often as possible to keep the authornames and
           authornames_bibrefs cache tables up to date.
    '''
    if not all_records and not record_ids:
        last_log = get_user_log(userinfo='daemon', action='UATFP', only_most_recent=True)
        if len(last_log) >= 1:
            #select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(
                date=last_log[0][2])
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper',
                            timestamp=min_date[0][0])

            if not recently_modified:
                bibtask.write_message("update_authornames_tables_from_paper: "
                                      "All names up to date.",
                                      stream=sys.stdout, verbose=0)
            else:
                bibtask.write_message("update_authornames_tables_from_paper: Running on %s papers " % str(len(recently_modified)), stream=sys.stdout, verbose=0)
                update_authornames_tables_from_paper(recently_modified)
        else:
            #This is the first time the utility is run; process all papers.
            #@todo: the log should rather be written by the initial
            #       authornames population.
            recently_modified, min_date = get_papers_recently_modified()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper',
                            timestamp=min_date[0][0])
            bibtask.write_message("update_authornames_tables_from_paper: "
                                  "Running on %s papers."
                                  % len(recently_modified),
                                  stream=sys.stdout, verbose=0)
            update_authornames_tables_from_paper(recently_modified)
    else:
        bibtask.write_message("update_authornames_tables_from_paper: Running "
                              "on all papers ",
                              stream=sys.stdout, verbose=0)
        update_authornames_tables_from_paper(record_ids)
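
# The function above follows the daemon's incremental-update pattern: read the
# timestamp of the last run from the user log, fetch only the papers modified
# since then, and write a new log entry (stamped with the minimum modification
# date) as the watermark for the next run. A minimal sketch of that pattern,
# factored out for illustration only -- the helper name `_incremental_run` is
# hypothetical and not part of the daemon:
def _incremental_run(action, comment, update_fnc):
    last_log = get_user_log(userinfo='daemon', action=action,
                            only_most_recent=True)
    if last_log:
        # Continue from the timestamp recorded by the previous run.
        modified, min_date = get_papers_recently_modified(date=last_log[0][2])
    else:
        # No watermark yet: fall back to processing everything.
        modified, min_date = get_papers_recently_modified()
    insert_user_log('daemon', '-1', action, 'bibsched', 'status',
                    comment=comment, timestamp=min_date[0][0])
    if modified:
        update_fnc(modified)
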
def _run_authornames_tables_gc():
    '''
    Runs the garbage collector on the authornames tables to get rid of
    deleted bibrefs in the respective author tables.
    '''
    insert_user_log('daemon', '-1', 'ANTGC', 'bibsched', 'status',
                    comment='bibauthorid_daemon, authornames_tables_gc')
    authornames_tables_gc()
def _run_update_personID_table_from_paper(record_ids=None, all_records=False):
    """
    Runs the update on the papers which have been modified since the last run
    This is removing no-longer existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: quite resource intensive.
    """
    if not record_ids and not all_records:
        last_log = get_user_log(userinfo="daemon", action="UPITFP", only_most_recent=True)
        if len(last_log) >= 1:
            # select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2])
            insert_user_log(
                "daemon",
                "-1",
                "UPITFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, update_personID_table_from_paper",
                timestamp=min_date[0][0],
            )

            if not recently_modified:
                bibtask.write_message(
                    "update_personID_table_from_paper: " "All person entities up to date.", stream=sys.stdout, verbose=0
                )
            else:
                bibtask.write_message(
                    "update_personID_table_from_paper: Running on: " + str(recently_modified),
                    stream=sys.stdout,
                    verbose=0,
                )
                update_personID_table_from_paper(recently_modified)
        else:
            # This branch should normally not be reached: the authornames
            # population writes the appropriate log. If the log is missing
            # anyway, process everything.
            recently_modified, min_date = get_papers_recently_modified()
            insert_user_log(
                "daemon",
                "-1",
                "UPITFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, update_personID_table_from_paper",
                timestamp=min_date[0][0],
            )
            bibtask.write_message(
                "update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0
            )
            update_personID_table_from_paper(recently_modified)
        # @todo: develop a method that removes the respective VAs from the
        # database as well, since no reference will be left for them. VAs can
        # be found by searching for the authornames ID in the VA table. The
        # method has to kill RA data based on the VA (cf. del_ra_data_by_vaid
        # in ra utils as a reference), all VA2RA links, all VA data, all VAs
        # and finally all doclist refs that point to the respective bibrefs.
        # (A hedged sketch of such a method follows this function.)
    else:
        update_personID_table_from_paper(record_ids)
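
# The @todo above outlines the cleanup still missing for orphaned virtual
# authors (VAs). A hedged sketch of such a method, assembled from the helpers
# this file already uses for per-record VA removal (see
# _update_authorid_universe below); the name `_purge_virtual_authors` is
# hypothetical and the sketch is untested:
def _purge_virtual_authors(va_ids):
    for va_id in va_ids:
        # Detach the VA from every real author (RA) it contributed to and
        # drop the RA data that was derived from it.
        for ra_id in get_realauthors_by_virtuala_id(va_id):
            remove_va_from_ra(ra_id, va_id)
            del_ra_data_by_vaid(ra_id, va_id)
        # Finally remove the VA itself (its data and its record).
        delete_virtual_author(va_id)
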
def _run_update_personID_table_from_paper():
    '''
    Runs the update on the papers that have been modified since the last run.
    It removes no-longer-existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: Quite resource intensive.
    '''
    last_log = get_user_log(userinfo='daemon',
                            action='UPITFP',
                            only_most_recent=True)
    if len(last_log) >= 1:
        #select only the most recent papers
        recently_modified, min_date = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log(
            'daemon',
            '-1',
            'UPITFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_personID_table_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_personID_table_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_personID_table_from_paper(recently_modified)
    else:
        # This branch should normally not be reached: the authornames
        # population writes the appropriate log. If the log is missing
        # anyway, process everything.
        recently_modified, min_date = get_papers_recently_modified()
        insert_user_log(
            'daemon',
            '-1',
            'UPITFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_personID_table_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_personID_table_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_personID_table_from_paper(recently_modified)
def _run_update_authornames_tables_from_paper():
    '''
    Runs the update on the papers that have been modified since the last run.

    @note: This should be run as often as possible to keep the authornames and
           authornames_bibrefs cache tables up to date.
    '''
    last_log = get_user_log(userinfo='daemon',
                            action='UATFP',
                            only_most_recent=True)
    if len(last_log) >= 1:
        #select only the most recent papers
        recently_modified, min_date = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log(
            'daemon',
            '-1',
            'UATFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_authornames_tables_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_authornames_tables_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_authornames_tables_from_paper(recently_modified)
    else:
        #This is the first time the utility is run; process all papers.
        #@todo: the log should rather be written by the initial
        #       authornames population.
        recently_modified, min_date = get_papers_recently_modified()
        insert_user_log(
            'daemon',
            '-1',
            'UATFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_authornames_tables_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_authornames_tables_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_authornames_tables_from_paper(recently_modified)
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to consider in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    bconfig.LOGGER.warning("Unexpected: several bibrefs for "
                                           "one author name on one paper. "
                                           "Using the first one.")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except ValueError:
                                # This name's ID is not in the list; ignore.
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

#    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout, verbose=0)
            return 0

        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')


        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                  process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message("Reading Grid Job results and will write"
                              " them to the database.",
                              stream=sys.stdout, verbose=0)

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper()
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe()
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames "
                              "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper()
        bibtask.write_message("update-cache: Finished cleaning PersonID"
                              " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    return 1
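
# _task_run_core is the callback that bibsched invokes after the task's
# options have been parsed. A hedged sketch of how such a daemon is typically
# registered via bibtask.task_init -- keyword names follow the Invenio 1.x
# convention, but check your bibtask module before relying on them:
def main():
    bibtask.task_init(authorization_action='runbibauthorid',  # assumed name
                      description='Bibauthorid daemon',
                      specific_params=('', []),
                      task_run_fnc=_task_run_core)
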
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option("lastname")
    process_all = bibtask.task_get_option("process_all")
    prepare_grid = bibtask.task_get_option("prepare_grid")
    load_grid = bibtask.task_get_option("load_grid_results")
    data_dir = bibtask.task_get_option("data_dir")
    prefix = bibtask.task_get_option("prefix")
    max_records_option = bibtask.task_get_option("max_records")
    update = bibtask.task_get_option("update")
    clean_cache = bibtask.task_get_option("clean_cache")
    update_cache = bibtask.task_get_option("update_cache")
    record_ids = bibtask.task_get_option("record_ids")
    record_ids_nested = None
    all_records = bibtask.task_get_option("all_records")
    repair_pid = bibtask.task_get_option("repair_pid")
    fast_update_personid = bibtask.task_get_option("fast_update_personid")

    if record_ids:
        record_ids_nested = [[p] for p in record_ids]

    if fast_update_personid:
        fast_update_personid = [[p] for p in fast_update_personid]
    #    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0)
            return 0

        if lengths["names"] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Authornames table.")
            populate_authornames()
            insert_user_log(
                "daemon",
                "-1",
                "UATFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, " "update_authornames_tables_from_paper",
            )

        if lengths["bibrefs"] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Bibrefs lookup table.")
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress("Processing all authors.")
        start_full_disambiguation(
            last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True
        )
        update_personID_from_algorithm()
        insert_user_log(
            "daemon", "-1", "update_aid", "bibsched", "status", comment="bibauthorid_daemon, update_authorid_universe"
        )

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message(
            "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0
        )

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: Processing recently" " updated papers")
        _run_update_authornames_tables_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: DONE")

    if update:
        bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("updating authorid universe")
        _update_authorid_universe(record_ids, all_records)
        bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("done updating authorid universe")

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for names")
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for persons")
        _run_update_personID_table_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: DONE")

    if repair_pid:
        bibtask.task_update_progress("Updating names cache...")
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Removing person entities not touched by " "humans...")
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Updating person entities...")
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Cleaning person tables...")
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("All repairs done.")

    if fast_update_personid:
        bibtask.task_update_progress("Updating personid...")
        _run_personid_fast_assign_papers(fast_update_personid)
        bibtask.task_update_progress("Update finished...")
        # TODO: remember to pass the papers list!
    return 1
def _run_personid_fast_assign_papers(paperslist):
    insert_user_log(
        "daemon", "-1", "PFAP", "bibsched", "status", comment="bibauthorid_daemon, personid_fast_assign_papers"
    )
    personid_fast_assign_papers(paperslist)
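
# Note the expected shape of `paperslist`: as the extended _task_run_core
# above shows, record IDs are wrapped one per sub-list before being passed
# down, e.g. (illustrative record IDs):
#
#     _run_personid_fast_assign_papers([[101], [102], [103]])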