def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Looks up the INSPIRE ID (subfield 'i' of the 100/700 fields) that is
    attached to the name of a virtual author on the author's record.

    If the runtime configuration flag "populate_aid_from_personid" is set,
    nothing is read and True is returned right away.

    If a real author ID greater than -1 is given, a found INSPIRE ID is stored
    for that real author under the tag "inspireid" and True is returned.
    Otherwise the looked-up value itself is handed back to the caller.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set (or the runtime shortcut applies) OR the
        first INSPIRE ID found OR the empty lookup result if there is none
    @rtype: True or the value produced by get_field_values_on_condition
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    # Defaults that are kept when the virtual author lacks the given tag.
    tag_values = {"bibrec_id": "", "orig_authorname_id": ""}

    for record in get_virtualauthor_records(va_id):
        if record['tag'] in tag_values:
            tag_values[record['tag']] = record['value']

    recid = tag_values["bibrec_id"]
    name_strings = get_name_and_db_name_strings(
        tag_values["orig_authorname_id"])

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                        % (va_id, name_strings["name"], recid))

    inspireid = get_field_values_on_condition(recid, ['100', '700'], 'i', 'a',
                                              name_strings["db_name"], "==")

    if inspireid:
        # Only the first of the returned values is used.
        inspireid = list(inspireid)[0]

    if not (ra_id > -1):
        return inspireid

    if inspireid:
        set_realauthor_data(ra_id, "inspireid", "%s" % inspireid)

    return True
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the 'cites' values of the record a virtual author belongs to.

    If the runtime configuration flag "populate_aid_from_personid" is set,
    nothing is read and True is returned right away.

    If a real author ID greater than -1 is given, every citation found is
    stored for that real author under the tag "outgoing_citation" and True
    is returned. Otherwise the citations themselves are returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set (or the runtime shortcut applies) OR the
        citation values found for the record
    @rtype: True or the value produced by get_field_values_on_condition
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    # Defaults that are kept when the virtual author lacks the given tag.
    tag_values = {"bibrec_id": "", "orig_authorname_id": -1}

    for record in get_virtualauthor_records(va_id):
        if record['tag'] in tag_values:
            tag_values[record['tag']] = record['value']

    recid = tag_values["bibrec_id"]

    bconfig.LOGGER.info("| Reading citation info for va %s: %s recid %s"
                        % (va_id, tag_values["orig_authorname_id"], recid))

    cites = get_field_values_on_condition(recid, 'cites')

    if not (ra_id > -1):
        return cites

    # A falsy lookup result (nothing found) simply stores nothing.
    for cite in cites or ():
        set_realauthor_data(ra_id, "outgoing_citation", "%s" % (cite))

    return True
Пример #3
0
def get_information_from_dataset(va_id, ra_id= -1):
    '''
    Retrieves information about the data
    of a virtual author from the data set.

    The values of the 'a' subfield of the 100/700 fields that differ from the
    database name of the virtual author are read from the author's record.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', the data read from the record will be returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data read from the record
    @rtype: True if ra_id > -1 or the value produced by
        get_field_values_on_condition
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Initialised up front so that a virtual author without an
    # "orig_authorname_id" record does not raise an UnboundLocalError below.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                  % (va_id, authorname_strings["name"], bibrec_id))

    data = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    if ra_id > -1:
        # Placeholder payload: a fixed tag/value pair is stored; the data
        # read above is not written to the real author here.
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag", "module_value %s"
                            % (formatted))

        return True
    else:
        return data
Пример #4
0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the data
    of a virtual author from the data set.

    The values of the 'a' subfield of the 100/700 fields that differ from the
    database name of the virtual author are read from the author's record.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', the data read from the record will be returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data read from the record
    @rtype: True if ra_id > -1 or the value produced by
        get_field_values_on_condition
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Initialised up front so that a virtual author without an
    # "orig_authorname_id" record does not raise an UnboundLocalError below.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s" %
                        (va_id, authorname_strings["name"], bibrec_id))

    data = get_field_values_on_condition(bibrec_id, ['100', '700'], 'a', 'a',
                                         authorname_strings["db_name"], "!=")

    if ra_id > -1:
        # Placeholder payload: a fixed tag/value pair is stored; the data
        # read above is not written to the real author here.
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag",
                            "module_value %s" % (formatted))

        return True
    else:
        return data
Пример #5
0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the 'cites' values of the record a virtual author belongs to.

    If a real author ID greater than -1 is given, every citation found is
    stored for that real author under the tag "outgoing_citation" and True
    is returned. Otherwise the citations themselves are returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the citation values found
    @rtype: True if ra_id > -1 or the value produced by
        get_field_values_on_condition
    '''
    name_id = -1
    recid = ""

    for entry in get_virtualauthor_records(va_id):
        entry_tag = entry['tag']

        if entry_tag == "bibrec_id":
            recid = entry['value']
        elif entry_tag == "orig_authorname_id":
            name_id = entry['value']

    log_line = "| Reading citation info for va %s: %s recid %s" % (
        va_id, name_id, recid)
    bconfig.LOGGER.info(log_line)

    cites = get_field_values_on_condition(recid, 'cites')

    if not (ra_id > -1):
        return cites

    # A falsy lookup result (nothing found) simply stores nothing.
    for cite in cites or ():
        set_realauthor_data(ra_id, "outgoing_citation", "%s" % (cite))

    return True
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    The function returns early (after writing "Update authorid: Nothing to
    do") if there is no previous 'update_aid' log entry of the daemon or if
    no paper has been modified since that entry.
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Filter the document list once; it is used for the log message and
        # for the processing loop.
        selected_docs = [row for row in dat.DOC_LIST
                         if row['bibrecid'] in bibrec_ids]

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (len(selected_docs)))

        for docs in selected_docs:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                # NOTE(review): -1 is truthy, so without any bibrefrec the
                # first branch below is still taken and -1 is passed on.
                refrec = -1

                if len(refrecs) > 1:
                    # Call form prints the same text on Python 2 and 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        # Remember the time of this run for the next incremental update.
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                # Known name: remember the additional record.
                for upd in author_in_list:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    total_lnames = len(author_last_names)

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        # Each last-name cluster is processed on a freshly loaded mem cache.
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # Report the records of *this* author: the loop variable
                # `rec` still holds a record from an earlier loop here.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'],
                                        current_author['records']))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        # NOTE(review): the lookup result is removed from the
                        # name ID list as-is; confirm it is a plain ID and
                        # not a list of records.
                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                # Collect the IDs of all real authors touched in this cluster.
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update takes the real author IDs as one-element tuples.
    personid_ra_format = [(ra_id,) for ra_id in updated_ras]

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def init_va_process_queue(mode="updated"):
    '''
    Initializes the virtual author process queue with all virtual authors
    that are not connected (orphaned) or updated.

    Unless the runtime configuration flag "populate_aid_from_personid" is
    set, the virtual authors are queued in descending order of the amount of
    information available for them: one point each for affiliations,
    coauthors and a collaboration found on the author's record. With the
    flag set, the virtual authors are queued without any scoring.

    @param mode: Specifies the mode of operation regarding which data to use.
        Modes are: 'orphaned' or 'updated' (default)
    @type mode: string
    '''
    bconfig.LOGGER.log(25, "Initializing processing queue")

    if not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        bconfig.LOGGER.info("Clearing VA Process Queue")
        dat.VIRTUALAUTHOR_PROCESS_QUEUE.queue.clear()

    # Maps virtual author ID -> information score (starts at 0).
    va_nosort = {}

    if mode == "updated":
        for va_entry in [row['virtualauthorid'] for row in
                         dat.VIRTUALAUTHOR_DATA
                         if ((row['tag'] == 'updated')
                             and (row['value'] == 'True'))]:
            va_nosort[va_entry] = 0
    elif mode == "orphaned":
        for va_entry in [row['virtualauthorid'] for row in
                         dat.VIRTUALAUTHOR_DATA
                         if ((row['tag'] == 'connected')
                             and (row['value'] == 'False'))]:
            va_nosort[va_entry] = 0

    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        for va_entry in va_nosort:
            dat.VIRTUALAUTHOR_PROCESS_QUEUE.put(va_entry)

        bconfig.LOGGER.log(25, "Done with queue initialization.")

        return

    for va_id in va_nosort:
        va_data = get_virtualauthor_records(va_id)
        authorname_string = ""
        bibrec_id = ""

        for va_data_item in va_data:
            if va_data_item['tag'] == "bibrec_id":
                bibrec_id = va_data_item['value']
            elif va_data_item['tag'] == "orig_name_string":
                authorname_string = va_data_item['value']

        # Score exactly once per virtual author, after the record ID and the
        # name string are known. Scoring inside the loop above would repeat
        # the lookups for every unrelated tag and inflate the score.
        affiliations = get_field_values_on_condition(bibrec_id,
                        ['100', '700'], 'u', 'a', authorname_string)
        coauthors = get_field_values_on_condition(bibrec_id,
                        ['100', '700'], 'a', 'a', authorname_string, "!=")
        collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

        if affiliations:
            va_nosort[va_id] += 1

        if coauthors:
            va_nosort[va_id] += 1

        if collaboration:
            va_nosort[va_id] += 1

    # Virtual authors with the most information are processed first.
    for va_entry in sorted(va_nosort.items(), key=itemgetter(1), reverse=True):
        dat.VIRTUALAUTHOR_PROCESS_QUEUE.put(va_entry[0])

    bconfig.LOGGER.log(25, "Done with queue initialization.")
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    The function returns early (after writing "Update authorid: Nothing to
    do") if there is no previous 'update_aid' log entry of the daemon or if
    no paper has been modified since that entry.
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Filter the document list once; it is used for the log message and
        # for the processing loop.
        selected_docs = [
            row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids
        ]

        bconfig.LOGGER.log(
            25, "Creating minimal virtual authors for "
            "all loaded docs (%s)" % (len(selected_docs)))

        for docs in selected_docs:
            for author_id in docs['authornameids']:
                author_name = [
                    an['name'] for an in dat.AUTHOR_NAMES
                    if an['id'] == author_id
                ]
                refrecs = [
                    ref[1] for ref in docs['authornameid_bibrefrec']
                    if ref[0] == author_id
                ]
                # NOTE(review): -1 is truthy, so without any bibrefrec the
                # first branch below is still taken and -1 is passed on.
                refrec = -1

                if len(refrecs) > 1:
                    # Call form prints the same text on Python 2 and 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        # Remember the time of this run for the next incremental update.
        insert_user_log('daemon',
                        '-1',
                        'update_aid',
                        'bibsched',
                        'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records." %
                              (len(recently_modified)),
                              stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout,
                                  verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout,
                              verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout,
                          verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'],
                                                    "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [
                row for row in authors if row['db_name'] == rec_author
            ]

            if author_in_list:
                # Known name: remember the additional record.
                for upd in author_in_list:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({
                    'db_name': rec_author,
                    'records': [rec],
                    'last_name': last_name
                })

    total_lnames = len(author_last_names)

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [
            row for row in authors if row['last_name'] == author_last_name
        ]
        total_authors = len(current_authors)
        bibtask.task_update_progress(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors))
        bibtask.write_message(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors),
            stream=sys.stdout,
            verbose=0)
        # Each last-name cluster is processed on a freshly loaded mem cache.
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(
            25, "-- Relevant data successfully read into memory"
            " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [
                row['id'] for row in dat.AUTHOR_NAMES
                if row['db_name'] == current_author['db_name']
            ]

            if not authornamesid:
                # Report the records of *this* author: the loop variable
                # `rec` still holds a record from an earlier loop here.
                bconfig.LOGGER.error(
                    "The author '%s' rec '%s' is not in authornames "
                    "and will be skipped. You might want "
                    "to run authornames update before?" %
                    (current_author['db_name'], current_author['records']))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped." %
                                     (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        # NOTE(review): the lookup result is removed from the
                        # name ID list as-is; confirm it is a plain ID and
                        # not a list of records.
                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [
                                row['authornameids'] for row in dat.DOC_LIST
                                if row['bibrecid'] == rec
                        ]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(
            25, "-- Computation finished. Will write back to "
            "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                # Collect the IDs of all real authors touched in this cluster.
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update takes the real author IDs as one-element tuples.
    personid_ra_format = [(ra_id, ) for ra_id in updated_ras]

    bconfig.LOGGER.log(
        25, "Will now run personid update to make the "
        "changes visible also on the front end and to "
        "create person IDs for %s newly created and changed "
        "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(
        25, "Done updating everything. Thanks for flying "
        "with bibauthorid!")
Пример #9
0
def init_va_process_queue(mode="updated"):
    '''
    Fills the virtual author process queue with all virtual authors that are
    either flagged as updated or not connected (orphaned), depending on the
    given mode. Any other mode leaves the queue empty.

    The virtual authors are queued in descending order of the amount of
    information available for them: one point each for affiliations,
    coauthors and a collaboration found on the author's record.

    @param mode: Specifies the mode of operation regarding which data to use.
        Modes are: 'orphaned' or 'updated' (default)
    @type mode: string
    '''
    bconfig.LOGGER.log(25, "Initializing processing queue")

    process_queue = dat.VIRTUALAUTHOR_PROCESS_QUEUE

    if not process_queue.empty():
        bconfig.LOGGER.info("Clearing VA Process Queue")
        process_queue.queue.clear()

    # (tag, value) pair a data row must carry to be selected for the mode.
    if mode == "updated":
        wanted = ('updated', 'True')
    elif mode == "orphaned":
        wanted = ('connected', 'False')
    else:
        wanted = None

    # Maps virtual author ID -> information score (starts at 0).
    scores = {}

    if wanted is not None:
        for row in dat.VIRTUALAUTHOR_DATA:
            if row['tag'] == wanted[0] and row['value'] == wanted[1]:
                scores[row['virtualauthorid']] = 0

    for va_id in scores:
        name_string = ""
        recid = ""

        for item in get_virtualauthor_records(va_id):
            item_tag = item['tag']

            if item_tag == "bibrec_id":
                recid = item['value']
            elif item_tag == "orig_name_string":
                name_string = item['value']

        # The loop above never breaks, so the scoring always runs once per
        # virtual author after all of its records have been inspected.
        affiliations = get_field_values_on_condition(
            recid, ['100', '700'], 'u', 'a', name_string)
        coauthors = get_field_values_on_condition(
            recid, ['100', '700'], 'a', 'a', name_string, "!=")
        collaboration = get_field_values_on_condition(recid, "710", "g")

        for information in (affiliations, coauthors, collaboration):
            if information:
                scores[va_id] += 1

    # Virtual authors with the most information are processed first.
    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)

    for ranked_entry in ranked:
        process_queue.put(ranked_entry[0])

    bconfig.LOGGER.log(25, "Done with queue initialization.")
Пример #10
0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', nothing is written and the looked-up values are returned instead.

    A collaboration found on the record takes precedence over the coauthor
    list, both when storing and when returning. If more than MAX_COAUTHORS
    coauthors are found, a single hash of the coauthor set is stored instead
    of one entry per coauthor.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set; otherwise the collaboration lookup result
        if it is non-empty, else the coauthor lookup result
    @rtype: True if ra_id > -1, otherwise whatever
        get_field_values_on_condition returned
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    author_name = authorname_strings["name"]

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s" %
                        (va_id, author_name, bibrec_id))

    coauthors = get_field_values_on_condition(bibrec_id, ['100', '700'], 'a',
                                              'a',
                                              authorname_strings["db_name"],
                                              "!=")

    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    # -1 is the "no real author" sentinel; 0 counts as a valid real author ID.
    store_on_realauthor = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not store_on_realauthor:
        # Compare against the sentinel instead of using truthiness:
        # `not ra_id` is False for -1 and True only for the valid ID 0, so
        # the summary would never be logged when no real author is given.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s" %
                                (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    max_coauthors = MAX_COAUTHORS

    if store_on_realauthor:
        if collaboration:
            # Only one collaboration name is stored, even if several exist.
            cname = list(collaboration)[0]
            coauthor_formatted = create_unified_name(cname.lower())
            set_realauthor_data(
                ra_id, "coauthor",
                "%s;;%s" % (author_name, coauthor_formatted))
        elif len(coauthors) <= max_coauthors:
            for coauthor in coauthors:
                coauthor_formatted = create_unified_name(coauthor.lower())
                set_realauthor_data(
                    ra_id, "coauthor",
                    "%s;;%s" % (author_name, coauthor_formatted))
        else:
            # Too many coauthors to store one by one: keep one hash of the
            # whole set so the information is not lost entirely.
            hashvalue = hash_coauthor_set(coauthors)
            bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                                " information, a hash will be stored: %s" %
                                (max_coauthors, hashvalue))
            set_realauthor_data(
                ra_id, "coauthor",
                "%s;;%s" % (author_name, hashvalue))

        return True

    if collaboration:
        return collaboration

    return coauthors
# Example #11
# 0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Collects the affiliation entries of a virtual author from the data set.

    Every entry is a string "<date>;;<author name>;;<affiliation>". The date
    is built from the first two "-"-separated parts of one value returned by
    the ('269', 'c') lookup on the record; a value without a second part gets
    '10' as its second part, and "0000-00" is used when the lookup is empty.
    If no affiliation is found but a date is, a single entry with the
    affiliation "None" is produced; if neither is found, there are no entries.

    If a real author ID is given (ra_id > -1), each entry is stored as
    "affiliation" data on that real author. With the default '-1', nothing is
    stored and the list of entries is returned.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliation entries or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    name_id = -1
    recid = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            recid = record['value']
        elif tag == "orig_authorname_id":
            name_id = record['value']

    names = get_name_and_db_name_strings(name_id)
    display_name = names["name"]
    log_info = bconfig.LOGGER.info

    log_info("| Reading affiliations for va %s: %s  recid %s" %
             (va_id, display_name, recid))

    affiliations = get_field_values_on_condition(recid, ['100', '700'],
                                                 'u', 'a', names["db_name"])
    record_date = get_field_values_on_condition(recid, '269', 'c')

    if len(record_date) > 0:
        date_parts = list(record_date)[0].split("-")
    else:
        date_parts = ['0000', '00']

    # Only the first two parts are used; '10' stands in for a missing second
    # part.
    second_part = date_parts[1] if len(date_parts) > 1 else '10'
    affiliation_date = "%s-%s" % (date_parts[0], second_part)

    has_affiliation = bool(affiliations)

    if has_affiliation:
        log_info("|-> Affiliation found: %s" % (affiliations))
    else:
        log_info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]

    has_date = affiliation_date != "0000-00"

    if has_date:
        log_info("|-> Affiliation date: %s" % (affiliation_date))
    else:
        log_info("|-> No Affiliation Date set to 0000-00")

    aff_collection = []

    if has_affiliation or has_date:
        for affiliation in affiliations:
            fields = (affiliation_date, display_name, affiliation)
            log_info("|--> Found Affiliation: %s;;%s;;%s" % fields)
            aff_collection.append("%s;;%s;;%s" % fields)

    if ra_id > -1:
        for entry in aff_collection:
            set_realauthor_data(ra_id, "affiliation", entry)

        return True

    return aff_collection
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Collects the affiliation entries of a virtual author from the data set.

    The field lookups are done with source "API" when bconfig.STANDALONE or
    the "populate_aid_from_personid" runtime option is set, and with source
    "MEM" otherwise.

    Every entry is a string "<date>;;<author name>;;<affiliation>". The date
    is built from the first two "-"-separated parts of one value returned by
    the ('269', 'c') lookup on the record; a value without a second part gets
    '10' as its second part, and "0000-00" is used when the lookup is empty.
    If no affiliation is found but a date is, a single entry with the
    affiliation "None" is produced; if neither is found, there are no entries.

    If a real author ID is given (ra_id > -1), each entry is stored as
    "affiliation" data on that real author. With the default '-1', nothing is
    stored and the list of entries is returned.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliation entries or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    use_api = (bconfig.STANDALONE
               or dat.RUNTIME_CONFIG["populate_aid_from_personid"])
    src = "API" if use_api else "MEM"

    name_id = -1
    recid = ""

    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            recid = record['value']
        elif tag == "orig_authorname_id":
            name_id = record['value']

    names = get_name_and_db_name_strings(name_id)
    display_name = names["name"]

    bconfig.LOGGER.info("| Reading affiliations for va %s: %s  recid %s"
                        % (va_id, display_name, recid))

    affiliations = get_field_values_on_condition(
        recid, ['100', '700'], 'u', 'a', names["db_name"], source=src)
    record_date = get_field_values_on_condition(recid, '269', 'c',
                                                source=src)

    if len(record_date) > 0:
        date_parts = list(record_date)[0].split("-")
    else:
        date_parts = ['0000', '00']

    # Only the first two parts are used; '10' stands in for a missing second
    # part.
    if len(date_parts) > 1:
        second_part = date_parts[1]
    else:
        second_part = '10'

    affiliation_date = "%s-%s" % (date_parts[0], second_part)

    found_affiliation = bool(affiliations)

    if found_affiliation:
        bconfig.LOGGER.info("|-> Affiliation found: %s" % (affiliations))
    else:
        bconfig.LOGGER.info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]

    found_date = affiliation_date != "0000-00"

    if found_date:
        bconfig.LOGGER.info("|-> Affiliation date: %s" % (affiliation_date))
    else:
        bconfig.LOGGER.info("|-> No Affiliation Date set to 0000-00")

    aff_collection = []

    if found_affiliation or found_date:
        for affiliation in affiliations:
            parts = (affiliation_date, display_name, affiliation)
            bconfig.LOGGER.info("|--> Found Affiliation: %s;;%s;;%s" % parts)
            aff_collection.append("%s;;%s;;%s" % parts)

    if ra_id > -1:
        for entry in aff_collection:
            set_realauthor_data(ra_id, "affiliation", entry)

        return True

    return aff_collection
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the coauthors/collaboration attachment
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', nothing is written and the looked-up values are returned instead.

    A collaboration found on the record takes precedence over the coauthor
    list, both when storing and when returning. If more than MAX_COAUTHORS
    coauthors are found, a single hash of the coauthor set is stored instead
    of one entry per coauthor.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set; otherwise the collaboration lookup result
        if it is non-empty, else the coauthor lookup result
    @rtype: True if ra_id > -1, otherwise whatever
        get_field_values_on_condition returned
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    author_name = authorname_strings["name"]

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s"
                        % (va_id, author_name, bibrec_id))

    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    # -1 is the "no real author" sentinel; 0 counts as a valid real author ID.
    has_realauthor = ra_id > -1

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not has_realauthor:
        # Compare against the sentinel instead of using truthiness:
        # `not ra_id` is False for -1 and True only for the valid ID 0, so
        # the summary would never be logged when no real author is given.
        if collaboration:
            bconfig.LOGGER.info("|-> Collaboration found: %s"
                                % (list(collaboration)[0]))
        else:
            bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    max_coauthors = MAX_COAUTHORS

    if has_realauthor:
        if collaboration:
            # Only one collaboration name is stored, even if several exist.
            cname = list(collaboration)[0]
            coauthor_formatted = create_unified_name(cname.lower())
            set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                % (author_name, coauthor_formatted))
        elif len(coauthors) <= max_coauthors:
            for coauthor in coauthors:
                coauthor_formatted = create_unified_name(coauthor.lower())
                set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                    % (author_name, coauthor_formatted))
        else:
            # Too many coauthors to store one by one: keep one hash of the
            # whole set so the information is not lost entirely.
            hashvalue = hash_coauthor_set(coauthors)
            bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                                " information, a hash will be stored: %s"
                                % (max_coauthors, hashvalue))
            set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                                % (author_name, hashvalue))

        return True

    if collaboration:
        return collaboration

    return coauthors