def do_upgrade():

    logger = Logger("Rabbit m_name upgrade script")

    warnings.filterwarnings('ignore')
    run_sql(
        "alter table aidPERSONIDPAPERS add `m_name` VARCHAR(255) not null after name"
    )

    run_sql("alter table aidPERSONIDPAPERS add INDEX `m_name-b` (`m_name`)")

    present_bibrefs = set(
        run_sql("select bibref_table, bibref_value from aidPERSONIDPAPERS"))

    total_updates = len(present_bibrefs)

    records_for_rabbit = set()
    for i, bibref in enumerate(present_bibrefs):
        logger.update_status(
            float(i) / total_updates,
            '%s out of %s (%s)' % (str(i), str(total_updates), str(bibref)))
        try:
            name = get_name_by_bibref(bibref)
        except AssertionError as error:
            if "A bibref must have exactly one name" in error.message:
                records_for_rabbit.add(bibref[1])
            else:
                raise error
        else:
            m_name = create_matchable_name(name)
            run_sql(
                "update aidPERSONIDPAPERS set name=%s, m_name=%s where bibref_table=%s "
                "and bibref_value=%s ", (name, m_name, bibref[0], bibref[1]))
Example #2
class BibauthoridBaseMergerTestCase(InvenioTestCase):

    def setUp(self):
        self.verbose = 0
        self.logger = Logger(self.__class__.__name__)  # TODO check
        self.logger.log("Setting up regression tests...")

        self.first_author_name = "Testsurname, Firstperson"
        self.second_author_name = "Testsurname, Secondperson"

        self.cluster = self.first_author_name.split(',')[0].lower()

        self.sigs = list()
        self.author_id_one = get_free_author_id()
        self.author_id_two = self.author_id_one + 1

        self.query = """insert into aidRESULTS
        (personid, bibref_table, bibref_value, bibrec)
        values (%s, %s, %s, %s)"""

        self.merge_func_to_use = merge_dynamic  # TODO: abstract in main


    def merge_func(self):

        @patch('invenio.bibauthorid_merge.get_cluster_names')
        def mocked_merge(mocked_func):
            mocked_func.return_value = self.get_test_cluster_names()
            self.merge_func_to_use()

        mocked_merge()

    def get_test_cluster_names(self):
        '''
        Mock function replacing get_cluster_names. We only need our test names.
        '''
        return set(run_sql("""select personid
            from aidRESULTS where personid like '%s%%'""" % self.cluster))

    def assertMergeResults(self, recs_one, recs_two, non_deterministic=False):
        author_one_res = run_sql("select bibrec from aidPERSONIDPAPERS where personid = %s",
                                 (self.author_id_one,))
        author_one_res = set([rec[0] for rec in author_one_res])

        author_two_res = run_sql("select bibrec from aidPERSONIDPAPERS where personid = %s",
                                 (self.author_id_two,))
        author_two_res = set([rec[0] for rec in author_two_res])

        try:
            self.assertEquals(author_one_res, recs_one)
        except AssertionError as e:
            if non_deterministic:
                # The merge may assign the two result sets to either author id,
                # so accept the swapped assignment as well.
                self.assertEquals(author_two_res, recs_one)
                self.assertEquals(author_one_res, recs_two)
            else:
                raise e
        else:
            self.assertEquals(author_two_res, recs_two)
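A hedged sketch of how a concrete test might build on this base class: seed aidRESULTS with two single-record clusters, run the mocked merge, and compare the resulting record sets. The cluster suffixes, record ids, and the expected outcome are illustrative only; whether additional seeding of aidPERSONIDPAPERS is needed, and which author id receives which record, depends on the merge strategy under test, hence `non_deterministic=True`.

class BibauthoridSimpleMergeTest(BibauthoridBaseMergerTestCase):

    def test_two_disjoint_clusters(self):
        # Two one-paper clusters under the same surname (all values invented).
        run_sql(self.query, ('%s.1' % self.cluster, '100', 1, 101))
        run_sql(self.query, ('%s.2' % self.cluster, '100', 2, 102))

        self.merge_func()

        # Each author is expected to end up with one of the two records.
        self.assertMergeResults(set([101]), set([102]), non_deterministic=True)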
Example #4
    def __init__(self, name, cluster_set=None, storage_dir_override=None):
        self.name = name
        self._f = None
        self._matrix = None
        self._use_temporary_file = True
        self._size = None

        self._storage_dir_override = storage_dir_override

        if cluster_set:
            self._bibmap = dict(
                (b[1], b[0]) for b in enumerate(cluster_set.all_bibs()))
            width = len(self._bibmap)
            self._size = ((width + 1) * width) / 2
        else:
            self._bibmap = dict()

        self._matrix = None
        self.creation_time = get_db_time()

        self.logger = Logger("bib_matrix")
Example #5

def do_upgrade():

    logger = Logger("aidPERSONIDPAPERS_duplicates")

    logger.log('Removing duplicate entries in aidPERSONIDPAPERS...')
    duplicates = 0

    # Loop until clean: each pass deletes only one extra copy per duplicate
    # group, so rows that appear more than twice need several passes.
    while True:
        duplicate_entries = run_sql('select * '
                                    'from aidPERSONIDPAPERS   '
                                    'group by personid, bibref_table, '
                                    'bibref_value, bibrec, flag, '
                                    'lcul, last_updated '
                                    'having count(*) > 1')

        if not duplicate_entries:
            break

        for entry in duplicate_entries:
            run_sql('delete from aidPERSONIDPAPERS '
                    'where personid = %s and '
                    'bibref_table = %s and '
                    'bibref_value = %s and '
                    'bibrec = %s and '
                    'name = %s and '
                    'm_name = %s and '
                    'flag = %s and '
                    'lcul = %s and '
                    'last_updated = %s '
                    'limit 1',
                    entry)

        duplicates += len(duplicate_entries)

    logger.log("""%s duplicate entries removed in
              aidPERSONIDPAPERS.""" % duplicates)
Example #6
class Bib_matrix(object):
    '''
    Contains the sparse matrix and encapsulates it.
    '''
    # please increment this value every time you
    # change the output of the comparison functions
    current_comparison_version = 0

    __special_items = ((None, -3.), ('+', -2.), ('-', -1.))
    special_symbols = dict((x[0], x[1]) for x in __special_items)
    special_numbers = dict((x[1], x[0]) for x in __special_items)

    def __init__(self, name, cluster_set=None, storage_dir_override=None):
        self.name = name
        self._f = None
        self._matrix = None
        self._use_temporary_file = True
        self._size = None

        self._storage_dir_override = storage_dir_override

        if cluster_set:
            self._bibmap = dict(
                (b[1], b[0]) for b in enumerate(cluster_set.all_bibs()))
            width = len(self._bibmap)
            self._size = ((width + 1) * width) / 2
        else:
            self._bibmap = dict()

        self._matrix = None
        self.creation_time = get_db_time()

        self.logger = Logger("bib_matrix")

    def _initialize_matrix(self):
        self.open_h5py_file()
        self._matrix = self._f.create_dataset("array", (self._size, 2), 'f')
        self._matrix[...] = self.special_symbols[None]

    def _resolve_entry(self, bibs):
        first, second = bibs
        first, second = self._bibmap[first], self._bibmap[second]
        if first > second:
            first, second = second, first
        return first + (second * second + second) / 2

    def __setitem__(self, bibs, val):
        entry = self._resolve_entry(bibs)
        try:
            self._matrix[entry] = Bib_matrix.special_symbols.get(val, val)
        except TypeError:
            self._initialize_matrix()
            self._matrix[entry] = Bib_matrix.special_symbols.get(val, val)

    def __getitem__(self, bibs):
        entry = self._resolve_entry(bibs)
        try:
            ret = self._matrix[entry]
        except TypeError:
            self._initialize_matrix()
            ret = self._matrix[entry]
        return Bib_matrix.special_numbers.get(ret[0], tuple(ret))

    def getitem_numeric(self, bibs):
        return self._matrix[self._resolve_entry(bibs)]

    def __contains__(self, bib):
        return bib in self._bibmap

    def get_keys(self):
        return self._bibmap.keys()

    def get_file_dir(self):
        if self._storage_dir_override:
            return self._storage_dir_override

        sub_dir = self.name[:2]
        if not sub_dir:
            sub_dir = "empty_last_name"
        return "%s%s/" % (bconfig.TORTOISE_FILES_PATH, sub_dir)

    def get_map_path(self):
        return "%s%s-bibmap.pickle" % (self.get_file_dir(), self.name)

    def get_matrix_path(self):
        path = "%s%s.hdf5" % (self.get_file_dir(), self.name)
        if self._use_temporary_file:
            path = path + '.tmp'
        return path

    def open_h5py_file(self, create_empty_on_failure=True):
        self._prepare_destination_directory()
        path = self.get_matrix_path()

        try:
            self._f = h5py.File(path)
        except IOError as e:
            # If the file is corrupted h5py fails with IOError.
            # Give it a second try with an empty file before raising.
            if create_empty_on_failure:
                os.remove(path)
                self._f = h5py.File(path)
            else:
                raise e

    def load(self):
        self._use_temporary_file = False
        files_dir = self.get_file_dir()
        if not os.path.isdir(files_dir):
            self._bibmap = dict()
            self._matrix = None
            return False

        try:
            with open(self.get_map_path(), 'r') as fp:
                bibmap_v = load(fp)
            rec_v, self.creation_time, self._bibmap = bibmap_v  # pylint: disable=W0612
            # A negative current_comparison_version can be used to force a
            # recalculation; the check below is currently disabled:
            # if (rec_v != Bib_matrix.current_comparison_version or
            #         Bib_matrix.current_comparison_version < 0):
            #     self._bibmap = dict()
            self._use_temporary_file = False
            if self._f:
                self._f.close()
            self.open_h5py_file(create_empty_on_failure=False)
            self._matrix = self._f['array']

        except (IOError, UnpicklingError, KeyError, OSError) as e:
            # ENOENT simply means the bibmap file has not been created yet,
            # which is fine the very first time bib_matrix runs. Not every
            # caught exception type carries an errno attribute, hence getattr.
            if getattr(e, 'errno', None) == errno.ENOENT:
                self.logger.log("Warning: The bibmap serialized file ",
                                self.get_map_path(),
                                "is not present. Will not load bibmap.")
            else:
                self.logger.log(
                    'Unexpected error occurred while loading bibmap, cleaning... ',
                    str(type(e)), str(e))
            self._bibmap = dict()
            self._matrix = None

            # Remove the serialized bibmap and the matrix file. The matrix
            # path is removed twice on purpose: get_matrix_path() points at
            # the final '.hdf5' file while _use_temporary_file is False and at
            # the '.hdf5.tmp' file once it is set to True, so both variants
            # are cleaned up.
            try:
                os.remove(self.get_map_path())
            except OSError:
                pass
            try:
                os.remove(self.get_matrix_path())
            except OSError:
                pass
            self._use_temporary_file = True
            try:
                os.remove(self.get_matrix_path())
            except OSError:
                pass
            return False
        return True

    def _prepare_destination_directory(self):
        files_dir = self.get_file_dir()
        if not os.path.isdir(files_dir):
            try:
                os.mkdir(files_dir)
            except OSError as e:
                if e.errno == errno.EEXIST or 'file exists' in str(e.strerror).lower():
                    pass
                else:
                    raise e

    def store(self):
        # save only if we are not completely empty:
        if self._bibmap:
            self._prepare_destination_directory()
            bibmap_v = (Bib_matrix.current_comparison_version,
                        self.creation_time, self._bibmap)
            with open(self.get_map_path(), 'w') as fp:
                dump(bibmap_v, fp)

            if not self._matrix:
                self._initialize_matrix()

            if self._f:
                self._f.close()

                if self._use_temporary_file:
                    curpath = self.get_matrix_path()
                    self._use_temporary_file = False
                    finalpath = self.get_matrix_path()
                    try:
                        os.rename(curpath, finalpath)
                    except OSError as e:
                        raise e

    def duplicate_existing(self, name, newname):
        '''
        Make sure the original Bib_matrix has been saved with store() before calling this!
        '''
        self._use_temporary_file = False
        self.name = name
        srcmap = self.get_map_path()
        srcmat = self.get_matrix_path()
        self.name = newname
        dstmap = self.get_map_path()
        dstmat = self.get_matrix_path()

        shutil.copy(srcmap, dstmap)
        shutil.copy(srcmat, dstmat)

    def destroy(self):
        if self._f:
            self._f.close()
        try:
            os.remove(self.get_map_path())
        except OSError:
            pass
        try:
            os.remove(self.get_matrix_path())
        except OSError:
            pass
        self._use_temporary_file = True
        try:
            os.remove(self.get_matrix_path())
        except OSError:
            pass
        self._bibmap = dict()
        self._matrix = None
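The `_resolve_entry` arithmetic above packs the symmetric pairwise-comparison matrix into a flat array of `width * (width + 1) / 2` slots. A minimal standalone sketch of that mapping, with made-up bib keys and no h5py or Invenio dependencies:

def resolve_entry(bibmap, first, second):
    # Same formula as Bib_matrix._resolve_entry, written standalone.
    first, second = bibmap[first], bibmap[second]
    if first > second:
        first, second = second, first
    return first + (second * second + second) // 2

bibmap = dict((bib, idx) for idx, bib in enumerate(['100:1,5', '700:3,5', '100:7,9']))
width = len(bibmap)
size = (width + 1) * width // 2

indices = set(resolve_entry(bibmap, a, b) for a in bibmap for b in bibmap)
assert len(indices) == size      # every unordered pair, including (b, b), has its own slot
assert max(indices) == size - 1  # and the slots exactly fill the flat array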
Example #7

#
# This has been temporarily deprecated, please use schedule_workers from general utils instead.
#

import re
import os
import sys
from itertools import dropwhile, chain
from invenio.bibauthorid_general_utils import print_tortoise_memory_log
from invenio import bibauthorid_config as bconfig
from invenio.bibauthorid_general_utils import is_eq
from invenio.bibauthorid_logutils import Logger

logger = Logger("scheduler")

# python2.4 compatibility
from invenio.bibauthorid_general_utils import bai_all as all


def to_number(stry):
    return int(re.sub("\D", "", stry))


def dict_by_file(fpath):
    fp = open(fpath)
    content = fp.read()
    fp.close()
    return dict(x.split(':') for x in content.split("\n")[:-1])
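`dict_by_file` expects one `key:value` pair per line and a trailing newline (the `[:-1]` drops the empty chunk after the final `"\n"`). A hedged usage sketch with an invented file path and contents:

with open('/tmp/scheduler_example', 'w') as fp:
    fp.write("pid:1234\nrss:5678\n")

info = dict_by_file('/tmp/scheduler_example')
assert info == {'pid': '1234', 'rss': '5678'}
assert to_number(info['rss']) == 5678  # to_number strips every non-digit character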
Example #8
def rabbit(bibrecs=None,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):

    logger = Logger("Rabbit")

    if verbose:
        logger.verbose = True

    if not bibrecs:
        logger.log("Running on all records")
    else:
        logger.log("Running on %s " % (str(bibrecs)))

    populate_mnames_pids_cache()

    global M_NAME_PIDS_CACHE

    memoized_compare_names = memoized(comp_names)
    compare_names = lambda x, y: memoized_compare_names(*sorted((x, y)))

    def find_pids_by_matchable_name_with_cache(matchable_name):
        try:
            matched_pids = [M_NAME_PIDS_CACHE[matchable_name]]
        except KeyError:
            matched_pids = get_authors_by_name(matchable_name,
                                               use_matchable_name=True)
            if matched_pids:
                M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0]
        return matched_pids

    if USE_EXT_IDS:

        def get_matched_pids_by_external_ids(sig, rec, pids_having_rec):
            '''
            This function returns all the matched pids after iterating
            through all available external IDs of the system.
            '''
            for get_external_id_of_signature in external_id_getters:
                external_id = get_external_id_of_signature(sig + (rec, ))
                if external_id:
                    matched_pids = list(
                        get_author_by_external_id(external_id[0]))
                    if matched_pids and int(
                            matched_pids[0][0]) in pids_having_rec:
                        matched_pids = list()
                    return matched_pids

    threshold = 0.8

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    bibrecs = list(bibrecs)
    for idx, rec in enumerate(bibrecs):

        logger.log("Considering %s" % str(rec))

        if idx % 100 == 0:
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if idx % 1000 == 0:
            destroy_partial_marc_caches()
            populate_partial_marc_caches(bibrecs[idx:idx + 1000])

            logger.log(
                float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs)))

        if rec in deleted:
            remove_papers([rec])
            continue

        author_refs = get_author_refs_of_paper(rec)
        coauthor_refs = get_coauthor_refs_of_paper(rec)

        markrefs = frozenset(
            chain(izip(cycle([100]), imap(itemgetter(0), author_refs)),
                  izip(cycle([700]), imap(itemgetter(0), coauthor_refs))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, get_name_by_bibref(new)) for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logger.log(" - Deleted signatures: %s" % str(old_signatures))
        logger.log(" - Added signatures: %s" % str(new_signatures))
        logger.log(" - Matrix: %s" % str(matrix))

        # best_match: (new_signature, old_signature) pairs scoring above the threshold
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logger.log(" - Best match: %s " % str(best_match))

        for new, old in best_match:
            logger.log("  -  -  Moving signature: %s on %s to %s as %s" %
                       (old, rec, new, new_signatures_names[new]))
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)
        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        # Convert the list slice to a tuple so the membership test against the
        # tuple-based old_signatures actually matches.
        remaining_personid_rows = [x for x in personid_rows
                                   if tuple(x[1:3]) in old_signatures]

        pids_having_rec = set([int(row[0]) for row in remaining_personid_rows])
        logger.log(" - Not matched: %s" % str(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matchable_name = create_matchable_name(name)
            matched_pids = list()
            if USE_EXT_IDS:
                matched_pids = get_matched_pids_by_external_ids(
                    sig, rec, pids_having_rec)

                if matched_pids:
                    add_signature(list(sig) + [rec],
                                  name,
                                  matched_pids[0][0],
                                  m_name=matchable_name)
                    M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0]
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_matchable_name_with_cache(
                matchable_name)
            if not matched_pids:
                for matching_function in M_NAME_FUNCTIONS[1:]:
                    matchable_name = matching_function(name)
                    matched_pids = find_pids_by_matchable_name_with_cache(
                        matchable_name)
                    if matched_pids:
                        break

            matched_pids = [p for p in matched_pids if int(p) not in used_pids]

            best_matched_pid = None
            for matched_pid in matched_pids:
                # Because of wrongly labeled data in the db, every possible
                # choice has to be checked: if a coauthor whose signature was
                # already processed has claimed, in the past, one of the
                # signatures of the author currently being considered, the
                # algorithm would otherwise treat two signatures as belonging
                # to the same person and create an unnecessary new profile.
                if not int(matched_pid) in pids_having_rec:
                    best_matched_pid = matched_pid
                    break

            if not best_matched_pid:
                new_pid = new_person_from_signature(
                    list(sig) + [rec], name, matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = new_pid
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec],
                              name,
                              best_matched_pid,
                              m_name=matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid
                used_pids.add(best_matched_pid)
                updated_pids.add(best_matched_pid)
                pids_having_rec.add(best_matched_pid)

        logger.log('Finished with %s' % str(rec))

    logger.update_status_final()

    destroy_partial_marc_caches()

    if personids_to_update_extids:
        updated_pids |= set(personids_to_update_extids)
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS,
            force_cache_tables=True)

    destroy_partial_marc_caches()
    destroy_mnames_pids_cache()

    remove_empty_authors()

    task_update_progress("Done!")
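A hedged sketch of how rabbit is typically invoked: with no `bibrecs` it walks every valid record, while an explicit list restricts matching to those records (the ids below are invented).

rabbit(verbose=True)                           # full pass over all valid records
rabbit(bibrecs=[12, 57, 981], verbose=True)    # incremental pass over a few records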
Example #9
from invenio.bibauthorid_backinterface import update_canonical_names_of_authors
from invenio.bibauthorid_backinterface import get_cluster_names
from invenio.bibauthorid_backinterface import get_clusters_by_surname
from invenio.bibauthorid_backinterface import get_author_info_of_confirmed_paper
from invenio.bibauthorid_backinterface import get_author_and_status_of_confirmed_paper
from invenio.bibauthorid_backinterface import move_signature
from invenio.bibauthorid_backinterface import get_claimed_papers_of_author
from invenio.bibauthorid_backinterface import get_free_author_id
from invenio.bibauthorid_backinterface import get_signatures_of_paper_and_author
from invenio.bibauthorid_backinterface import get_free_author_ids as backinterface_get_free_pids
from invenio.bibauthorid_backinterface import get_ordered_author_and_status_of_signature
from invenio.bibauthorid_backinterface import remove_empty_authors
from invenio.bibauthorid_backinterface import get_paper_to_author_and_status_mapping
from invenio.bibauthorid_backinterface import get_authors_by_surname

logger = Logger("merge")


def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant than merge_static but much slower. It will
              have to be improved before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
Example #10
from invenio.bibauthorid_dbinterface import get_name_by_bibref
from invenio.bibauthorid_dbinterface import get_grouped_records
from invenio.bibauthorid_dbinterface import get_authors_of_paper
from invenio.bibauthorid_dbinterface import get_collaborations_for_paper
from invenio.bibauthorid_dbinterface import get_resolved_affiliation
from invenio.bibauthorid_backinterface import get_keywords_for_paper
from invenio.bibrank_citation_searcher import get_cited_by, get_refers_to
# metadata_comparison_print is commented out everywhere to improve performance;
# the import and calls are left in place to make future debugging easier.
from invenio.bibauthorid_logutils import Logger
import gc
import random

CFG_MEMOIZE_DICT_SIZE = 1000000

logger = Logger('metadata_comparison',
                verbose=bconfig.DEBUG_METADATA_COMPARISON_OUTPUT)

# This module is not thread safe!
# Be sure to use processes instead of
# threads if you need parallel
# computation!

use_refrec = itemgetter(slice(None))
use_ref = itemgetter(0, 1)
use_rec = itemgetter(2)
use_string = lambda x: x

CACHES = list()


def create_new_cache():
Example #11
from math import sqrt
from invenio.textutils import translate_to_ascii as original_translate_to_ascii

translate_to_ascii = memoized(original_translate_to_ascii)
SQRT2 = sqrt(2)

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

from Levenshtein import distance

logger = Logger("name comparison",
                verbose=bconfig.DEBUG_NAME_COMPARISON_OUTPUT)

artifact_removal = re.compile("[^a-zA-Z0-9]")
surname_cleaning = re.compile("-([a-z])")
name_additions_chars = re.compile("\([.]*[^\)]*\)")

name_separators = bconfig.NAMES_SEPARATOR_CHARACTER_LIST
if name_separators == "-1":
    name_separators = ',;.=\-\(\)'
substitution_regexp = re.compile('[%s]' % (name_separators))
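# Hedged illustration (invented inputs) of the cleaning patterns defined above.
# substitution_regexp is left out because its character class depends on
# bconfig.NAMES_SEPARATOR_CHARACTER_LIST.
# artifact_removal keeps only ASCII letters and digits:
assert artifact_removal.sub('', "O'Neil-Smith, J.W.") == 'ONeilSmithJW'
# name_additions_chars strips parenthesised additions such as "(Jr.)":
assert name_additions_chars.sub('', 'Smith, John (Jr.)').strip() == 'Smith, John'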

# Gender names and name-variation files are loaded upon module import to improve performance.


@memoized
def split_name_parts(name_string,
Example #12

import warnings
from invenio.dbquery import run_sql
from invenio.bibauthorid_logutils import Logger
from invenio.bibauthorid_dbinterface import get_name_by_bibref, create_matchable_name
from invenio.bibauthorid_rabbit import rabbit

depends_on = ['invenio_2013_11_28_bibauthorid_search_engine_column_changes']

Logger.override_verbosity(True)


def info():
    return "Updates the columns of aidPERSONIDPAPERS, adds the rabbit matchable name, assigns default value to columns."


def do_upgrade():

    logger = Logger("Rabbit m_name upgrade script")

    warnings.filterwarnings('ignore')
    run_sql(
        "alter table aidPERSONIDPAPERS add `m_name` VARCHAR(255) not null after name"
    )
Example #13
# import pyximport; pyximport.install()
from invenio.bibauthorid_bib_matrix import Bib_matrix

if bconfig.DEBUG_CHECKS:

    def _debug_is_eq_v(vl1, vl2):
        if isinstance(vl1, str) and isinstance(vl2, str):
            return vl1 == vl2

        if isinstance(vl1, tuple) and isinstance(vl2, tuple):
            return is_eq(vl1[0], vl2[0]) and is_eq(vl1[1], vl2[1])

        return False


logger = Logger("prob_matrix")


class ProbabilityMatrix(object):
    '''
    This class contains and maintains the comparison
    between all virtual authors. It is able to write
    and read from the database and update the results.
    '''
    def __init__(self, name):
        self._bib_matrix = Bib_matrix(name)

    def load(self, load_map=True, load_matrix=True):
        logger.update_status(0., "Loading probability matrix...")
        self._bib_matrix.load()
        logger.update_status_final("Probability matrix loaded.")
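A hedged usage sketch: the matrix name below is invented, and whether a stored matrix actually exists for it depends on earlier tortoise runs.

pm = ProbabilityMatrix('testsurname')
pm.load(load_map=True, load_matrix=True)  # delegates to the underlying Bib_matrix.load()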
Example #14
import gc

import cPickle

SP_NUMBERS = Bib_matrix.special_numbers
SP_SYMBOLS = Bib_matrix.special_symbols
SP_CONFIRM = Bib_matrix.special_symbols['+']
SP_QUARREL = Bib_matrix.special_symbols['-']

eps = 0.01
edge_cut_prob = ''
wedge_thrsh = ''
h5file = None

logger = Logger("wedge", verbose=bconfig.DEBUG_WEDGE_OUTPUT)

import os
PID = lambda: str(os.getpid())

import pyximport
pyximport.install()
from invenio.bibauthorid_meld_edges import meld_edges


def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
Example #15

from itertools import chain, groupby, izip, cycle
from operator import itemgetter
from invenio.bibauthorid_matrix_optimization import maximized_mapping
from invenio.bibauthorid_backinterface import save_cluster
from invenio.bibauthorid_backinterface import get_confirmed_papers_of_authors
from invenio.bibauthorid_backinterface import get_bib10x, get_bib70x
from invenio.bibauthorid_backinterface import get_author_to_confirmed_names_mapping
from invenio.bibauthorid_backinterface import get_signatures_from_bibrefs
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_logutils import Logger

logger = Logger("cluster_set")


class Blob(object):
    def __init__(self, personid_records):
        '''
        @param personid_records:
            A list of tuples: (personid, bibrefrec, flag).
            Notice that all bibrefrecs should be the same
            since the Blob represents only one bibrefrec.
        '''
        self.bib = personid_records[0][1]
        assert all(p[1] == self.bib for p in personid_records), \
            "All cluster sets should share the bibrefrec"
        self.claimed = set()
        self.assigned = set()
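A hedged construction sketch for Blob: every tuple must share the same bibrefrec, exactly as the assertion in `__init__` demands. All ids and the bibrefrec below are invented.

records = [(101, ('100', 55, 1203), 2),
           (102, ('100', 55, 1203), 0)]
blob = Blob(records)
assert blob.bib == ('100', 55, 1203)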
Example #16
from invenio.bibauthorid_logutils import Logger

from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_marktables
from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_personid
from invenio.bibauthorid_wedge import wedge
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_backinterface import empty_tortoise_results_table
from invenio.bibauthorid_backinterface import remove_clusters_by_name
from invenio.bibauthorid_prob_matrix import prepare_matrix
# Scheduler is [temporarily] deprecated in favour of the much simpler schedule_workers
# from invenio.bibauthorid_scheduler import schedule, matrix_coefs

from invenio.bibauthorid_general_utils import schedule_workers

logger = Logger("tortoise")
'''
    There are three main entry points to tortoise

    i) tortoise
        Performs disambiguation iteration.
        The argument pure indicates whether to use
        the claims and the rejections or not.
        Use pure=True only to test the accuracy of tortoise.

    ii) tortoise_from_scratch
        NOT RECOMMENDED!
        Use this function only if you have just
        installed invenio and this is your first
        disambiguation or if personid is broken.