def recalculate(self, cluster_set):
    '''
    Rebuilds the probability matrix for the given cluster set.

    The current matrix is put aside and a new one is created from
    cluster_set. A value is then stored for every pair of bibs that
    come from two different clusters which do not hate each other.
    The old value is reused when both bibs are among those returned
    by __get_up_to_date_bibs() and the old matrix holds a truthy
    entry for the pair; otherwise compare_bibrefrecs is called.
    @param cluster_set: A cluster set object, used to initialize
    the matrix.
    '''
    previous = self._bib_matrix
    fresh_bibs = self.__get_up_to_date_bibs()
    can_reuse = bool(fresh_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)

    # Number of unordered bib pairs, used only to scale the progress
    # report; forced to 1 when it would be 0 so it can be a divisor.
    total_bibs = cluster_set.num_all_bibs
    expected = ((total_bibs * (total_bibs - 1)) / 2) or 1

    computed = reused = 0
    cleaned_at = 0
    for left in cluster_set.clusters:
        update_status((float(reused) + computed) / expected,
                      "Prob matrix: calc %d, opti %d." % (computed, reused))

        # Checked once per outer cluster: drop the comparison caches
        # when more than 2000000 new comparisons were made since the
        # previous cleanup.
        if computed - cleaned_at > 2000000:
            clear_comparison_caches()
            cleaned_at = computed

        for right in cluster_set.clusters:
            # The id() ordering makes every pair of clusters show up
            # in one orientation only.
            if id(left) >= id(right) or left.hates(right):
                continue

            for bib1 in left.bibs:
                for bib2 in right.bibs:
                    both_fresh = (can_reuse
                                  and bib1 in fresh_bibs
                                  and bib2 in fresh_bibs)
                    val = previous[bib1, bib2] if both_fresh else None

                    if val:
                        reused += 1
                        if bconfig.DEBUG_CHECKS:
                            assert _debug_is_eq_v(
                                val,
                                compare_bibrefrecs(bib1, bib2))
                    else:
                        # Nothing reusable: compute the value anew.
                        computed += 1
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))
Exemplo n.º 2
0
    def recalculate(self, cluster_set):
        '''
        Recomputes the probability matrix from scratch for cluster_set.

        The matrix held so far serves as a source of already known
        values: an entry is taken over when both bibs of a pair are
        reported by __get_up_to_date_bibs() and the old entry is
        truthy. Every other pair is evaluated with compare_bibrefrecs.
        Only pairs of bibs from two distinct, non-hating clusters are
        filled in.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        stale_matrix = self._bib_matrix
        known_bibs = self.__get_up_to_date_bibs()
        reusable = bool(known_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        bib_count = cluster_set.num_all_bibs
        pair_count = (bib_count * (bib_count - 1)) / 2
        # Used as a divisor for the progress ratio, so it must not be 0.
        expected = 1 if pair_count == 0 else pair_count

        n_calc = 0
        n_opti = 0
        last_cleanup = 0
        for cluster in cluster_set.clusters:
            progress = (float(n_opti) + n_calc) / expected
            update_status(progress, "Prob matrix: calc %d, opti %d." % (n_calc, n_opti))

            # Clean the comparison caches once enough new comparisons
            # have piled up since the last cleanup.
            if n_calc - last_cleanup > 2000000:
                clear_comparison_caches()
                last_cleanup = n_calc

            for other in cluster_set.clusters:
                # Handle each pair of clusters in one orientation only.
                if not id(cluster) < id(other):
                    continue
                if cluster.hates(other):
                    continue

                for bib1 in cluster.bibs:
                    for bib2 in other.bibs:
                        val = None
                        if reusable and bib1 in known_bibs and bib2 in known_bibs:
                            val = stale_matrix[bib1, bib2]

                        if not val:
                            n_calc += 1
                            val = compare_bibrefrecs(bib1, bib2)
                        else:
                            n_opti += 1
                            if bconfig.DEBUG_CHECKS:
                                assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))

                        self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (n_calc, n_opti))
Exemplo n.º 3
0
    def __init__(self,
                 cluster_set,
                 last_name="",
                 cached=None,
                 use_cache=False,
                 save_cache=False):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        @param last_name: A string which defines the current cluster
        of names. It is used only if use_cache or save_cache is true.
        @param cached: An iterable with the bibs, which are not touched
        since last save. The bibs must be hashable. Defaults to no
        cached bibs.
        @param use_cache: If true and the probability table exists,
        the old matrix is loaded under last_name and its values are
        reused for pairs of cached bibs.
        @param save_cache: If true, the resulting matrix is stored
        under last_name; the probability table is created first when
        it does not exist.
        @raise AssertionError: if cached bibs are given but the old
        matrix was not loaded (use_cache is false or the probability
        table does not exist).
        '''
        self._bib_matrix = self.bib_matrix(cluster_set)

        # A frozen set gives constant-time membership tests in the
        # innermost loop and avoids a shared mutable default argument.
        cached_bibs = frozenset(cached or ())

        old_matrix = self.bib_matrix()
        if use_cache and probability_table_exists():
            old_matrix.load(last_name)
        elif cached_bibs:
            raise AssertionError("You cannot have cached "
                                 "results and empty table!")

        # Both orientations of every pair of distinct clusters are
        # visited, so a value is stored for (bib1, bib2) and (bib2, bib1).
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl1 is cl2 or cl1.hates(cl2):
                    continue

                for bib1 in cl1.bibs:
                    for bib2 in cl2.bibs:
                        val = None
                        if bib1 in cached_bibs and bib2 in cached_bibs:
                            val = old_matrix[bib1, bib2]
                        # Compute when the pair is not cached or the
                        # old matrix has no entry for it.
                        if val is None:
                            val = compare_bibrefrecs(bib1, bib2)
                        self._bib_matrix[bib1, bib2] = val

        if save_cache:
            if not probability_table_exists():
                create_probability_table()
            self._bib_matrix.store(last_name)
    def __init__(self, cluster_set, last_name="", cached=None,
                 use_cache=False, save_cache=False):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        @param last_name: A string which defines the current cluster
        of names. It is used only if use_cache or save_cache is true.
        @param cached: An iterable with the bibs, which are not touched
        since last save. The bibs must be hashable. Defaults to no
        cached bibs.
        @param use_cache: If true and the probability table exists,
        the old matrix is loaded under last_name and its values are
        reused for pairs of cached bibs.
        @param save_cache: If true, the resulting matrix is stored
        under last_name; the probability table is created first when
        it does not exist.
        @raise AssertionError: if cached bibs are given but the old
        matrix was not loaded (use_cache is false or the probability
        table does not exist).
        '''
        self._bib_matrix = self.bib_matrix(cluster_set)

        # Build the lookup set once: membership is tested for every
        # pair of bibs, and a list would be scanned linearly each time.
        # Using None as the default also avoids a shared mutable default.
        cached_bibs = frozenset(cached or ())

        old_matrix = self.bib_matrix()
        if use_cache and probability_table_exists():
            old_matrix.load(last_name)
        elif cached_bibs:
            raise AssertionError("You cannot have cached "
                                 "results and empty table!")

        # Every ordered pair of distinct clusters is processed, hence
        # each pair of bibs is stored in both orientations.
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl1 is cl2 or cl1.hates(cl2):
                    continue

                for bib1 in cl1.bibs:
                    for bib2 in cl2.bibs:
                        val = None
                        if bib1 in cached_bibs and bib2 in cached_bibs:
                            val = old_matrix[bib1, bib2]
                        # Fall back to a new comparison when there is
                        # no usable old value for this pair.
                        if val is None:
                            val = compare_bibrefrecs(bib1, bib2)
                        self._bib_matrix[bib1, bib2] = val

        if save_cache:
            if not probability_table_exists():
                create_probability_table()
            self._bib_matrix.store(last_name)
Exemplo n.º 5
0
    def __init__(self, cluster_set, use_cache=False, save_cache=False):
        '''
        Builds the probability matrix for cluster_set.

        A value is stored for every pair of bibs taken from two
        distinct clusters that do not hate each other.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        @param use_cache: If true, the matrix stored under
        cluster_set.last_name is loaded. When loading succeeds, its
        truthy values are reused for pairs whose bibs both pass
        filter_modified_record_ids.
        @param save_cache: If true, the finished matrix is stored
        under cluster_set.last_name together with the time taken
        from get_sql_time() before the computation started.
        '''
        self._bib_matrix = bib_matrix(cluster_set)
        previous = bib_matrix()

        # Number of unordered bib pairs, used only to scale the progress
        # report; forced to 1 when it would be 0 so it can be a divisor.
        total_bibs = sum(len(cl.bibs) for cl in cluster_set.clusters)
        expected = ((total_bibs * (total_bibs - 1)) / 2) or 1

        reusable_bibs = set()
        if use_cache and previous.load(cluster_set.last_name):
            candidate_bibs = previous.get_keys()
            reusable_bibs = set(filter_modified_record_ids(
                                    candidate_bibs,
                                    previous.creation_time))

        if save_cache:
            creation_time = get_sql_time()

        computed = reused = 0
        for left in cluster_set.clusters:
            progress = (float(reused) + computed) / expected
            update_status(progress, "Prob matrix: calc %d, opti %d." % (computed, reused))

            for right in cluster_set.clusters:
                # The id() ordering makes every pair of clusters show
                # up in one orientation only.
                if id(left) >= id(right) or left.hates(right):
                    continue

                for bib1 in left.bibs:
                    for bib2 in right.bibs:
                        if bib1 in reusable_bibs and bib2 in reusable_bibs:
                            val = previous[bib1, bib2]
                        else:
                            val = None

                        if val:
                            reused += 1
                            if bconfig.DEBUG_CHECKS:
                                assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                        else:
                            computed += 1
                            # Clean the comparison caches every
                            # 10000000 new comparisons.
                            if computed % 10000000 == 0:
                                clear_comparison_caches()
                            val = compare_bibrefrecs(bib1, bib2)

                        self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()

        if save_cache:
            update_status(1., "saving...")
            self._bib_matrix.store(cluster_set.last_name, creation_time)

        update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))