Пример #1
0
    def recalculate(self, cluster_set):
        '''
        Rebuilds the probability matrix for the given cluster set.

        The current matrix is stored and duplicated under the name
        "<name>copy". When that copy loads and __get_up_to_date_bibs
        returns a truthy result for it, values found in the copy are
        reused instead of being recomputed. The current matrix is then
        destroyed and replaced by a new one created from cluster_set,
        which is filled with one value for every pair of bibs taken from
        two different clusters that do not hate each other.

        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        @raise Exception: wraps any error raised while filling the
        matrix; the message carries the last value handled and the
        original error.
        '''
        last_cleaned = 0
        self._bib_matrix.store()

        # Bound before the try so the IOError handler can always refer to
        # it, even when the constructor itself is what failed.
        old_matrix = None
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            have_cached_bibs = bool(self.__get_up_to_date_bibs(old_matrix))
        except IOError:
            if old_matrix is not None:
                old_matrix.destroy()
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        # Number of unordered pairs among ncl bibs. It is only used as the
        # denominator of the progress ratio, hence it must never be zero.
        expected = (ncl * (ncl - 1)) // 2
        if expected == 0:
            expected = 1

        # Bound before the loop so the error report below can always format
        # it, even when the failure happens before the first assignment.
        val = None
        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:

                # Report progress on the first pass and then roughly every
                # 100000 processed pairs.
                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

                # Force a garbage collection pass every 20000000 computed
                # comparisons.
                # NOTE(review): a clear_comparison_caches() call was
                # commented out at this point in the original code.
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    # The id() ordering makes each unordered pair of
                    # distinct clusters be visited exactly once.
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                    except KeyError:
                                        # Not in the old matrix: computed
                                        # (once) just below.
                                        val = None
                                    else:
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1, bib2))
                                    # A missing or falsy cached value has
                                    # to be computed from scratch.
                                    if not val:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val

        except Exception as e:
            raise Exception("""Error happened in prob_matrix.recalculate with
            val:%s
            original_exception: %s
            """ % (str(val), str(e)))
class TestBibMatrix(InvenioTestCase):
    '''
    Tests for Bib_matrix: entry resolution, symmetric storage, storing and
    duplicating matrices, and the special '+', '-' and None values.
    '''

    def setUp(self):
        """
        Set up an empty bibmatrix and one filled with ten clusters of 10 elements each.
        """
        self.bm = Bib_matrix('testname', storage_dir_override='/tmp/')
        self.css = ClusterSet()
        self.css.clusters = [ClusterSet.Cluster(range(i * 10, i * 10 + 10))
                             for i in range(10)]
        self.css.update_bibs()
        self.bmcs0 = Bib_matrix('testname2', self.css,
                                storage_dir_override='/tmp/')

    def tearDown(self):
        '''
        Destroy both matrices created in setUp.
        '''
        self.bm.destroy()
        self.bmcs0.destroy()

    def test_resolve_entry_simmetry(self):
        '''
        Bib matrix stores a triangular matrix. Entries should be symmetric.
        '''
        for j in range(100):
            for k in range(100):
                self.assertEqual(self.bmcs0._resolve_entry((j, k)),
                                 self.bmcs0._resolve_entry((k, j)))

    def test_resolve_entry_unicity(self):
        '''
        resolve_entry should produce unique indexes for any couple of values
        '''
        ntests = 30
        # Resolve every pair once and remember which pairs share an index,
        # instead of resolving all the other pairs again for each pair.
        resolved = {}
        pairs_by_index = {}
        for k in range(ntests):
            for z in range(ntests):
                val = self.bmcs0._resolve_entry((k, z))
                resolved[(k, z)] = val
                pairs_by_index.setdefault(val, set()).add((k, z))

        for k in range(ntests):
            for z in range(ntests):
                val = resolved[(k, z)]
                # Only the pair itself and its mirror may map to this index.
                clashing = pairs_by_index[val] - set([(k, z), (z, k)])
                self.assertFalse(clashing,
                                 str(val) + ' is in, from ' + str((k, z)))

    def test_matrix_content(self):
        '''
        The matrix should be symmetric, and values should be preserved
        '''
        for i in range(100):
            for j in range(i + 1):
                self.bmcs0[i, j] = (i, j)

        for i in range(100):
            for j in range(i + 1, 100):
                # Here i < j always, so this entry was written through its
                # mirror as (j, i).
                val = self.bmcs0[i, j]
                self.assertEqual(val[0], j)
                self.assertEqual(val[1], i)

    def test_create_empty_matrix(self):
        """
        All elements should be None
        """
        # Only the last cluster (bibs 90 to 99) is sampled.
        for i in range(9, 10):
            for j in range(i * 10, i * 10 + 10):
                for k in range(i * 10, i * 10 + 10):
                    self.assertTrue(self.bmcs0[(j, k)] is None)

    @nottest
    def FIXME_1678_test_save_matrix(self):
        '''
        Matrix should save, be loadable, and stay equal to a newly loaded one on the same files
        '''
        self.bmcs0.store()
        loaded = Bib_matrix('testname2', storage_dir_override='/tmp/')
        self.assertTrue(loaded.load())
        bmcs0 = self.bmcs0
        for i in range(100):
            for j in range(100):
                self.assertEqual(bmcs0[i, j], loaded[i, j])

    def test_duplicate_existing(self):
        '''
        A stored matrix duplicated under a new name should load and hold
        the same content as the matrix it was duplicated from.
        '''
        self.bmcs0.store()
        self.bm.duplicate_existing('testname2', 'testnameduplicate')
        self.assertTrue(self.bmcs0.load())
        self.assertTrue(self.bm.load())
        bmcs0 = self.bmcs0
        bm = self.bm
        for i in range(100):
            for j in range(100):
                self.assertEqual(bmcs0[i, j], bm[i, j])

    def test_special_items(self):
        '''
        The special values '+', '-' and None should be returned as stored.
        '''
        self.bmcs0[0, 0] = '+'
        self.bmcs0[0, 1] = '-'
        self.bmcs0[0, 2] = None
        self.assertEqual(self.bmcs0[0, 0], '+')
        self.assertEqual(self.bmcs0[0, 1], '-')
        self.assertTrue(self.bmcs0[0, 2] is None)

    def test_getitem_numeric(self):
        '''
        getitem_numeric should report the special values numerically: the
        first element of its result is -2 for '+', -1 for '-' and -3 for
        None.
        '''
        self.bmcs0[0, 0] = '+'
        self.bmcs0[0, 1] = '-'
        self.bmcs0[0, 2] = None
        self.assertEqual(self.bmcs0.getitem_numeric([0, 0])[0], -2)
        self.assertEqual(self.bmcs0.getitem_numeric([0, 1])[0], -1)
        self.assertEqual(self.bmcs0.getitem_numeric([0, 2])[0], -3)
    def recalculate(self, cluster_set):
        '''
        Rebuilds the probability matrix for the given cluster set.

        The current matrix is stored and duplicated under the name
        "<name>copy". When that copy loads and __get_up_to_date_bibs
        returns a truthy result for it, values found in the copy are
        reused instead of being recomputed. The current matrix is then
        destroyed and replaced by a new one created from cluster_set,
        which is filled with one value for every pair of bibs taken from
        two different clusters that do not hate each other.

        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        @raise Exception: wraps any error raised while filling the
        matrix; the message carries the last value handled and the
        original error.
        '''
        last_cleaned = 0
        self._bib_matrix.store()

        # Bound before the try so the IOError handler can always refer to
        # it, even when the constructor itself is what failed.
        old_matrix = None
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            have_cached_bibs = bool(self.__get_up_to_date_bibs(old_matrix))
        except IOError:
            if old_matrix is not None:
                old_matrix.destroy()
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        # Number of unordered pairs among ncl bibs. It is only used as the
        # denominator of the progress ratio, hence it must never be zero.
        expected = (ncl * (ncl - 1)) // 2
        if expected == 0:
            expected = 1

        # Bound before the loop so the error report below can always format
        # it, even when the failure happens before the first assignment.
        val = None
        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:

                # Report progress on the first pass and then roughly every
                # 100000 processed pairs.
                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

                # Force a garbage collection pass every 20000000 computed
                # comparisons.
                # NOTE(review): a clear_comparison_caches() call was
                # commented out at this point in the original code.
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    # The id() ordering makes each unordered pair of
                    # distinct clusters be visited exactly once.
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                    except KeyError:
                                        # Not in the old matrix: computed
                                        # (once) just below.
                                        val = None
                                    else:
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1, bib2))
                                    # A missing or falsy cached value has
                                    # to be computed from scratch.
                                    if not val:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val

        except Exception as e:
            raise Exception("""Error happened in prob_matrix.recalculate with
            val:%s
            original_exception: %s
            """%(str(val),str(e)))