示例#1
0
    def cluster(self, threshold):

        # Keep the matches sorted in a heap
        heap = []

        for y in xrange(self.clusterDict.getSize()):
            for x in xrange(y):
                if x != y:
                    c = similarity(self.clusterDict.getToken(x).lower(), self.clusterDict.getToken(y).lower())
                    if c >= threshold:
                        heappush(heap, ((1.0 - c), [x, y]))
            QtCore.QCoreApplication.processEvents()

        for i in xrange(self.clusterDict.getSize()):
            word, count = self.clusterDict.getWordAndCount(i)
            if word and count > 1:
                self.clusterBins[self.clusterCount] = [i]
                self.idClusterIndex[i] = self.clusterCount
                self.clusterCount = self.clusterCount + 1

        for i in xrange(len(heap)):
            c, pair = heappop(heap)
            c = 1.0 - c

            try:
                match0 = self.idClusterIndex[pair[0]]
            except:
                match0 = -1

            try:
                match1 = self.idClusterIndex[pair[1]]
            except:
                match1 = -1

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
                self.idClusterIndex[pair[0]] = self.clusterCount
                self.idClusterIndex[pair[1]] = self.clusterCount
                self.clusterCount = self.clusterCount + 1
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.clusterBins[match0].append(pair[1])
                self.idClusterIndex[pair[1]] = match0
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.clusterBins[match1].append(pair[0])
                self.idClusterIndex[pair[0]] = match1
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.clusterBins[match0].extend(self.clusterBins[match1])
                for match in self.clusterBins[match1]:
                    self.idClusterIndex[match] = match0
                del self.clusterBins[match1]
示例#2
0
 def test_correct(self):
     self.assertEqual(similarity(u"K!", u"K!"), 1.0)
     self.assertEqual(similarity(u"BBB", u"AAA"), 0.0)
     self.assertAlmostEqual(similarity(u"ABC", u"ABB"), 0.7, 1)
示例#3
0
    def cluster(self, threshold):

        # Keep the matches sorted in a heap
        heap = []

        for y in range(self.clusterDict.getSize()):
            for x in range(y):
                if x != y:
                    c = similarity(
                        self.clusterDict.getToken(x).lower(),
                        self.clusterDict.getToken(y).lower())
                    if c >= threshold:
                        heappush(heap, ((1.0 - c), [x, y]))
            QtCore.QCoreApplication.processEvents()

        for i in range(self.clusterDict.getSize()):
            word, count = self.clusterDict.getWordAndCount(i)
            if word and count > 1:
                self.clusterBins[self.clusterCount] = [i]
                self.idClusterIndex[i] = self.clusterCount
                self.clusterCount = self.clusterCount + 1

        for i in range(len(heap)):
            c, pair = heappop(heap)
            c = 1.0 - c

            try:
                match0 = self.idClusterIndex[pair[0]]
            except:
                match0 = -1

            try:
                match1 = self.idClusterIndex[pair[1]]
            except:
                match1 = -1

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
                self.idClusterIndex[pair[0]] = self.clusterCount
                self.idClusterIndex[pair[1]] = self.clusterCount
                self.clusterCount = self.clusterCount + 1
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.clusterBins[match0].append(pair[1])
                self.idClusterIndex[pair[1]] = match0
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.clusterBins[match1].append(pair[0])
                self.idClusterIndex[pair[0]] = match1
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.clusterBins[match0].extend(self.clusterBins[match1])
                for match in self.clusterBins[match1]:
                    self.idClusterIndex[match] = match0
                del self.clusterBins[match1]
示例#4
0
 def test_correct(self):
     self.assertEqual(similarity(u"K!", u"K!"), 1.0)
     self.assertEqual(similarity(u"BBB", u"AAA"), 0.0)
     self.assertAlmostEqual(similarity(u"ABC", u"ABB"), 0.7, 1)
示例#5
0
    def cluster(self, threshold, tagger=None):
        # Keep the matches sorted in a heap
        heap = []
        num_files = self.cluster_dict.get_size()

        # 20 evenly spaced indexes of files being clustered, used as checkpoints for every 5% progress
        status_update_steps = ProgressCheckpoints(num_files, 20)

        for y in process_events_iter(range(num_files)):
            token_y = self.cluster_dict.get_token(y).lower()
            for x in range(y):
                if x != y:
                    token_x = self.cluster_dict.get_token(x).lower()
                    c = similarity(token_x, token_y)
                    if c >= threshold:
                        heappush(heap, ((1.0 - c), [x, y]))

            word, count = self.cluster_dict.get_word_and_count(y)
            if word and count > 1:
                self.cluster_bins[self.cluster_count] = [y]
                self.index_id_cluster[y] = self.cluster_count
                self.cluster_count = self.cluster_count + 1

            if tagger and status_update_steps.is_checkpoint(y):
                statusmsg = N_(
                    "Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)"
                )
                mparams = {
                    'step': self.cluster_type.value,
                    'cluster_type': _(self._cluster_type_label()),
                    'update': status_update_steps.progress(y),
                }
                tagger.window.set_statusbar_message(statusmsg, mparams)

        for i in range(len(heap)):
            c, pair = heappop(heap)
            c = 1.0 - c

            try:
                match0 = self.index_id_cluster[pair[0]]
            except BaseException:
                match0 = -1

            try:
                match1 = self.index_id_cluster[pair[1]]
            except BaseException:
                match1 = -1

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.cluster_bins[self.cluster_count] = [pair[0], pair[1]]
                self.index_id_cluster[pair[0]] = self.cluster_count
                self.index_id_cluster[pair[1]] = self.cluster_count
                self.cluster_count = self.cluster_count + 1
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.cluster_bins[match0].append(pair[1])
                self.index_id_cluster[pair[1]] = match0
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.cluster_bins[match1].append(pair[0])
                self.index_id_cluster[pair[0]] = match1
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.cluster_bins[match0].extend(self.cluster_bins[match1])
                for match in self.cluster_bins[match1]:
                    self.index_id_cluster[match] = match0
                del self.cluster_bins[match1]
示例#6
0
文件: cluster.py 项目: weisslj/picard
    def cluster(self, threshold):

        # Keep the matches sorted in a heap
        heap = []

        for y in xrange(self.clusterDict.getSize()):
            for x in xrange(y):
                if x != y:
                    c = similarity(
                        self.clusterDict.getToken(x).lower(),
                        self.clusterDict.getToken(y).lower())
                    #print "'%s' - '%s' = %f" % (
                    #    self.clusterDict.getToken(x).encode('utf-8', 'replace').lower(),
                    #    self.clusterDict.getToken(y).encode('utf-8', 'replace').lower(), c)

                    if c >= threshold:
                        heappush(heap, ((1.0 - c), [x, y]))
            QtCore.QCoreApplication.processEvents()

        for i in xrange(self.clusterDict.getSize()):
            word, count = self.clusterDict.getWordAndCount(i)
            if word and count > 1:
                self.clusterBins[self.clusterCount] = [i]
                self.idClusterIndex[i] = self.clusterCount
                self.clusterCount = self.clusterCount + 1
                #print "init ",
                #self.printCluster(self.clusterCount - 1)

        for i in xrange(len(heap)):
            c, pair = heappop(heap)
            c = 1.0 - c

            try:
                match0 = self.idClusterIndex[pair[0]]
            except:
                match0 = -1

            try:
                match1 = self.idClusterIndex[pair[1]]
            except:
                match1 = -1

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
                self.idClusterIndex[pair[0]] = self.clusterCount
                self.idClusterIndex[pair[1]] = self.clusterCount
                self.clusterCount = self.clusterCount + 1
                #print "new ",
                #self.printCluster(self.clusterCount - 1)
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.clusterBins[match0].append(pair[1])
                self.idClusterIndex[pair[1]] = match0
                #print "add '%s' to cluster " % (self.clusterDict.getWord(pair[0])),
                #self.printCluster(match0)
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.clusterBins[match1].append(pair[0])
                self.idClusterIndex[pair[0]] = match1
                #print "add '%s' to cluster " % (self.clusterDict.getWord(pair[1])),
                #self.printCluster(match0)
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.clusterBins[match0].extend(self.clusterBins[match1])
                for match in self.clusterBins[match1]:
                    self.idClusterIndex[match] = match0
                #print "col cluster %d into cluster" % (match1),
                #self.printCluster(match0)
                del self.clusterBins[match1]

        return self.clusterBins
示例#7
0
 def test_correct(self):
     self.failUnlessEqual(similarity(u"K!", u"K!"), 1.0)
     self.failUnlessEqual(similarity(u"BBB", u"AAA"), 0.0)
     self.failUnlessAlmostEqual(similarity(u"ABC", u"ABB"), 0.7, 1)