Exemplo n.º 1
0
    def reduce(self, uid, values):
        """Recompute one cluster's center and its within-cluster distance.

        Args:
            uid: Identifier of the cluster; stored on the new ``Cluster`` and
                used to build both output keys.
            values: Iterable of raw records; each one is passed to
                ``Document``. May be a list or a one-shot iterable.

        Emits:
            ``"cluster<uid>"`` with ``str(cluster)`` and ``"distance<uid>"``
            with ``"<uid>|<accumulated distance>"``.
        """
        c = Cluster()
        c.uid = uid
        sqdist = 0.0

        # Sum the tf-idf weights of all member documents, token by token.
        instances = []
        for value in values:
            doc = Document(value)
            instances.append(doc)
            for token, weight in doc.tfidf.items():
                # Direct membership test on the mapping: no temporary key
                # list and no local named after the ``bool`` builtin.
                if token in c.tfidf:
                    c.tfidf[token] += weight
                else:
                    c.tfidf[token] = weight

        # Count the parsed documents instead of calling len(values) so that
        # a generator-style ``values`` works as well as a list.
        size = float(len(instances))
        for token in c.tfidf:
            c.tfidf[token] = c.tfidf[token] / size

        # Accumulate the distance of every member to the new center.
        # NOTE(review): whether this is a squared distance depends on the
        # default behaviour of MathUtil.compute_distance - confirm.
        for instance in instances:
            sqdist += MathUtil.compute_distance(map1=c.tfidf,
                                                map2=instance.tfidf)

        # Output the cluster center into file: clusteri
        self.emit("cluster" + str(c.uid), str(c))
        # Output the within distance into file: distancei
        self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
Exemplo n.º 2
0
    def reduce(self, uid, values):
        """Average the member vectors of a cluster and emit the new center.

        Args:
            uid: Identifier of the cluster; stored on the new ``Cluster`` and
                used to build both output keys.
            values: Iterable of strings, each the textual form of a dict
                mapping token id to weight.

        Emits:
            ``"cluster<uid>"`` with ``str(cluster)`` and ``"distance<uid>"``
            with ``"<uid>|<sqdist>"``.
        """
        # SECURITY(review): eval() executes arbitrary expressions. This is
        # only acceptable while every record is produced by this job's own
        # mapper; if records can come from an untrusted source, switch to
        # ast.literal_eval.
        values = [eval(text_dict) for text_dict in values]
        c = Cluster()
        c.uid = uid
        c_total = len(values)
        sqdist = 0.0

        # set cluster center to sum of members
        for doc in values:
            for tokenid in doc:
                # ``in`` replaces dict.has_key(), which no longer exists on
                # Python 3 dictionaries.
                if tokenid in c.tfidf:
                    c.tfidf[tokenid] += doc[tokenid]
                else:
                    c.tfidf[tokenid] = doc[tokenid]

        # set cluster center, currently the sum, to the mean
        for tokenid in c.tfidf:
            c.tfidf[tokenid] = c.tfidf[tokenid] / float(c_total)

        # set sqdist to the squared sum of deviations from mean
        for doc in values:
            sqdist += MathUtil.compute_distance(c.tfidf, doc, squared=True)

        # Output the cluster center into file: clusteri
        self.emit("cluster" + str(c.uid), str(c))
        # Output the within distance into file: distancei
        self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
Exemplo n.º 3
0
    def reduce(self, uid, values):
        """Build the updated center for cluster ``uid`` from its members.

        Args:
            uid: Cluster identifier; copied onto the new ``Cluster`` and used
                in both emitted keys.
            values: Iterable of raw records, each converted with
                ``Document``. Lists and one-shot iterables are both accepted.

        Emits:
            ``"cluster<uid>"`` -> ``str(cluster)`` and ``"distance<uid>"`` ->
            ``"<uid>|<accumulated distance>"``.
        """
        c = Cluster()
        c.uid = uid
        sqdist = 0.0

        # First pass: parse every record and add its weights to the center.
        instances = []
        for value in values:
            doc = Document(value)
            instances.append(doc)
            for token, weight in doc.tfidf.items():
                # Test membership on the mapping itself instead of on a
                # fresh keys() result, and avoid a local called ``bool``.
                if token in c.tfidf:
                    c.tfidf[token] += weight
                else:
                    c.tfidf[token] = weight

        # len(instances) rather than len(values): the latter raises for
        # iterables that do not support len().
        size = float(len(instances))
        for token in c.tfidf:
            c.tfidf[token] = c.tfidf[token] / size

        # Second pass: add up each member's distance to the averaged center.
        # NOTE(review): the name suggests a squared distance, but that is
        # decided by MathUtil.compute_distance's defaults - confirm.
        for instance in instances:
            sqdist += MathUtil.compute_distance(map1=c.tfidf,
                                                map2=instance.tfidf)

        # Output the cluster center into file: clusteri
        self.emit("cluster" + str(c.uid), str(c))
        # Output the within distance into file: distancei
        self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
Exemplo n.º 4
0
    def reduce(self, uid, values):
        """Compute the new center of cluster ``uid`` and its member distance.

        Args:
            uid: Cluster identifier; converted with ``int`` before use.
            values: Iterable of records, each passed to ``Document``.

        Emits:
            ``"cluster<uid>"`` with ``str(cluster)`` and ``"distance<uid>"``
            with ``"<uid>|<accumulated distance>"``.
        """
        c = Cluster()
        c.uid = int(uid)
        sqdist = 0.0

        # Parse every record exactly once. The parsed documents are reused
        # for the distance pass below, so ``values`` is consumed a single
        # time (a one-shot iterable would otherwise yield nothing on the
        # second pass) and no record is parsed twice.
        docs = [Document(value) for value in values]

        # Compute new center: per-token sum, then divide by member count.
        for doc in docs:
            for key, v in doc.tfidf.items():
                c.tfidf[key] = c.tfidf.get(key, 0.0) + v

        count = len(docs)
        for key in c.tfidf:
            c.tfidf[key] /= count

        # Get within cluster distance
        for doc in docs:
            sqdist += MathUtil.compute_distance(c.tfidf, doc.tfidf)

        # Output the cluster center into file: clusteri
        self.emit("cluster" + str(c.uid), str(c))
        # Output the within distance into file: distancei
        self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
Exemplo n.º 5
0
    def reduce(self, uid, values):
        """Turn the members of cluster ``uid`` into a mean vector and emit it.

        Args:
            uid: Cluster identifier; copied onto the new ``Cluster`` and used
                in both emitted keys.
            values: Iterable of strings; each is evaluated into a dict that
                maps token id to weight.

        Emits:
            ``"cluster<uid>"`` -> ``str(cluster)`` and ``"distance<uid>"`` ->
            ``"<uid>|<sqdist>"``.
        """
        # SECURITY(review): eval() will run any expression found in a
        # record. Keep it only if the records are guaranteed to come from
        # this job's mapper; otherwise use ast.literal_eval.
        values = [eval(text_dict) for text_dict in values]
        c = Cluster()
        c.uid = uid
        c_total = len(values)
        sqdist = 0.0

        # set cluster center to sum of members
        for doc in values:
            for tokenid in doc:
                # dict.has_key() was removed in Python 3; the ``in``
                # operator works on every Python version.
                if tokenid in c.tfidf:
                    c.tfidf[tokenid] += doc[tokenid]
                else:
                    c.tfidf[tokenid] = doc[tokenid]

        # set cluster center, currently the sum, to the mean
        for tokenid in c.tfidf:
            c.tfidf[tokenid] = c.tfidf[tokenid] / float(c_total)

        # set sqdist to the squared sum of deviations from mean
        for doc in values:
            sqdist += MathUtil.compute_distance(c.tfidf, doc, squared=True)

        # Output the cluster center into file: clusteri
        self.emit("cluster" + str(c.uid), str(c))
        # Output the within distance into file: distancei
        self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
Exemplo n.º 6
0
 def map(self, line):
     """Emit ``line`` keyed by the uid of the closest cluster.

     The line is parsed into a ``Document`` and compared against every
     entry of ``self.clusters``. The key stays ``-1`` when no cluster
     yields a distance below ``sys.maxsize`` (for example when there are
     no clusters at all).
     """
     document = Document(line)
     nearest_uid = -1
     smallest = sys.maxsize
     for candidate in self.clusters:
         distance = MathUtil.compute_distance(map1=candidate.tfidf,
                                              map2=document.tfidf)
         # Strict comparison: on a tie the earlier cluster is kept.
         if distance < smallest:
             nearest_uid, smallest = candidate.uid, distance
     # The raw input line, not the parsed document, is the emitted value.
     self.emit(nearest_uid, line)
Exemplo n.º 7
0
 def map(self, line):
     """Emit the parsed document keyed by the uid of its nearest cluster.

     Args:
         line: Raw input record; parsed with ``Document``.

     Emits:
         ``key`` = uid of the cluster with the smallest squared distance
         (``None`` when ``self.clusters`` is empty), ``value`` = the parsed
         ``Document``.
     """
     # Find the cluster assignment by brute force.
     doc = Document(line)
     cluster_uid = None
     sqdist_to_nearest = float('inf')
     for cluster_k in self.clusters:
         sqdist_k = MathUtil.compute_distance(map1=cluster_k.tfidf,
                                              map2=doc.tfidf,
                                              squared=True)
         # ``<=`` means that on a tie the later cluster wins.
         if sqdist_k <= sqdist_to_nearest:
             cluster_uid = cluster_k.uid
             # Remember the best distance so far. Without this update
             # every finite distance passes the test against infinity and
             # the last cluster is always chosen.
             sqdist_to_nearest = sqdist_k
     self.emit(key=cluster_uid, value=doc)
Exemplo n.º 8
0
 def map(self, line):
     """Route ``line`` to the cluster whose center is closest to it.

     A ``Document`` is built from the line and its tf-idf map is compared
     with each cluster in ``self.clusters``. If no distance is below
     ``sys.maxsize`` the emitted key remains ``-1``.
     """
     parsed = Document(line)
     best_uid, best_dist = -1, sys.maxsize
     for center in self.clusters:
         current = MathUtil.compute_distance(map1=center.tfidf,
                                             map2=parsed.tfidf)
         # Only a strictly smaller distance replaces the current best.
         if current < best_dist:
             best_uid = center.uid
             best_dist = current
     # Emit the untouched input line under the chosen cluster id.
     self.emit(best_uid, line)
Exemplo n.º 9
0
    def map(self, line):
        """Emit the document built from ``line`` under its nearest cluster.

        The emitted key is ``str`` of the chosen cluster uid and the value
        is ``str`` of the parsed ``Document``. Clusters are read from
        ``self.clusters``.
        """
        document = Document(line)
        # Default key: the document's own uid. It is kept when no cluster
        # produces a distance below infinity (e.g. there are no clusters).
        nearest_uid = document.uid
        smallest = float("inf")
        for cluster in self.clusters:
            candidate = MathUtil.compute_distance(document.tfidf, cluster.tfidf)
            # Strict comparison keeps the first cluster on a tie.
            if candidate < smallest:
                smallest, nearest_uid = candidate, cluster.uid

        self.emit(str(nearest_uid), str(document))