예제 #1
0
    def __call__(self, key, values_gen):
        """Cluster the messages of one key group and yield a row per message.

        ``key`` is unpacked as ``(version, site, platform, type)`` and
        ``values_gen`` must be an iterable of ``(m_id, message)`` pairs.

        Every message is emitted twice: once under the group's own type and
        once under ``None``.  Each yielded item is a pair of tuples::

            ((sortkey, version, site, platform, s_type, c_index, type, c_size),
             (m_id, message, score))

        where ``sortkey`` is ``MAX_SIZE - c_size``, ``c_index`` is the number
        of the cluster inside this group and ``c_size`` is the number of
        messages in that cluster.  ``self.cluster_count`` is incremented once
        for every row that is produced.

        A group holding exactly one message skips clustering and is emitted
        as cluster 1 with size 1 and score 1.0.  Otherwise the messages are
        fed to a ``Corpus``; each cluster it returns is emitted (the primary
        document with score 1.0, the similar ones with their reported
        similarity), followed by every message that ended up in no cluster
        as its own size-1 cluster.
        """
        values = list(values_gen)
        # Bound as ``type_`` so the builtin ``type`` is not shadowed.
        version, site, platform, type_ = key

        def rows(c_index, c_size, m_id, message, score):
            # Produce the two output rows of one message: first under the
            # group's own type, then under None.
            sortkey = MAX_SIZE - c_size
            for s_type in (type_, None):
                self.cluster_count += 1
                yield (
                    (sortkey, version, site, platform, s_type, c_index,
                     type_, c_size),
                    (m_id, message, score),
                )

        if len(values) == 1:
            m_id, message = values[0]
            for row in rows(1, 1, m_id, message, 1.0):
                yield row
            return

        corpus = Corpus()
        # m_id -> message for everything not yet assigned to a cluster.
        unclustered = {}
        for m_id, message in values:
            unclustered[m_id] = message
            corpus.add((m_id, message), str=message, key=m_id)

        # The counter is advanced *before* each use, so in this branch the
        # first emitted cluster is number 2.  Kept as in the original so that
        # downstream consumers see unchanged indices.
        c_index = 1
        for cluster in corpus.cluster():
            c_index += 1
            rest = [(s["object"], s["similarity"]) for s in cluster.similars]
            members = [(cluster.primary, 1.0)] + rest
            c_size = len(members)
            for (m_id, message), score in members:
                # pop() with a default instead of ``del``: an id reported
                # twice by the clusterer (or collapsed by a duplicate key)
                # must not abort the generator with a KeyError.
                unclustered.pop(m_id, None)
                for row in rows(c_index, c_size, m_id, message, score):
                    yield row

        # Whatever was never claimed by a cluster becomes its own cluster.
        for m_id, message in unclustered.items():
            c_index += 1
            for row in rows(c_index, 1, m_id, message, 1.0):
                yield row