def __call__(self, key, values_gen):
    """Yield sortable cluster rows for all messages grouped under *key*.

    *key* is unpacked as ``(version, site, platform, type)`` and
    *values_gen* is an iterable of ``(m_id, message)`` pairs.

    Every message is emitted twice, once tagged with the key's type and
    once tagged with ``None``.  Each emitted row is a 2-tuple::

        ((sortkey, version, site, platform, s_type, c_index, type, c_size),
         (m_id, message, score))

    where ``sortkey`` is ``MAX_SIZE - c_size``.

    * Exactly one message: it is emitted as cluster index 1 with size 1
      and score 1.0; no corpus is built.
    * Otherwise: all messages are added to a ``Corpus`` and clustered.
      Clusters are numbered from 2 upwards.  The primary member gets
      score 1.0, the others their reported similarity.  Messages that end
      up in no cluster are then emitted as size-1 clusters (score 1.0),
      continuing the same numbering.

    Side effect: ``self.cluster_count`` grows by one per emitted row
    (i.e. by two per message), not by one per cluster.

    Raises ``KeyError`` if a cluster member's id is not, or is no longer,
    among the ids still pending (e.g. the same id is reported twice).
    """
    messages = list(values_gen)
    version, site, platform, type_ = key

    def build_row(s_type, index, size, m_id, message, score):
        # Subtracting from MAX_SIZE makes bigger clusters compare smaller;
        # presumably so they sort first -- confirm against the consumer.
        sortkey = MAX_SIZE - size
        self.cluster_count += 1
        sort_fields = (sortkey, version, site, platform,
                       s_type, index, type_, size)
        return sort_fields, (m_id, message, score)

    def both_rows(index, size, m_id, message, score):
        # One row tagged with the concrete type, one with None.
        for s_type in (type_, None):
            yield build_row(s_type, index, size, m_id, message, score)

    # A lone message needs no clustering at all.
    if len(messages) == 1:
        m_id, message = messages[0]
        for row in both_rows(1, 1, m_id, message, 1.0):
            yield row
        return

    corpus = Corpus()
    pending = {}  # m_id -> (m_id, message) for everything not yet clustered
    for m_id, message in messages:
        pending[m_id] = (m_id, message)
        corpus.add((m_id, message), str=message, key=m_id)

    # Index 1 is reserved for the single-message case above, so real
    # clusters start at 2.  `cluster_no` keeps its initial value when the
    # corpus yields no clusters.
    cluster_no = 1
    for cluster_no, cluster in enumerate(corpus.cluster(), 2):
        similar_members = [
            (similar["object"], similar["similarity"])
            for similar in cluster.similars
        ]
        members = [(cluster.primary, 1.0)] + similar_members
        size = len(members)
        for (m_id, message), score in members:
            del pending[m_id]
            for row in both_rows(cluster_no, size, m_id, message, score):
                yield row

    # Whatever was never claimed by a cluster becomes its own cluster.
    leftovers = pending.values()
    for cluster_no, (m_id, message) in enumerate(leftovers, cluster_no + 1):
        for row in both_rows(cluster_no, 1, m_id, message, 1.0):
            yield row