def test_cluster():
    """Clustering the sample docs yields two groups, each with one similar doc."""
    corpus = Corpus(similarity=0.1)
    for document in docs:
        corpus.add(document)
    clusters = corpus.cluster()
    eq_(len(clusters), 2)
    eq_(len(clusters[0].similars), 1)
    eq_(len(clusters[1].similars), 1)
def cluster_queryset(qs):
    """Cluster the objects in *qs* by their ``description`` text.

    Duplicate and very short (< 15 chars) descriptions are skipped.
    Returns whatever ``Corpus.cluster()`` produces.
    """
    corpus = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)
    seen = {}
    for op in qs:
        # Skip exact-duplicate descriptions and ones too short to cluster usefully.
        if op.description in seen or len(op.description) < 15:
            continue
        seen[op.description] = 1
        corpus.add(op, str=op.description, key=op.id)
    return corpus.cluster()
def process(inStream, outStream, fields=None, limits=None):
    """Read tab-prefixed JSON documents from *inStream*, cluster them by a
    text field, and write the top clusters as JSON to *outStream*.

    Each input line is expected to be ``<prefix>\\t<json>``; only the part
    after the first tab is parsed.

    :param inStream: iterable of lines (bytes-like; the JSON payload is
        decoded as UTF-8 — NOTE(review): assumes a byte stream, confirm caller).
    :param outStream: writable file-like object receiving the JSON result.
    :param fields: mapping with ``"id"`` and ``"text"`` keys naming the
        document key field and the text field (default: ``{"id": "id",
        "text": "text"}``).
    :param limits: mapping with ``"clusters"`` (max clusters emitted) and
        ``"top_documents"`` (max docs per cluster), both defaulting to 10.
    """
    # Fix: the original used mutable dict literals as default arguments,
    # which are shared across calls; use the None-sentinel idiom instead.
    if fields is None:
        fields = {"id": "id", "text": "text"}
    if limits is None:
        limits = {"clusters": 10, "top_documents": 10}

    docs_by_key = {}  # renamed from `all`, which shadowed the builtin
    text_field = fields["text"]
    key_field = fields["id"]
    max_clusters = limits["clusters"]
    max_top_docs = limits["top_documents"]

    corpus = Corpus()
    for line in inStream:
        # Drop everything up to and including the first tab.
        data = line.split('\t', 1)[1]
        doc = json.loads(data.decode("utf8"))
        key = doc[key_field]
        docs_by_key[key] = doc
        # Fix: the original bound the return value to an unused `text` local.
        corpus.add((key, doc[text_field]), key=key)

    results = []
    # Fix: the original reused `c` as the loop variable, shadowing the
    # Corpus instance it was iterating the clusters of.
    for cluster in corpus.cluster()[:max_clusters]:
        # The cluster's primary doc plus its closest similars, capped at
        # max_top_docs total.
        tophits = [cluster.primary]
        tophits += [hit["object"] for hit in cluster.similars[:max_top_docs - 1]]
        topdocs = [docs_by_key[key] for (key, _text) in tophits]
        results.append({"top_documents": topdocs})

    json.dump({"clusters": results}, outStream)