def write_sequence_output(clusters, ofile):
    """Write training documents with cluster labels.

    Each sentence is emitted as ``<topic>\t<tokens>`` in original document
    order, with a blank line separating documents.  Sentences whose filtered
    token list is empty are skipped.  Clusters 1..k-1 are labelled
    ``tpc_1`` ... ``tpc_{k-1}``; the final cluster is the catch-all
    ``tpc_MISC``.

    Args:
        clusters: sequence of dicts with a 'sentences' list; each sentence
            exposes ``filename`` and ``corenlp_sentence`` (with ``idx``).
        ofile: path of the UTF-8 output file to create/overwrite.
    """
    # Map filename -> list of (sentence index, topic label, filtered tokens).
    docs = {}
    num_clusters = len(clusters)
    for i, cluster in enumerate(clusters, 1):
        # The last cluster is treated as the miscellaneous topic.
        if i < num_clusters:
            topic = u'tpc_{}'.format(i)
        else:
            topic = u'tpc_MISC'
        for sentence in cluster['sentences']:
            if sentence.filename not in docs:
                docs[sentence.filename] = []
            snum = sentence.corenlp_sentence.idx
            ug = topics.filter_tokens(sentence.corenlp_sentence)
            docs[sentence.filename].append((snum, topic, ug))
    with codecs.open(ofile, 'w', 'utf-8') as of:
        for filename, sents in docs.iteritems():
            # Restore the original document order by sentence index.
            ordered_sents = sorted(sents, key=lambda x: x[0])
            for sent in ordered_sents:
                topic = sent[1]
                # Skip sentences with no tokens left after filtering.
                if len(sent[2]) == 0:
                    continue
                tokens = u' '.join(sent[2])
                of.write(u'{}\t{}\n'.format(topic, tokens))
            # Blank line marks the end of a document.
            of.write(u'\n')
        # One flush at the end; the per-line flushes in the original were
        # redundant since the `with` block closes (and flushes) the file.
        of.flush()
def make_instance(sent, position, doc_length, filename):
    """Build a Sentence training instance from a CoreNLP sentence.

    Extracts filtered tokens and their uni/bi/trigram features, and buckets
    the sentence by document position (1 = first half, 2 = second half).
    """
    tokens = topics.filter_tokens(sent)
    # Relative position in the document decides the position bucket.
    in_first_half = position / float(doc_length) <= .5
    position_bucket = 1 if in_first_half else 2
    return Sentence(
        unicode(sent),
        topics.unigrams(tokens),
        topics.bigrams(tokens),
        topics.trigrams(tokens),
        position_bucket,
        sent,
        filename)
def write_clusters_output(clusters, cfile):
    """Write cluster sentences for each cluster -- this is mainly for debugging."""
    with codecs.open(cfile, 'w', 'utf-8') as out:
        for num, cluster in enumerate(clusters, 1):
            # Header line for this topic.
            out.write(u'Topic {:3}\n---------\n'.format(num))
            for member in cluster['sentences']:
                words = topics.filter_tokens(member.corenlp_sentence)
                out.write(u' '.join(words) + u'\n')
            out.write(u'\n')
            out.flush()