def write_sequence_output(clusters, ofile):
    """Write training documents with cluster labels.

    Each sentence is emitted as ``<topic>\t<tokens>`` in original document
    order, with a blank line separating documents.  Sentences whose filtered
    token list is empty are skipped.  Clusters 1..k-1 are labelled
    ``tpc_1`` ... ``tpc_{k-1}``; the final cluster is the catch-all
    ``tpc_MISC``.

    Args:
        clusters: sequence of dicts with a 'sentences' list; each sentence
            exposes ``filename`` and ``corenlp_sentence`` (with ``idx``).
        ofile: path of the UTF-8 output file to create/overwrite.
    """
    # Map filename -> list of (sentence index, topic label, filtered tokens).
    docs = {}
    num_clusters = len(clusters)
    for i, cluster in enumerate(clusters, 1):
        # The last cluster is treated as the miscellaneous topic.
        if i < num_clusters:
            topic = u'tpc_{}'.format(i)
        else:
            topic = u'tpc_MISC'
        for sentence in cluster['sentences']:
            if sentence.filename not in docs:
                docs[sentence.filename] = []
            snum = sentence.corenlp_sentence.idx
            ug = topics.filter_tokens(sentence.corenlp_sentence)
            docs[sentence.filename].append((snum, topic, ug))
    with codecs.open(ofile, 'w', 'utf-8') as of:
        for filename, sents in docs.iteritems():
            # Restore the original document order by sentence index.
            ordered_sents = sorted(sents, key=lambda x: x[0])
            for sent in ordered_sents:
                topic = sent[1]
                # Skip sentences with no tokens left after filtering.
                if len(sent[2]) == 0:
                    continue
                tokens = u' '.join(sent[2])
                of.write(u'{}\t{}\n'.format(topic, tokens))
            # Blank line marks the end of a document.
            of.write(u'\n')
        # One flush at the end; the per-line flushes in the original were
        # redundant since the `with` block closes (and flushes) the file.
        of.flush()
def make_instance(sent, position, doc_length, filename):
    """Build a Sentence training instance from a CoreNLP sentence.

    Extracts filtered tokens and their uni/bi/trigram features, and buckets
    the sentence by document position (1 = first half, 2 = second half).
    """
    tokens = topics.filter_tokens(sent)
    # Relative position in the document decides the position bucket.
    in_first_half = position / float(doc_length) <= .5
    position_bucket = 1 if in_first_half else 2
    return Sentence(
        unicode(sent),
        topics.unigrams(tokens),
        topics.bigrams(tokens),
        topics.trigrams(tokens),
        position_bucket,
        sent,
        filename)
def write_clusters_output(clusters, cfile):
    """Write cluster sentences for each cluster -- this is mainly for debugging."""
    with codecs.open(cfile, 'w', 'utf-8') as out:
        for num, cluster in enumerate(clusters, 1):
            # Header line for this topic.
            out.write(u'Topic {:3}\n---------\n'.format(num))
            for member in cluster['sentences']:
                words = topics.filter_tokens(member.corenlp_sentence)
                out.write(u' '.join(words) + u'\n')
            out.write(u'\n')
            out.flush()