Example #1
import sys

import turbotopics as tt  # Blei's turbotopics module, assumed importable as tt


def main(corpus_file, pvalue, use_perm, out_filename, min_count=5,
         min_bigram_count=5, min_char_count=3):

    """recursively find collocations for a given corpus.  writes out
    the marginal counts to a specified file"""

    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    ### read corpus
    with open(corpus_file) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)
    def iter_gen():
        return corpus

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # a word passes only if it survives the character, stopword,
        # and digit filters
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    f = open(out_filename, 'w')
    # this can be adjusted to write out any information you need, e.g.:
    # for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
    #     f.write('%s|%g\n' % (term, count))
    f.close()

    return cnts
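
A minimal usage sketch for this example; the corpus path and output file name are hypothetical placeholders, and the returned counts object is assumed to expose the marg dict referenced above:

if __name__ == '__main__':
    # one document per line in corpus.txt (hypothetical file)
    counts = main('corpus.txt', pvalue=0.001, use_perm=False,
                  out_filename='ngrams.txt')
    sys.stdout.write('%d terms counted\n' % len(counts.marg))
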
Example #2
import itertools

import turbotopics as tt  # Blei's turbotopics module, assumed importable as tt


def turbo_topic(corpus, assigns, topic, use_perm=False, pvalue=0.1, min=25):
    """Find significant n-grams for one topic from per-word assignments."""

    def iter_gen():
        # pair each document with its topic assignments (lazy, Python 2)
        return itertools.izip(corpus, assigns)

    def update_fun(counts, doc):
        # update_counts_from_topic is defined elsewhere in the source file
        update_counts_from_topic(doc[0], doc[1], topic, counts)

    test = tt.LikelihoodRatio(pvalue, use_perm=use_perm)
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, test, min)

    return cnts
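
A hedged usage sketch, assuming corpus is a list of documents and assigns holds the matching per-word topic assignments from an LDA run (all data names here are hypothetical):

# n-grams significant within topic 3 (hypothetical data)
cnts = turbo_topic(corpus, assigns, topic=3, use_perm=False,
                   pvalue=0.001, min=25)
for term in sorted(cnts.marg, key=cnts.marg.get, reverse=True)[:20]:
    print term, cnts.marg[term]
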
Example #3
import codecs
import sys

import turbotopics as tt  # Blei's turbotopics module, assumed importable as tt


def compute(corpus_file,
            pvalue,
            use_perm,
            out_filename,
            stopw=None,
            min_count=5,
            min_bigram_count=5,
            min_char_count=3,
            encoding='utf-8'):
    """
    Recursively find collocations for a given corpus.  writes
    the marginal counts to a specified file
    :param encoding: Encoding of the corpus file
    :param stopw: List of stopwords to apply to the analysis
    :param corpus_file: string with file name
    :param pvalue: self-explanatory
    :param use_perm: Boolean. Score by permutation
    :param out_filename: file name to write into
    :param min_count:
    :param min_bigram_count:
    :param min_char_count:
    """

    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        for doc in corpus:
            yield doc

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # a word passes only if it survives the character, stopword,
        # and digit filters
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # this can be adjusted to write out any information you need
        for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))
    print "Number of selected bigrams: ", len(cnts.vocab)
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts
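
A hedged usage sketch for this example, assuming a UTF-8 corpus file with one document per line; the file names and stopword list are placeholders:

# hypothetical invocation with a small stopword list
stopwords = ['the', 'a', 'an', 'of', 'and', 'to', 'in']
cnts = compute('corpus.txt', pvalue=0.001, use_perm=False,
               out_filename='collocations.txt', stopw=stopwords)
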