import codecs
import itertools
import sys

# the tt alias below is assumed to name the turbotopics module, which
# provides LikelihoodRatio, nested_sig_bigrams, the word filters, and
# write_vocab
import turbotopics as tt


def main(corpus_file, pvalue, use_perm, out_filename,
         min_count=5, min_bigram_count=5, min_char_count=3):
    """Recursively find collocations for a given corpus.

    Writes the marginal counts to the specified file."""

    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    ### read corpus
    with open(corpus_file) as corpus_f:
        corpus = corpus_f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        return corpus

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # keep words that pass the length, stopword, and digit filters
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        # note: only tt.stop_filter is applied here; my_filter is available
        # if stricter filtering is wanted
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file, most frequent first
    sys.stdout.write("writing to %s\n" % out_filename)
    with open(out_filename, 'w') as f:
        # this can be adjusted to write out any information you need
        for (term, count) in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write('%s|%g\n' % (term, count))
    return cnts
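

# A minimal usage sketch for main(), not part of the original module: the
# corpus path, p-value, and output name below are hypothetical placeholders.
# The corpus file is assumed to hold one plain-text document per line, since
# main() reads it with readlines().
def _demo_main():
    cnts = main('corpus.txt', pvalue=0.001, use_perm=False,
                out_filename='ngrams.txt')
    # cnts.marg maps each significant term/n-gram to its marginal count
    return cnts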
def turbo_topic(corpus, assigns, topic, use_perm=False, pvalue=0.1,
                min_count=25):
    """Find significant n-grams for a single topic, given per-word topic
    assignments for each document."""

    def iter_gen():
        # pair each document with its per-word topic assignments
        return itertools.izip(corpus, assigns)

    def update_fun(counts, doc):
        # update_counts_from_topic is defined elsewhere in this module; it
        # counts only the words assigned to the topic of interest
        update_counts_from_topic(doc[0], doc[1], topic, counts)

    test = tt.LikelihoodRatio(pvalue, use_perm=use_perm)
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, test, min_count)
    return cnts
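

# Hedged usage sketch, not part of the original module. The variables below
# are hypothetical: `docs` is a list of document strings and `word_assigns`
# holds the matching per-word topic assignments, in whatever format
# update_counts_from_topic expects.
def _demo_turbo_topic(docs, word_assigns):
    # significant n-grams for topic 3, scored by permutation at p < 0.05
    return turbo_topic(docs, word_assigns, topic=3, use_perm=True,
                       pvalue=0.05, min_count=25)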
def compute(corpus_file, pvalue, use_perm, out_filename, stopw=None,
            min_count=5, min_bigram_count=5, min_char_count=3,
            encoding='utf-8'):
    """Recursively find collocations for a given corpus.

    Writes the marginal counts to the specified file.

    :param corpus_file: name of the corpus file, one document per line
    :param pvalue: significance threshold for the likelihood ratio test
    :param use_perm: boolean; score by a permutation test
    :param out_filename: file name to write the n-gram counts into
    :param stopw: list of stopwords to apply to the analysis
    :param min_count: minimum count for a term to enter the analysis
    :param min_bigram_count: minimum count for a candidate bigram
        (not used in this function)
    :param min_char_count: minimum number of characters per word
    :param encoding: encoding of the corpus file
    """
    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        for doc in corpus:
            yield doc

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # keep words that pass the length, stopword, and digit filters
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file, most frequent first
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # this can be adjusted to write out any information you need
        for (term, count) in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))

    print "Number of selected bigrams: ", len(cnts.vocab)
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts
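

# Hedged usage sketch for compute(), not part of the original module. The
# file name, stopword list, and encoding are hypothetical example values;
# passing stopw installs the list as tt._stop_words before counting.
def _demo_compute():
    stopwords = ['the', 'a', 'of', 'and']
    cnts = compute('corpus.txt', pvalue=0.001, use_perm=False,
                   out_filename='ngrams.txt', stopw=stopwords,
                   encoding='utf-8')
    # marginal counts are also written to 'ngram_counts.csv' via tt.write_vocab
    return cnts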