def get_comparison_set(fi):
    """Return the set of phrases recorded in the language model of a file.

    The file at path ``fi`` is read in full, its newline and carriage-return
    characters are replaced with spaces, and the resulting text is handed to
    ``lang_model``. Every key of the model's ``big_fd`` mapping is collected
    into a set; the associated counts are ignored (no frequency threshold
    is applied).

    NOTE(review): a second ``get_comparison_set`` with an extra
    ``ngram_length`` parameter is defined after this one; if both live in
    the same module, the later definition replaces this one at import time.

    :param fi: path of the text file to read.
    :returns: a ``set`` containing every key of ``lang_model(text).big_fd``
        (presumably bigram phrases, judging by the attribute name -- confirm
        against ``lang_model``).
    """
    # Use a context manager so the handle is closed even if read() raises.
    with open(fi, 'r') as handle:
        text = handle.read().replace('\n', ' ').replace('\r', ' ')

    model = lang_model(text)
    # Only the keys are needed; keys() exists on both Python 2 and Python 3
    # mappings, unlike the Python-2-only iteritems().
    return set(model.big_fd.keys())
def get_comparison_set(fi, ngram_length=8):
    """Build the n-gram comparison data for the text stored in a file.

    The file at path ``fi`` is read in full and passed unmodified to
    ``lang_model``; the result of the model's ``gram(ngram_length)`` call is
    returned as-is.

    NOTE(review): an earlier one-argument ``get_comparison_set`` appears
    before this definition; if both live in the same module, this later
    definition is the one that stays bound to the name.

    :param fi: path of the text file to read.
    :param ngram_length: value forwarded to ``lang_model(...).gram``
        (defaults to 8); presumably the n-gram size -- confirm against
        ``lang_model``.
    :returns: whatever ``lang_model(text).gram(ngram_length)`` returns.
    """
    # Use a context manager so the handle is closed even if read() raises.
    with open(fi, "r") as handle:
        text = handle.read()

    lang = lang_model(text)
    return lang.gram(ngram_length)