from collections import Counter

import read_data
import word2phrase


def main():
    # book_sentences = get_book()
    book_sentences = read_data.get_sentences(
        'c://users//stifler//desktop//seg_data.txt')
    # Two training passes: the first joins frequent bigrams, the second can
    # join a merged bigram with a neighbour to form a three-word phrase.
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)

    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            # One underscore marks a joined bigram, two mark a trigram.
            if word.count('_') == 1:
                two_word_counter[word] += 1
            if word.count('_') == 2:
                three_word_counter[word] += 1

    print('=' * 60)
    print('Top 50 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(50):
        print('%56s %6d' % (phrase, count))

    print('=' * 60)
    print('Top 30 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(30):
        print('%56s %6d' % (phrase, count))
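# Why train_model runs twice: a sketch, not from the source. After the first
# pass 'new york' can become the single token 'new_york'; on the second pass
# that token can merge with 'city' into 'new_york_city'. The toy corpus and
# the min_count/threshold values below are made up, and whether anything
# actually merges depends on word2phrase's bigram scoring.
toy = [['new', 'york', 'city', 'tour']] * 20
pass1 = word2phrase.train_model(toy, min_count=1, threshold=1)
pass2 = word2phrase.train_model(pass1, min_count=1, threshold=1)
# pass2 may now contain 'new_york_city', which word.count('_') == 2
# classifies as a three-word phrase in the counters above.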
import word2phrase


def main():
    book_sentences = read_sentences()
    phrased1 = word2phrase.train_model(book_sentences, min_count=4)
    phrased2 = word2phrase.train_model(phrased1, min_count=4)

    new_text = ''
    for sentence in phrased2:
        for word in sentence:
            # Replace all single quotes with a space
            word = word.replace("'", " ")
            new_text += word + ' '

    with open("alice-phrases", "w", encoding="utf-8") as ofile:
        print(new_text, file=ofile)
from collections import Counter

import word2phrase


def main(fname):
    # global two_word, three_word
    two_word = []
    three_word = []
    book_sentences = get_book(fname)
    phrased1 = word2phrase.train_model(book_sentences, min_count=2)
    phrased2 = word2phrase.train_model(phrased1, min_count=2)

    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            if word.count('_') == 1:
                two_word_counter[word] += 1
                two_word.append(word)
            if word.count('_') == 2:
                three_word_counter[word] += 1
                three_word.append(word)
    return set(two_word), set(three_word)
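# A hypothetical downstream use of the sets this variant returns: re-joining
# known phrases in a fresh token stream. The set literal stands in for what
# main(fname) would return on real data.
two_word = {'new_york'}
tokens = ['i', 'love', 'new', 'york']
joined = []
i = 0
while i < len(tokens):
    if i + 1 < len(tokens) and tokens[i] + '_' + tokens[i + 1] in two_word:
        joined.append(tokens[i] + '_' + tokens[i + 1])
        i += 2
    else:
        joined.append(tokens[i])
        i += 1
print(joined)  # ['i', 'love', 'new_york']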
from collections import Counter

import word2phrase


def main():
    book_sentences = get_book()
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)

    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            if word.count('_') == 1:
                two_word_counter[word] += 1
            if word.count('_') == 2:
                three_word_counter[word] += 1

    print('=' * 60)
    print('Top 20 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(20):
        print('%56s %6d' % (phrase, count))
    print()
    print('=' * 60)
    print('Top 10 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(10):
        print('%56s %6d' % (phrase, count))
import pickle
from collections import Counter

import word2phrase


def main():
    book_sentences = get_book()
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)

    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            if word.count('_') == 1:
                two_word_counter[word] += 1
            if word.count('_') == 2:
                three_word_counter[word] += 1

    # Persist both counters so the phrases can be reloaded without retraining.
    with open('quizletvocab3.pick', 'wb') as f:
        pickle.dump([two_word_counter, three_word_counter], f)

    print('=' * 60)
    print('Top 20 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(20):
        print('%56s %6d' % (phrase, count))
    print()
    print('=' * 60)
    print('Top 10 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(10):
        print('%56s %6d' % (phrase, count))
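# A minimal sketch of reading the pickled counters back, assuming the file
# written by main() above; most_common() then works without retraining.
import pickle

with open('quizletvocab3.pick', 'rb') as f:
    two_word_counter, three_word_counter = pickle.load(f)
print(two_word_counter.most_common(5))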
import math

import read_data
import word2phrase


def info_entropy(sentences, new_words):
    word_prefix = {token: {} for token in new_words}
    word_suffix = {token: {} for token in new_words}
    word_pre_suf_entropy = {}

    # The opening of this neighbour-collection loop is missing in the source;
    # the loop below is a reconstruction and an assumption. The surviving
    # statement recorded line[i + 2] as the right neighbour, which suggests a
    # candidate word spans line[i] and line[i + 1].
    for line in sentences:
        for i in range(1, len(line) - 2):
            token = line[i] + '_' + line[i + 1]
            if token in new_words:
                word_prefix[token][line[i - 1]] = 1
                word_suffix[token][line[i + 2]] = 1

    for token in new_words:
        # prefix entropy: H = -sum(p * log p) over left-neighbour counts
        total = 0
        for k, v in word_prefix[token].items():
            total += int(v)
        pre_entropy = 0
        for k, v in word_prefix[token].items():
            pre_entropy += -(int(v) / total) * math.log(int(v) / total)
        # suffix entropy: same formula over right-neighbour counts
        total = 0
        for k, v in word_suffix[token].items():
            total += int(v)
        suf_entropy = 0
        for k, v in word_suffix[token].items():
            suf_entropy += -(int(v) / total) * math.log(int(v) / total)
        word_pre_suf_entropy[token] = [pre_entropy, suf_entropy]
    return word_pre_suf_entropy


book_sentences = read_data.get_sentences(
    'c://users//stifler//desktop//data.txt')
# This call relies on a modified word2phrase.train_model that returns the
# transformed sentences together with the newly joined words.
out_sentences, new_words = word2phrase.train_model(book_sentences,
                                                   threshold=100)
for i in new_words:
    print(i)
word_pre_suf_entropy = info_entropy(out_sentences, new_words)
for i in word_pre_suf_entropy:
    print(i)
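# A compact sketch (not from the source) of the same neighbour-entropy idea
# using collections.Counter; with real per-neighbour counts the distribution
# need not be uniform, unlike the 0/1 flags kept by info_entropy above.
import math
from collections import Counter


def neighbour_entropy(neighbours):
    # Shannon entropy H = -sum(p * log p) over the neighbour distribution.
    counts = Counter(neighbours)
    total = sum(counts.values())
    return -sum((c / total) * math.log(c / total) for c in counts.values())


# A word seen with four distinct left neighbours, once each, gets the
# maximum entropy for four outcomes: log(4) ~= 1.386.
print(neighbour_entropy(['the', 'a', 'my', 'his']))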