Example #1
0
def main(path='c://users//stifler//desktop//seg_data.txt'):
    """Train a two-pass phrase model on sentences read from *path* and
    print the most frequent two- and three-word phrases.

    Parameters:
        path: text file consumed by read_data.get_sentences (defaults to
              the original hard-coded location for backward compatibility).
    """
    book_sentences = read_data.get_sentences(path)
    # First pass joins frequent bigrams; second pass can join a joined
    # bigram with a following word, producing three-word phrases.
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)
    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            # Joined phrases contain underscores: one '_' means a
            # two-word phrase, two mean a three-word phrase.
            if word.count('_') == 1:
                two_word_counter[word] += 1
            elif word.count('_') == 2:
                three_word_counter[word] += 1

    print('=' * 60)
    # Labels now match the number of items actually requested below
    # (the originals said "Top 20"/"Top 10" but printed 50/30).
    print('Top 50 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(50):
        print('%56s %6d' % (phrase, count))

    print('=' * 60)
    print('Top 30 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(30):
        print('%56s %6d' % (phrase, count))
Example #2
0
def main():
    """Phrase-join the book sentences and write the re-assembled text
    (single quotes replaced by spaces) to the file "alice-phrases".
    """
    book_sentences = read_sentences()
    # Two passes: the second can merge an already-joined bigram with a
    # following word into a three-word phrase.
    phrased1 = word2phrase.train_model(book_sentences, min_count=4)
    phrased2 = word2phrase.train_model(phrased1, min_count=4)
    # Collect pieces and join once instead of quadratic `new_text += ...`.
    # Each word keeps its trailing space, exactly as the original built it.
    parts = []
    for sentence in phrased2:
        for word in sentence:
            # Replace all single quotes with space
            parts.append(word.replace("'", " ") + ' ')
    new_text = ''.join(parts)
    # Context manager guarantees the file is closed even on error.
    with open("alice-phrases", "w", encoding="utf-8") as ofile:
        print(new_text, file=ofile)
Example #3
0
def main(fname):
    """Detect phrases in the book *fname* and return them.

    Returns:
        (two_word, three_word): two sets of underscore-joined phrases —
        two-word phrases (one '_') and three-word phrases (two '_').
    """
    # The original also maintained Counter()s alongside the lists, but
    # only set(...) of the collected words was ever returned, so the
    # counters were dead code; sets alone give identical results.
    two_word = set()
    three_word = set()
    book_sentences = get_book(fname)
    # Second training pass lets a joined bigram merge with a third word.
    phrased1 = word2phrase.train_model(book_sentences, min_count=2)
    phrased2 = word2phrase.train_model(phrased1, min_count=2)
    for sentence in phrased2:
        for word in sentence:
            if word.count('_') == 1:
                two_word.add(word)
            elif word.count('_') == 2:
                three_word.add(word)

    return two_word, three_word
Example #4
0
def main():
    """Train a two-pass phrase model on the book and print the top
    two- and three-word phrases with their frequencies.
    """
    book_sentences = get_book()
    # Two passes: the second can extend an already-joined bigram into a
    # three-word phrase.
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)
    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            # One underscore marks a joined bigram, two mark a trigram.
            if word.count('_') == 1:
                two_word_counter[word] += 1
            elif word.count('_') == 2:
                three_word_counter[word] += 1

    # Converted from Python 2 `print` statements: the rest of this file
    # uses Python 3 print() calls, and the old form is a syntax error
    # under Python 3. Format strings are unchanged.
    print('=' * 60)
    print('Top 20 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(20):
        print('%56s %6d' % (phrase, count))

    print()
    print('=' * 60)
    print('Top 10 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(10):
        print('%56s %6d' % (phrase, count))
Example #5
0
def main():
    """Train a two-pass phrase model, pickle the phrase counters to
    'quizletvocab3.pick', and print the top phrases.
    """
    book_sentences = get_book()
    # Second pass can join an already-merged bigram with a third word.
    phrased1 = word2phrase.train_model(book_sentences, min_count=3)
    phrased2 = word2phrase.train_model(phrased1, min_count=3)
    two_word_counter = Counter()
    three_word_counter = Counter()
    for sentence in phrased2:
        for word in sentence:
            # Underscore count distinguishes bigrams (1) from trigrams (2).
            if word.count('_') == 1:
                two_word_counter[word] += 1
            elif word.count('_') == 2:
                three_word_counter[word] += 1
    # `with` fixes the original's leaked file handle: open() was passed
    # straight to pickle.dump and never closed.
    with open('quizletvocab3.pick', 'wb+') as pick_file:
        pickle.dump([two_word_counter, three_word_counter], pick_file)
    print('=' * 60)
    print('Top 20 Two Word Phrases')
    for phrase, count in two_word_counter.most_common(20):
        print('%56s %6d' % (phrase, count))

    print()
    print('=' * 60)
    print('Top 10 Three Word Phrases')
    for phrase, count in three_word_counter.most_common(10):
        print('%56s %6d' % (phrase, count))
Example #6
0
                    word_suffix[token][line[i + 2]] = 1
    # NOTE(review): this is the tail of a function (presumably
    # info_entropy, given the call site below) whose `def` line and the
    # setup of word_prefix/word_suffix lie outside this chunk — confirm
    # against the full file before changing anything here.
    for token in new_words:
        # calculate pre entropy
        # First pass: total count of all left-neighbor occurrences.
        total = 0
        for k, v in word_prefix[token].items():
            total += int(v)
        # Shannon entropy of the left-neighbor distribution:
        # H = -sum(p * log p) with p = count / total (natural log).
        pre_entropy = 0
        for k, v in word_prefix[token].items():
            pre_entropy += -(int(v) / total) * math.log(int(v) / total)
        # suf entropy
        # Same computation over the right-neighbor distribution.
        total = 0
        for k, v in word_suffix[token].items():
            total += int(v)
        suf_entropy = 0
        for k, v in word_suffix[token].items():
            suf_entropy += -(int(v) / total) * math.log(int(v) / total)
        # Map each candidate token to its [left, right] boundary entropies.
        word_pre_suf_entropy[token] = [pre_entropy, suf_entropy]
    return word_pre_suf_entropy


import read_data

# Read the corpus and run one phrase-detection pass; train_model here
# returns both the phrased sentences and the set of new phrase words.
book_sentences = read_data.get_sentences(
    'c://users//stifler//desktop//data.txt')
out_sentences, new_words = word2phrase.train_model(book_sentences,
                                                   threshold=100)
# Show every newly discovered phrase word.
for phrase_word in new_words:
    print(phrase_word)
# Compute left/right boundary entropies for each candidate and list them.
word_pre_suf_entropy = info_entropy(out_sentences, new_words)
for token in word_pre_suf_entropy:
    print(token)