Exemplo n.º 1
0
def train_bigram_words(bi_words_count, out_filename):
	"""Group word-bigram counts by first word and encoded second word, and save them as JSON.

	The written structure is ``{first_word: {code: [[second_word, freq], ...]}}``
	where ``code`` is whatever ``word_to_number(second_word)`` returns.

	Args:
		bi_words_count: Mapping from a bigram string (words separated by
			whitespace; anything after the second word is ignored) to its
			frequency.
		out_filename: Path of the JSON file to write; it is opened in 'w' mode.

	Returns:
		Always 1.

	Raises:
		IndexError: If a key contains fewer than two words.
	"""
	bigram_words_trained = {}

	# The mapping is not modified in this function, so its size is taken once
	# instead of being recomputed for every progress line.
	entries = bi_words_count.items()
	total = len(entries)

	# Counting from 1 lets the last progress line read "total/total".
	for done, (bigram, freq) in enumerate(entries, start=1):
		# "\r" keeps the progress on a single terminal line.
		print(str(done) + "/" + str(total), end="\r")

		# split() without arguments already collapses runs of whitespace.
		words = bigram.split()
		first_word = words[0]
		second_word = words[1]
		second_number = word_to_number(second_word)

		# Create the nested containers on first sight, then append.
		possibilities = bigram_words_trained.setdefault(first_word, {})
		possibilities.setdefault(second_number, []).append((second_word, freq))

	with open(out_filename, 'w') as fp:
		json.dump(bigram_words_trained, fp)

	return 1
Exemplo n.º 2
0
def train_bigram_letters(bi_letters_count, out_filename):
	"""Bucket letter-bigram counts by an encoded key and dump the buckets as JSON.

	For every bigram the bucket key is its first element concatenated with
	``word_to_number`` applied to its second element (so that helper is
	presumably returning a string -- confirm against its definition). Each
	bucket lists the ``(bigram, freq)`` pairs that share the key.

	Args:
		bi_letters_count: Mapping from a bigram (indexable, at least two
			elements) to its frequency.
		out_filename: Path of the JSON file to write; it is opened in 'w' mode.

	Returns:
		None.
	"""
	buckets = {}
	entries = bi_letters_count.items()
	total = len(entries)

	for position, (bigram, freq) in enumerate(entries, start=1):
		# "\r" keeps the progress counter on one terminal line.
		print(str(position) + "/" + str(total), end="\r")

		code = bigram[0] + word_to_number(bigram[1])
		buckets.setdefault(code, []).append((bigram, freq))

	with open(out_filename, 'w') as outfile:
		json.dump(buckets, outfile)
Exemplo n.º 3
0
def train_bigram_words(bi_words_count, out_filename):
    """Index word-bigram frequencies for lookup by first word, then write them as JSON.

    Each bigram's second word is filed under its first word and under the
    value ``word_to_number`` returns for that second word, giving
    ``{first_word: {code: [[second_word, freq], ...]}}`` in the output file.

    Args:
        bi_words_count: Mapping from a bigram string (whitespace-separated
            words; only the first two are used) to its frequency.
        out_filename: Path of the JSON file to write; it is opened in 'w' mode.

    Returns:
        Always 1.

    Raises:
        IndexError: If a key contains fewer than two words.
    """
    bigram_words_trained = {}

    # Size is loop-invariant here, so compute it once for the progress display.
    entries = bi_words_count.items()
    total = len(entries)

    # Start at 1 so the progress display finishes on "total/total".
    for done, (bigram, freq) in enumerate(entries, start=1):
        # "\r" makes each update overwrite the previous one on a terminal.
        print(str(done) + "/" + str(total), end="\r")

        # A bare split() normalises any whitespace between the words.
        words = bigram.split()
        first_word = words[0]
        second_word = words[1]
        second_number = word_to_number(second_word)

        # setdefault creates the missing level(s) and returns the container.
        by_code = bigram_words_trained.setdefault(first_word, {})
        by_code.setdefault(second_number, []).append((second_word, freq))

    with open(out_filename, 'w') as fp:
        json.dump(bigram_words_trained, fp)

    return 1
Exemplo n.º 4
0
def train_bigram_letters(bi_letters_count, out_filename):
    """Group letter bigrams under ``first + word_to_number(second)`` and save as JSON.

    The output maps each such code to the list of ``(bigram, freq)`` pairs
    that produced it. The concatenation implies ``word_to_number`` returns a
    string here -- NOTE(review): confirm against the helper.

    Args:
        bi_letters_count: Mapping from a bigram (indexable, at least two
            elements) to its frequency.
        out_filename: Path of the JSON file to write; it is opened in 'w' mode.

    Returns:
        None.
    """
    table = {}
    items = bi_letters_count.items()
    how_many = len(items)

    for step, (letters, freq) in enumerate(items, start=1):
        # Progress indicator; "\r" rewinds to the start of the line.
        print(str(step) + "/" + str(how_many), end="\r")

        leading = letters[0]
        code = leading + word_to_number(letters[1])

        if code not in table:
            table[code] = []
        table[code].append((letters, freq))

    with open(out_filename, 'w') as outfile:
        json.dump(table, outfile)

    return None
Exemplo n.º 5
0
def train_unigram_words(words_count, out_filename):
	"""Group word counts by their ``functions.word_to_number`` code and save them as JSON.

	The written structure is ``{code: [[word, freq], ...]}``, where ``code`` is
	whatever ``functions.word_to_number(word)`` returns.

	Args:
		words_count: Mapping from a word to its frequency.
		out_filename: Path of the JSON file to write; it is opened in 'w' mode.

	Returns:
		None.
	"""
	unigram_words_trained = {}

	# The mapping is not modified in this function, so its size is taken once
	# instead of being recomputed for every progress line.
	entries = words_count.items()
	total = len(entries)

	# Counting from 1 lets the last progress line read "total/total".
	for done, (word, freq) in enumerate(entries, start=1):
		# "\r" keeps the progress on a single terminal line.
		print(str(done) + "/" + str(total), end="\r")

		number = functions.word_to_number(word)
		# Create the bucket on first sight, then append.
		unigram_words_trained.setdefault(number, []).append((word, freq))

	with open(out_filename, 'w') as fp:
		json.dump(unigram_words_trained, fp)

	return
Exemplo n.º 6
0
def train_unigram_words(words_count, out_filename):
    """Bucket words by the code ``functions.word_to_number`` gives them, then write JSON.

    Every bucket lists the ``(word, freq)`` pairs sharing one code, so the
    output file has the shape ``{code: [[word, freq], ...]}``.

    Args:
        words_count: Mapping from a word to its frequency.
        out_filename: Path of the JSON file to write; it is opened in 'w' mode.

    Returns:
        None.
    """
    unigram_words_trained = {}

    # Size is loop-invariant here, so compute it once for the progress display.
    entries = words_count.items()
    total = len(entries)

    # Start at 1 so the progress display finishes on "total/total".
    for done, (word, freq) in enumerate(entries, start=1):
        # "\r" makes each update overwrite the previous one on a terminal.
        print(str(done) + "/" + str(total), end="\r")

        number = functions.word_to_number(word)
        # setdefault creates the bucket if needed and returns it.
        unigram_words_trained.setdefault(number, []).append((word, freq))

    with open(out_filename, 'w') as fp:
        json.dump(unigram_words_trained, fp)

    return