Exemplo n.º 1
0
def main():
    """Score question sentences with an n-gram model and save the results.

    Command-line arguments (exactly two, in this order):
        1. path to the questions file
        2. path to the answers file

    Steps, in order:
        * read the training corpus from 'train_data' and train an NGram model;
        * parse the answers file and use its first 520 sentences as the
          development set passed to ``optimize_lambdas``;
        * parse the questions file, compute sentence probabilities for it and
          ask the model for perplexities using the optimized lambdas;
        * write one ``<sentence>\\t<perplexity>`` line per result to
          'output.txt' in the current working directory.

    Raises:
        ValueError: if the number of command-line arguments is not two
            (raised by the tuple unpacking of ``sys.argv[1:]``).
    """
    questions_path, answers_path = sys.argv[1:]

    boundary_tokens = ('<s>', '</s>')
    dev_set_size = 520

    print("Reading Corpus:")
    corpus = read_corpus('train_data', disp=True)

    print('\nTraining on Corpus')
    model = NGram.train_model(corpus, disp=True)

    # Answers: read the raw text first, then parse it into sentences.
    with open(answers_path, 'r') as handle:
        answers_text = handle.read()
    answers = get_sentences(untokenized_text=answers_text,
                            is_tokenized=True,
                            token_start_end=boundary_tokens)
    held_out = answers[:dev_set_size]

    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(held_out, disp=True)
    # NOTE(review): optimize_lambdas presumably relies on the probabilities
    # just computed on the model -- confirm against its definition.
    lambdas = optimize_lambdas(model)

    # Questions: same parsing settings as the answers file.
    with open(questions_path, 'r') as handle:
        questions_text = handle.read()
    questions = get_sentences(untokenized_text=questions_text,
                              is_tokenized=True,
                              token_start_end=boundary_tokens)

    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    # Only the second element of the returned pair is used here.
    _unused, per_sentence = model.perplexity(lambdas=lambdas)

    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as sink:
        for idx, score in enumerate(per_sentence):
            text = ' '.join(questions[idx])
            # Collapse the '<s0> <s1>' marker pair back into a single '<s>'.
            text = text.replace('<s0> <s1>', '<s>')
            sink.write('{}\t{}\n'.format(text, score))
Exemplo n.º 2
0
def main():
    """Train an n-gram model, tune it on answers, and score the questions.

    Reads two positional command-line arguments: the questions file path
    followed by the answers file path. The model is trained on the corpus
    read from 'train_data'; the first 520 parsed answer sentences are fed to
    the model before ``optimize_lambdas`` is called; the parsed questions are
    then scored and each one is written, tab-separated from its perplexity,
    as a line of 'output.txt'.

    Raises:
        ValueError: when ``sys.argv[1:]`` does not hold exactly two items.
    """
    def load_sentences(path):
        # Parse a whole file into sentences delimited by '<s>' / '</s>'.
        with open(path, 'r') as source:
            return get_sentences(untokenized_text=source.read(),
                                 is_tokenized=True,
                                 token_start_end=('<s>', '</s>'))

    arguments = sys.argv[1:]
    questions_path, answers_path = arguments

    print("Reading Corpus:")
    training_data = read_corpus('train_data', disp=True)

    print('\nTraining on Corpus')
    model = NGram.train_model(training_data, disp=True)

    # Development set: the leading 520 sentences of the answers file.
    dev_sentences = load_sentences(answers_path)[:520]

    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)

    questions = load_sentences(questions_path)

    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    # The first element of the returned pair is not needed here.
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)

    print('Writing sentences and perplexities to file')
    row_template = '{}\t{}\n'
    with open('output.txt', 'w') as report:
        for position, value in enumerate(sentences_perplexity):
            joined = ' '.join(questions[position])
            # '<s0> <s1>' is rewritten as '<s>' in the emitted sentence.
            restored = joined.replace('<s0> <s1>', '<s>')
            report.write(row_template.format(restored, value))