# Example no. 1
def dump(**kwargs):
    """Analyze one file with both language models and print a per-token report.

    For every token (past the warm-up prefix) this prints the surrounding
    context, the top-5 consensus predictions from the forwards and backwards
    models, and — when the actual token is not in the top 5 — the actual
    token.  Afterwards it prints summary statistics (MRR, lowest rank,
    top-1 accuracy) and the five positions where the models agreed least.

    Keyword arguments are forwarded verbatim to ``common_args``.
    """
    common = common_args(**kwargs)

    t = Terminal()
    # Templates are filled via format_map(locals()), so they reference the
    # local names bound in the loop below (prob, color, text, actual_text, t).
    ranking_line = "   {prob:6.2f}% → {color}{text}{t.normal}"
    actual_line = "{t.red}Actual{t.normal}: {t.bold}{actual_text}{t.normal}"

    sent_forwards = Sentences(common.file_vector,
                              size=SENTENCE_LENGTH,
                              backwards=False)
    sent_backwards = Sentences(common.file_vector,
                               size=SENTENCE_LENGTH,
                               backwards=True)

    least_agreement = []
    forwards_predictions = []
    backwards_predictions = []
    ranks = []

    contexts = enumerate(zip(chop_prefix(common.tokens, PREFIX_LENGTH),
                             sent_forwards, chop_prefix(sent_backwards)))

    # Note, the index is offset from the true start; i.e., when
    # index == 0, the true index is SENTENCE_LENGTH
    for index, (token, (prefix, x1), (suffix, x2)) in contexts:
        # Both sentence streams must agree on which token is being predicted.
        assert x1 == x2
        actual = x1
        print(unvocabularize(prefix[-5:]),
              t.bold_underline(token.value),
              unvocabularize(suffix[:5]))

        prefix_pred = common.forwards_model.predict(prefix)
        suffix_pred = common.backwards_model.predict(suffix)

        # Combine both models' distributions (harmonic mean).
        mean = consensus(suffix_pred, prefix_pred)

        forwards_predictions.append(index_of_max(prefix_pred))
        backwards_predictions.append(index_of_max(suffix_pred))

        # paired_rankings: (token_id, probability) pairs, best first.
        paired_rankings = rank(mean)
        ranked_vocab = list(tuple(zip(*paired_rankings))[0])
        top_5 = paired_rankings[:5]
        top_5_words = ranked_vocab[:5]

        for token_id, weight in top_5:
            color = t.green if token_id == actual else ''
            text = vocabulary.to_text(token_id)
            prob = weight * 100.0
            print(ranking_line.format_map(locals()))

        # Rank of the true token (1-based) in the consensus ordering.
        ranks.append(ranked_vocab.index(actual) + 1)
        # Probability of the top consensus candidate; a low value means the
        # two models agree least confidently at this position.
        _, top_prob = paired_rankings[0]
        least_agreement.append(Agreement(top_prob, index))

        if actual not in top_5_words:
            actual_text = vocabulary.to_text(actual)
            print(actual_line.format_map(locals()))

        print()

    # BUGFIX: this guard used to live inside the loop, right after
    # ranks.append(...), where it could never fire.  Checked here instead,
    # it catches files that produced no contexts at all and avoids a
    # ZeroDivisionError in the statistics below.
    if not ranks:
        print(t.red("Could not analyze file!"), file=sys.stderr)
        return

    print("MRR: ", mean_reciprocal_rank(ranks))
    print("Lowest rank:", max(ranks))
    print("Time at #1: {:.2f}%".format(
          100 * sum(1 for rank in ranks if rank == 1) / len(ranks)))
    print()

    forwards_text = [vocabulary.to_text(num) for num in forwards_predictions]
    backwards_text = [vocabulary.to_text(num) for num in backwards_predictions]

    least_agreement.sort()
    # Compensate for offset indices
    file_vector = common.file_vector[SENTENCE_LENGTH:]
    tokens_text = [tok.value for tok in common.tokens[PREFIX_LENGTH:]]
    for disagreement in least_agreement[:5]:
        print(disagreement.probability)
        prefix = ' '.join(disagreement.prefix(tokens_text))
        suffix = ' '.join(disagreement.suffix(tokens_text))

        # NOTE(review): `list @ Agreement` relies on Agreement defining
        # __rmatmul__ (presumably "element of the list at this agreement's
        # index") — confirm against the Agreement class definition.
        print("   ", prefix, t.yellow(forwards_text @ disagreement), suffix)
        print("   ", prefix, t.underline(tokens_text @ disagreement), suffix)
        print("   ", prefix, t.blue(backwards_text @ disagreement), suffix)
        print()