# Example #1
# (stray "0" below was an extraction artifact; kept as a comment)
def get_nearest_neighbors(w, embeddings, k=1000):
    """
    Returns, for every transition in every pattern, the index of the word
    with the highest score for that transition.

    Only the first `k` words of the vocab are considered (makes sense,
    assuming they're sorted by descending frequency).
    """
    vocab_slice = embeddings[:k, :]
    transition_scores = torch.mm(w, vocab_slice)
    return argmax(transition_scores)
def interpret_documents(model, batch_size, dev_data, dev_text, ofile,
                        max_doc_len):
    """
    Writes an interpretation report for every document in `dev_data` to
    `ofile`: the model's prediction, leave-one-out pattern deltas (how much
    each pattern contributed to the decision), the top raw pattern scores,
    and the top-scoring text span for each influential pattern.

    Parameters:
        model: model exposing `embeddings`, `to_cuda`, `forward`, and `mlp`.
        batch_size: number of documents per forward pass.
        dev_data: list of (doc, label) pairs.
        dev_text: tokenized text of each doc, parallel to `dev_data`.
        ofile: path of the output report file (overwritten).
        max_doc_len: max document length for the span extractor.
    """
    j = 0  # global document index across batches (indexes dev_data/dev_text)
    with open(ofile, "w") as ofh:
        for chunk in chunked(dev_data, batch_size):
            batch = Batch([x for x, y in chunk], model.embeddings,
                          model.to_cuda)
            res, scores = model.forward(batch, 1)

            output = softmax(res).data

            predictions = [int(x) for x in argmax(output)]

            num_patts = scores.size()[1]

            # diffs[p, d]: change in the NON-predicted class probability of
            # document d when pattern p's score is zeroed out.
            diffs = np.zeros((num_patts, batch.size()))

            # Traversing all patterns.
            for i in range(num_patts):
                # Copying scores data to numpy array.
                scores_data = np.array(scores.data.numpy(), copy=True)

                # Zeroing out pattern number i across batch
                scores_data[:, i] = 0

                # Re-running only the MLP with zeroed-out scores.
                forwarded = softmax(
                    model.mlp.forward(Variable(
                        torch.FloatTensor(scores_data)))).data.numpy()

                # Computing difference between forwarded scores and original
                # scores. NOTE(review): `1 - predictions[k]` assumes binary
                # classification — confirm before using with more classes.
                for k in range(batch.size()):
                    diffs[i, k] = forwarded[k, 1 - predictions[k]] - output[
                        k, 1 - predictions[k]]

            # Now, traversing documents in batch
            for i in range(batch.size()):
                # Document string (the str(...)[2:-1] strips the b'...'
                # wrapper of the encoded bytes' repr).
                text_str = str(" ".join(dev_text[j]).encode('utf-8'))[2:-1]

                # Top ten patterns with largest differences between
                # leave-one-out score and original score.
                top_ten_deltas = sorted(enumerate(diffs[:, i]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]
                top_ten_neg_deltas = sorted(enumerate(diffs[:, i]),
                                            key=lambda x: x[1])[:10]
                # Top ten patterns with largest overall score (regardless of
                # classification).
                top_ten_scores = sorted(enumerate(scores.data.numpy()[i, :]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]

                top_scoring_spans = get_top_scoring_spans_for_doc(
                    model, dev_data[j], max_doc_len)

                # Printing out everything.
                ofh.write(
                    "{}   {}   {} All in, predicted: {:>2,.3f}   All in, not-predicted: {:>2,.3f}    Leave one out: +res: {} -res: {} Patt scores: {}\n"
                    .format(
                        dev_data[j][1], predictions[i], text_str,
                        output[i, predictions[i]],
                        output[i, 1 - predictions[i]], " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_deltas
                        ]), " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_neg_deltas
                        ]), " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_scores
                        ])))
                ofh.write("Top ten deltas:\n")
                _write_spans(ofh, top_ten_deltas, top_scoring_spans,
                             dev_text[j])

                ofh.write("Top ten negative deltas:\n")
                _write_spans(ofh, top_ten_neg_deltas, top_scoring_spans,
                             dev_text[j])
                j += 1


def _write_spans(ofh, deltas, top_scoring_spans, doc_text):
    """Writes one line per pattern: its index and its top-scoring span."""
    for patt_idx, _ in deltas:
        span = top_scoring_spans[patt_idx].display(doc_text)
        ofh.write(
            str(int(patt_idx)) + " " + str(span.encode('utf-8'))[2:-1] +
            "\n")