示例#1
0
def extract_pairs(sess, source_sentences, target_sentences,
                  source_sentences_ids, target_sentences_ids, probs_op,
                  placeholders):
    """Extract sentence pairs from a pair of articles in source and target languages.

    Scores every (source, target) sentence combination with the model, then
    walks the candidates in descending score order, keeping those above
    FLAGS.decision_threshold. When FLAGS.use_greedy is set, each source and
    target sentence is aligned at most once (greedy 1-to-1 alignment);
    otherwise many-to-many pairs are allowed.

    Returns a list of (source sentence, target sentence, probability score)
    tuples.
    """
    # All index combinations, computed once and reused both as the candidate
    # list and to build the model input (the original built the same product
    # twice).
    pairs = list(product(range(len(source_sentences)),
                         range(len(target_sentences))))

    # The trailing 1.0 is a dummy label; only the predicted probability from
    # probs_op is used downstream.
    data = [(source_sentences_ids[i], target_sentences_ids[j], 1.0)
            for i, j in pairs]

    data_iterator = utils.TestingIterator(np.array(data, dtype=object))

    y_score = inference(sess, data_iterator, probs_op, placeholders)
    # Pair each score with its candidate index and sort descending so the
    # greedy pass below always sees the most confident pair first.
    y_score = [(score, k) for k, score in enumerate(y_score)]
    y_score.sort(reverse=True)

    i_aligned = set()
    j_aligned = set()
    sentence_pairs = []
    for score, k in y_score:
        i, j = pairs[k]
        # Drop low-confidence pairs and, in greedy mode, sentences already used.
        if score < FLAGS.decision_threshold or i in i_aligned or j in j_aligned:
            continue
        if FLAGS.use_greedy:
            i_aligned.add(i)
            j_aligned.add(j)
        sentence_pairs.append(
            (source_sentences[i], target_sentences[j], score))
    return sentence_pairs
示例#2
0
def extract_pairs(sess, source_sentences, target_sentences,
                  source_sentences_ids, target_sentences_ids, probs_op,
                  placeholders):
    """Extract sentence pairs from a pair of articles in source and target languages.

    Only keep sentence pairs that occur in a "block": a run of diagonal pairs
    (i, j), (i+1, j+1), ... each scoring above FLAGS.decision_threshold, of
    length at least ``minsizeblock``. Emits verbose debug output via print.

    Returns a list of (source sentence, target sentence, probability score)
    tuples (in arbitrary order, since they are collected through a set).
    """
    pairs = [(i, j) for i, j in product(range(len(source_sentences)),
                                        range(len(target_sentences)))]

    data = [(source_sentences_ids[i], target_sentences_ids[j], 1.0)
            for i, j in product(range(len(source_sentences)),
                                range(len(target_sentences)))]

    data_iterator = utils.TestingIterator(np.array(data, dtype=object))

    y_score = inference(sess, data_iterator, probs_op, placeholders)
    y_score = [(score, k) for k, score in enumerate(y_score)]

    sentence_pairs = []

    Pairs = set()
    # NOTE(review): the docstring mentions MINBLOCKSIZE; it is hard-coded here.
    minsizeblock = 2
    # Pairs already consumed as the continuation of an earlier block. A set
    # gives O(1) membership tests inside the O(n^2) scan below (the original
    # used a list, making each test O(n)).
    in_block = set()
    for score, k in y_score:
        if score < FLAGS.decision_threshold:
            continue
        i, j = pairs[k]
        print("---", i, j)
        # Skip pairs that already belong to a previously found block.
        if (i, j) in in_block:
            print("in block")
            continue
        block = []
        # Walk all high-scoring candidates, collecting those that continue the
        # diagonal starting at (i, j). Note: i and j are advanced in place each
        # time the diagonal is extended, so the condition tracks (i+1, j+1),
        # (i+2, j+2), ... as the scan proceeds.
        for score2, k2 in y_score:
            if score2 < FLAGS.decision_threshold:
                continue
            i2, j2 = pairs[k2]
            if i2 == i and j2 == j:
                print(i2, j2)
                block.append((i2, j2, score2))
                i += 1
                j += 1
                # Only continuations (2nd element onward) are marked consumed;
                # the block's first element is never revisited by the outer
                # loop anyway.
                if (len(block) > 1):
                    in_block.add((i2, j2))
        if len(block) >= minsizeblock:
            print("--------------------------")
            for i, j, score in block:
                print(i, j)
                print(source_sentences[i])
                print(target_sentences[j])
                print(score)
                print("\n")
                Pairs.add((source_sentences[i], target_sentences[j], score))

    for srcsent, trgsent, score in Pairs:
        sentence_pairs.append((srcsent, trgsent, score))

    return sentence_pairs
示例#3
0
def evaluate(sess, source_sentences, target_sentences, references,
             source_sentences_ids, target_sentences_ids, probs_op,
             placeholders):
    """Evaluate BiRNN at the decision threshold value maximizing F1 on the
       precision-recall curve.

    Scores every (source, target) sentence combination, labels a pair 1 if
    its index pair appears in ``references`` and 0 otherwise, then prints
    precision, recall and F1 at the threshold with the best F1.
    """
    # Label each candidate pair: 1.0 if it is a reference alignment, else 0.0.
    data = [(source_sentences_ids[i], target_sentences_ids[j], 1.0) if
            (i, j) in references else
            (source_sentences_ids[i], target_sentences_ids[j], 0.0)
            for i, j in product(range(len(source_sentences)),
                                range(len(target_sentences)))]

    data_iterator = utils.TestingIterator(np.array(data, dtype=object))

    y_score = inference(sess, data_iterator, probs_op, placeholders)
    y_true = data_iterator.data[:, 2].astype(int)

    p, r, t = precision_recall_curve(y_true, y_score, pos_label=1)
    f1 = utils.f1_score(p, r)

    i = np.argmax(f1)
    # precision_recall_curve returns len(t) == len(p) - 1, so clamp the index
    # to avoid an IndexError when argmax lands on the curve's final point.
    threshold = t[min(i, len(t) - 1)]
    # BUG FIX: the original passed (p[i], 100*r[i], 100*f1[i], 100*t[i]),
    # shifting every value one slot: the threshold printed precision, the
    # precision printed recall, etc. Arguments now match the format string.
    print("Evaluation metrics at decision threshold = {:.4f}\n"
          "Precision = {:.2f}, Recall = {:.2f}, F1 = {:.2f}\n"
          "-------------------------------------------------".format(
              threshold, 100 * p[i], 100 * r[i], 100 * f1[i]))