def extract_pairs(sess, source_sentences, target_sentences,
                  source_sentences_ids, target_sentences_ids,
                  probs_op, placeholders):
    """Extract aligned sentence pairs from one source/target article pair.

    Scores every candidate in the cross product of source and target
    sentences with the model, then keeps candidates whose probability
    clears FLAGS.decision_threshold. When FLAGS.use_greedy is set, a
    greedy one-to-one alignment is enforced: each source and each target
    sentence is used at most once, highest-scoring candidates first.

    Returns:
        A list of (source sentence, target sentence, probability) tuples.
    """
    # Candidate index pairs in product order; `data` mirrors this order,
    # so a score at position k corresponds to index_pairs[k].
    index_pairs = list(product(range(len(source_sentences)),
                               range(len(target_sentences))))
    data = [(source_sentences_ids[i], target_sentences_ids[j], 1.0)
            for i, j in index_pairs]
    iterator = utils.TestingIterator(np.array(data, dtype=object))
    scores = inference(sess, iterator, probs_op, placeholders)

    # Rank candidates by descending score.
    ranked = sorted(((s, k) for k, s in enumerate(scores)), reverse=True)

    aligned_src = set()
    aligned_trg = set()
    sentence_pairs = []
    for score, k in ranked:
        i, j = index_pairs[k]
        if score < FLAGS.decision_threshold or i in aligned_src or j in aligned_trg:
            continue
        if FLAGS.use_greedy:
            # Mark both sides as consumed so no later (lower-scoring)
            # candidate can reuse them.
            aligned_src.add(i)
            aligned_trg.add(j)
        sentence_pairs.append((source_sentences[i], target_sentences[j], score))
    return sentence_pairs
def extract_pairs(sess, source_sentences, target_sentences,
                  source_sentences_ids, target_sentences_ids,
                  probs_op, placeholders):
    """Extract sentence pairs from a pair of articles in source and target languages.

    Only keep sentence pairs that occur in a diagonal block of at least
    `min_block_size` consecutive high-scoring pairs, i.e. (i, j),
    (i+1, j+1), ... all scoring above FLAGS.decision_threshold.

    NOTE(review): this redefines `extract_pairs`; if both variants live in
    the same module, this definition shadows the earlier greedy one.

    Returns:
        A list of (source sentence, target sentence, probability) tuples
        (in arbitrary order, since duplicates are removed via a set).
    """
    # Candidate index pairs in product order; `data` mirrors this order,
    # so a score at position k corresponds to pairs[k].
    pairs = list(product(range(len(source_sentences)),
                         range(len(target_sentences))))
    data = [(source_sentences_ids[i], target_sentences_ids[j], 1.0)
            for i, j in pairs]
    data_iterator = utils.TestingIterator(np.array(data, dtype=object))
    y_score = inference(sess, data_iterator, probs_op, placeholders)
    y_score = [(score, k) for k, score in enumerate(y_score)]

    min_block_size = 2
    # Pairs already attributed to a previously grown block (every block
    # member except its head), so we do not re-grow the same block from
    # each of its members.
    seen_in_block = set()
    kept = set()

    for score, k in y_score:
        if score < FLAGS.decision_threshold:
            continue
        i, j = pairs[k]
        if (i, j) in seen_in_block:
            continue
        # Grow the diagonal block starting at (i, j): scan all candidates
        # in their original (product) order, extending the block each time
        # the current diagonal successor is found above threshold. The
        # scan order guarantees (i+1, j+1) appears after (i, j).
        block = []
        for score2, k2 in y_score:
            if score2 < FLAGS.decision_threshold:
                continue
            i2, j2 = pairs[k2]
            if i2 == i and j2 == j:
                block.append((i2, j2, score2))
                i += 1
                j += 1
                if len(block) > 1:
                    seen_in_block.add((i2, j2))
        if len(block) >= min_block_size:
            for bi, bj, bscore in block:
                kept.add((source_sentences[bi], target_sentences[bj], bscore))

    return list(kept)
def evaluate(sess, source_sentences, target_sentences, references,
             source_sentences_ids, target_sentences_ids, probs_op, placeholders):
    """Evaluate BiRNN at the decision threshold maximizing F1 on the
    precision-recall curve.

    Args:
        references: collection of (i, j) index pairs that are gold-aligned;
            used to build the binary labels for each candidate pair.

    Prints the best threshold and the precision/recall/F1 (in percent)
    achieved at it; returns nothing.
    """
    # Candidate index pairs in product order; `data` mirrors this order.
    pairs = list(product(range(len(source_sentences)),
                         range(len(target_sentences))))
    data = [(source_sentences_ids[i], target_sentences_ids[j],
             1.0 if (i, j) in references else 0.0)
            for i, j in pairs]
    data_iterator = utils.TestingIterator(np.array(data, dtype=object))
    y_score = inference(sess, data_iterator, probs_op, placeholders)
    y_true = data_iterator.data[:, 2].astype(int)

    p, r, t = precision_recall_curve(y_true, y_score, pos_label=1)
    f1 = utils.f1_score(p, r)
    # precision_recall_curve returns one more precision/recall point than
    # thresholds (the final p=1, r=0 point has no threshold); restrict the
    # argmax so t[i] is always a valid index.
    i = int(np.argmax(f1[:len(t)]))
    # Bug fix: the format arguments were previously scrambled (precision
    # was printed as the threshold and the threshold as F1).
    print("Evaluation metrics at decision threshold = {:.4f}\n"
          "Precision = {:.2f}, Recall = {:.2f}, F1 = {:.2f}\n"
          "-------------------------------------------------".format(
              t[i], 100 * p[i], 100 * r[i], 100 * f1[i]))