def reorder_numbered_placeholders(input1, input2, by_group=True):
    """Renumber placeholders suffixed with '_$DIGITS' in a sentence pair.

    Tokens recognised by is_numbered_placeholder are treated as argument
    placeholders and reissued sequential ids in order of first appearance
    in input1. With by_group=True each placeholder group keeps its own
    counter; otherwise a single shared counter numbers them all.

    Returns:
        (renumbered input1, renumbered input2, new->old token lookup).
    """
    toks1 = process_sentence(input1)
    toks2 = process_sentence(input2)

    renames = {}    # old placeholder token -> renumbered token
    per_group = {}  # group template -> next id (grouped numbering)
    shared_id = 0   # counter for non-grouped numbering

    # First pass: decide the new name for every distinct placeholder in lang1.
    for token in toks1:
        template = is_numbered_placeholder(token)
        if template is None or token in renames:
            continue
        if by_group:
            fresh = per_group.setdefault(template, 0)
            per_group[template] = fresh + 1
        else:
            fresh = shared_id
        renames[token] = template.format(fresh)
        shared_id += 1

    # Second pass: apply the replacements to both token streams.
    new_toks1 = [renames.get(t, t) for t in toks1]
    new_toks2 = [subsfirst(t, renames) for t in toks2]

    reverse_lookup = {new: old for old, new in renames.items()}
    return " ".join(new_toks1), " ".join(new_toks2), reverse_lookup
def normalize_sal_entities(input1, input2, ent_prefix="id__"):
    """Reissue sequential ids to entity and number tokens in a sentence pair.

    Tokens accepted by is_entity are renamed '<ent_prefix><n>' and tokens
    accepted by is_number are renamed 'number__<n>'; both share a single
    counter ordered by first appearance in input1.

    Returns:
        (normalized input1, normalized input2, new->old token lookup).
    """
    toks1 = process_sentence(input1)
    toks2 = process_sentence(input2)

    renames = {}  # old token -> freshly numbered token
    next_id = 0

    # Scan lang1 once, assigning a fresh id to each distinct entity/number.
    for token in toks1:
        if token in renames:
            continue
        if is_entity(token):
            renames[token] = "{}{}".format(ent_prefix, next_id)
            next_id += 1
        elif is_number(token):
            renames[token] = "number__{}".format(next_id)
            next_id += 1

    # Apply the same substitutions to both token streams.
    new_toks1 = [renames.get(t, t) for t in toks1]
    new_toks2 = [renames.get(t, t) for t in toks2]

    reverse_lookup = {new: old for old, new in renames.items()}
    return " ".join(new_toks1), " ".join(new_toks2), reverse_lookup
def indexes_from_sentence(lang, sentence):
    """Convert *sentence* into vocabulary indices, mapping OOV words to UNK.

    When the module-level flag match_parens is truthy the raw index list is
    returned; otherwise EOS_token is appended.

    NOTE(review): a second indexes_from_sentence later in this file
    redefines this name and never appends EOS_token — confirm which
    definition is intended to win.
    """
    vocab = lang.word2index
    indices = [vocab.get(word, UNK_token) for word in process_sentence(sentence)]
    if not match_parens:
        indices = indices + [EOS_token]
    return indices
def evaluate_pairs(evaluate_fn, eval_pairs):
    """Run evaluate_fn over (lang1, lang2) pairs and score exact matches.

    Args:
        evaluate_fn: callable taking a lang1 string and returning
            (guessed_words, extra); only the guessed word list is used.
        eval_pairs: sequence of (lang1_str, lang2_str) pairs.

    Returns:
        (accuracy, report_str): exact-match accuracy (0.0 for an empty
        pair list, rather than raising ZeroDivisionError) and a per-pair
        text report in which incorrect guesses are prefixed with '*'.
    """
    num_correct = 0
    res_str = ""

    # Renamed parameter from 'str' so the builtin is not shadowed; the
    # helper mutates the accumulator and needs no return value.
    def emit(text):
        nonlocal res_str
        res_str += text
        res_str += "\n"

    print("Evaluating pairs")
    for i in tqdm(range(len(eval_pairs))):
        lang1_str, lang2_str = eval_pairs[i]
        emit("# {}".format(i))
        lang1_toks = process_sentence(lang1_str)
        lang2_toks = process_sentence(lang2_str)
        guessed_words, _ = evaluate_fn(lang1_str)
        guessed_sentence = ' '.join(guessed_words)
        is_correct = exact_match(lang2_toks, guessed_words)
        if is_correct:
            num_correct += 1
        emit('Lang (#={}):\t{}'.format(len(lang1_toks), lang1_str))
        emit('Gold (#={}):\t{}'.format(len(lang2_toks), lang2_str))
        # Wrong guesses carry a '*' prefix so they are easy to grep for.
        if is_correct:
            emit('Guess (#={}):\t{}'.format(len(guessed_words), guessed_sentence))
        else:
            emit('* Guess (#={}):\t{}'.format(len(guessed_words), guessed_sentence))
        emit('')

    # Guard against an empty pair list instead of dividing by zero.
    acc = num_correct / len(eval_pairs) if eval_pairs else 0.0
    emit("Exact acc = {:.5f}".format(acc))
    return acc, res_str
def show_attention(input_sentence, output_words, attentions):
    """Render an attention matrix as a heatmap with token tick labels.

    Args:
        input_sentence: source string, tokenized via process_sentence for
            the x-axis labels (with '<EOS>' appended).
        output_words: decoded tokens shown along the y-axis.
        attentions: 2-D attention weights (must support .numpy()).
    """
    figure = plt.figure()
    axis = figure.add_subplot(111)
    heatmap = axis.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Label the axes with the source/target tokens; the leading '' pads
    # the extra origin tick that matshow produces.
    source_labels = [''] + process_sentence(input_sentence) + ['<EOS>']
    axis.set_xticklabels(source_labels, rotation=90)
    axis.set_yticklabels([''] + output_words)

    # Force a major tick (and hence a label) at every cell.
    axis.xaxis.set_major_locator(ticker.MultipleLocator(1))
    axis.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
def indexes_from_sentence(lang, sentence):
    """Map each token of *sentence* to its vocabulary index (UNK for OOV).

    NOTE(review): this redefines the earlier indexes_from_sentence in this
    file (which can append EOS_token); being defined later, this version
    wins at import time — confirm that is intended.
    """
    vocab = lang.word2index
    return [vocab.get(word, UNK_token) for word in process_sentence(sentence)]