def test_token_aligner_project_single_token_index():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens(1)
    m_expected = np.array([1, 2])
    assert (m == m_expected).all()

def test_token_aligner_project_to_empty_target_token_sequence():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = []
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1, 3])
    m_expected = np.array([])
    assert (m == m_expected).all()

def test_token_aligner_project_to_mismatched_token_sequence():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["qrs", "tuv", "wxy", "z"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1])
    m_expected = np.array([])
    assert (m == m_expected).all()

def test_token_aligner_project_span():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_span(1, 2)
    m_expected = np.array([1, 3])
    assert (m == m_expected).all()

def test_moses_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules",
        ".",
    ]
    tgt_token_index = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_token_aligner_project_span_last_token_range_is_end_exclusive():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_span(3, 4)
    m_expected = np.array([4, 5])
    assert (m == m_expected).all()

def test_token_aligner_project_multiple_token_indices():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1, 3])
    m_expected = np.array([1, 2, 4])
    assert (m == m_expected).all()

def test_project_span_covering_whole_sequence():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = ["Members", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir", "Ġhands"]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    assert (0, 8) == ta.project_span(0, 7)

def test_bytebpe_tok_idx_proj_4():
    src_tokens = ["What?"]
    tgt_tokens = ["What", "?"]
    tgt_token_index = [[0, 1]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_wpm_tok_idx_proj_1():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_token_index = [[0], [1], [2], [3], [4], [5], [6]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def _create_examples(self, qa_file_path, set_type):
    wiki_df = pd.read_csv(self.path_dict["wiki_dict"], sep="\t", names=["sent_id", "text"])
    wiki_dict = {row.sent_id: row.text for row in wiki_df.itertuples(index=False)}
    data_df = pd.read_csv(
        qa_file_path,
        sep="\t",
        header=None,
        names=[
            "sent_id",
            "target_ids",
            "worker_id",
            "qa_index",
            "qa_word",
            "question",
            "answer",
            "response1",
            "response2",
        ],
    )
    data_df["sent"] = data_df["sent_id"].apply(wiki_dict.get)
    examples = []
    ptb_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
    for i, row in enumerate(data_df.itertuples(index=False)):
        # Answer indices are a space-delimited list of token indices.
        # We simply take the min/max of the indices.
        answer_idxs = list(map(int, row.answer.split()))
        answer_token_start, answer_token_end = min(answer_idxs), max(answer_idxs)
        passage_ptb_tokens = row.sent.split()
        passage_space_tokens = ptb_detokenizer.detokenize(
            passage_ptb_tokens, convert_parentheses=True
        ).split()
        passage_space_str = " ".join(passage_space_tokens)
        token_aligner = TokenAligner(source=passage_ptb_tokens, target=passage_space_tokens)
        answer_char_span = token_aligner.project_token_to_char_span(
            answer_token_start, answer_token_end, inclusive=True
        )
        answer_str = passage_space_str[answer_char_span[0] : answer_char_span[1] + 1]
        examples.append(
            span_pred_template.Example(
                guid="%s-%s" % (set_type, i),
                passage=passage_space_str,
                question=row.question,
                answer=answer_str,
                answer_char_span=answer_char_span,
            )
        )
    return examples

def test_project_invalid_span():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = ["Members", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir", "Ġhands"]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    with pytest.raises(ValueError):
        ta.project_span(0, 0)

def test_sentencepiece_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "▁Mr",
        ".",
        "▁I",
        "m",
        "mel",
        "t",
        "▁chose",
        "▁to",
        "▁focus",
        "▁on",
        "▁the",
        "▁in",
        "comp",
        "re",
        "hen",
        "s",
        "ibility",
        "▁of",
        "▁accounting",
        "▁rules",
        ".",
    ]
    tgt_token_index = [
        [0, 1],
        [2, 3, 4, 5],
        [6],
        [7],
        [8],
        [9],
        [10],
        [11, 12, 13, 14, 15, 16],
        [17],
        [18],
        [19, 20],
    ]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_moses_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = ["I", "look", "at", "Sarah", "'s", "dog", ".", "It", "was", "cute", ".", "!"]
    tgt_token_index = [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_wpm_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr",
        ".",
        "I",
        "##mme",
        "##lt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "in",
        "##com",
        "##p",
        "##re",
        "##hen",
        "##si",
        "##bility",
        "of",
        "accounting",
        "rules",
        ".",
    ]
    tgt_token_index = [
        [0, 1],
        [2, 3, 4],
        [5],
        [6],
        [7],
        [8],
        [9],
        [10, 11, 12, 13, 14, 15, 16],
        [17],
        [18],
        [19, 20],
    ]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_wpm_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = ["I", "look", "at", "Sarah", "'", "s", "dog", ".", "It", "was", "cute", ".", "!"]
    tgt_token_index = [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9], [10, 11, 12]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def tokenize(self, tokenizer):
    passage_tokens = tokenizer.tokenize(self.passage)
    token_aligner = TokenAligner(source=self.passage, target=passage_tokens)
    answer_token_span = token_aligner.project_char_to_token_span(
        self.answer_char_span[0], self.answer_char_span[1], inclusive=True
    )
    return TokenizedExample(
        guid=self.guid,
        passage=passage_tokens,
        question=tokenizer.tokenize(self.question),
        answer_str=self.answer,
        passage_str=self.passage,
        answer_token_span=answer_token_span,
        token_idx_to_char_idx_map=token_aligner.source_char_idx_to_target_token_idx.T,
    )

def test_bpe_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = [
        "i</w>",
        "look</w>",
        "at</w>",
        "sarah</w>",
        "'s</w>",
        "dog</w>",
        ".</w>",
        "it</w>",
        "was</w>",
        "cute</w>",
        ".</w>",
        "!</w>",
    ]
    tgt_token_index = [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_bpe_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "mr.</w>",
        "im",
        "melt</w>",
        "chose</w>",
        "to</w>",
        "focus</w>",
        "on</w>",
        "the</w>",
        "in",
        "comprehen",
        "si",
        "bility</w>",
        "of</w>",
        "accounting</w>",
        "rules</w>",
        ".</w>",
    ]
    tgt_token_index = [[0], [1, 2], [3], [4], [5], [6], [7], [8, 9, 10, 11], [12], [13], [14, 15]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_bytebpe_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr",
        ".",
        "ĠImm",
        "elt",
        "Ġchose",
        "Ġto",
        "Ġfocus",
        "Ġon",
        "Ġthe",
        "Ġincomp",
        "rehens",
        "ibility",
        "Ġof",
        "Ġaccounting",
        "Ġrules",
        ".",
    ]
    tgt_token_index = [[0, 1], [2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()

def test_private_project_token_span():
    # Alignment matrix: identity over 5 tokens, except source tokens 0 and 3,
    # which align to nothing.
    mat = np.eye(5, dtype=int)
    mat[0][0] = 0
    mat[3][3] = 0
    assert TokenAligner._project_span(mat, 1, 3, inclusive=True) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 3, inclusive=False) == (1, 3)
    assert TokenAligner._project_span(mat, 1, 2, inclusive=True) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 2, inclusive=False) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 4, inclusive=True) == (1, 4)
    assert TokenAligner._project_span(mat, 1, 4, inclusive=False) == (1, 3)

def _create_examples(self, file_path, set_type):
    with gzip.open(file_path) as f:
        lines = f.read().splitlines()
    examples = []
    ptb_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
    for line in lines:
        datum = json.loads(line)
        # Restructure the raw QA-SRL entry: for each verb, keep only valid
        # answer judgments, recording both the answer tokens and the
        # (inclusive) token span of each answer.
        datum = {
            "sentence_tokens": datum["sentenceTokens"],
            "entries": [
                {
                    "verb": verb_entry["verbInflectedForms"]["stem"],
                    "verb_idx": verb_idx,
                    "questions": {
                        question: [
                            [
                                {
                                    "tokens": datum["sentenceTokens"][span[0] : span[1]],
                                    "span": (span[0], span[1] - 1),
                                }
                                for span in answer_judgment["spans"]
                            ]
                            for answer_judgment in q_data["answerJudgments"]
                            if answer_judgment["isValid"]
                        ]
                        for question, q_data in verb_entry["questionLabels"].items()
                    },
                }
                for verb_idx, verb_entry in datum["verbEntries"].items()
            ],
        }
        passage_ptb_tokens = datum["sentence_tokens"]
        passage_space_tokens = ptb_detokenizer.detokenize(
            passage_ptb_tokens, convert_parentheses=True
        ).split()
        passage_space_str = " ".join(passage_space_tokens)
        token_aligner = TokenAligner(source=passage_ptb_tokens, target=passage_space_tokens)
        for entry in datum["entries"]:
            for question, answer_list in entry["questions"].items():
                for answer in answer_list:
                    for answer_span in answer:
                        try:
                            answer_char_span = token_aligner.project_token_to_char_span(
                                answer_span["span"][0], answer_span["span"][1], inclusive=True
                            )
                        except ValueError:
                            continue
                        answer_str = passage_space_str[
                            answer_char_span[0] : answer_char_span[1] + 1
                        ]
                        examples.append(
                            span_pred_template.Example(
                                guid="%s-%s" % (set_type, len(examples)),
                                passage=passage_space_str,
                                question=question,
                                answer=answer_str,
                                answer_char_span=answer_char_span,
                            )
                        )
    return examples