def test_bytebpe(self):
    """Verify byte-level BPE (RoBERTa) retokenization and index/span projection.

    Checks three things against hand-computed targets:
    the produced subword tokens, the projection of source token indices,
    and the projection of source (start, end) spans.
    """
    # Expected byte-BPE tokens; "Ġ" marks a token preceded by a space.
    # NOTE(review): assumes self.text / self.token_index_src /
    # self.span_index_src are populated by a setUp defined elsewhere.
    self.tokens = [
        ["ĠMembers", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir", "Ġhands"],
        ["ĠI", "Ġlook", "Ġat", "ĠSarah", "'s", "Ġdog", ".", "ĠIt", "Ġwas", "Ġcute", ".", "!"],
        [
            "ĠMr",
            ".",
            "ĠImm",
            "elt",
            "Ġchose",
            "Ġto",
            "Ġfocus",
            "Ġon",
            "Ġthe",
            "Ġincomp",
            "rehens",
            "ibility",
            "Ġof",
            "Ġaccounting",
            "Ġrules",
            ".",
        ],
        ["ĠWhat", "?"],
    ]
    # Expected target indices for each source token (one list per source token).
    self.token_index_tgt = [
        [[0], [1], [2], [3], [4, 5], [6], [7]],
        [[0], [1], [2], [3, 4], [5], [6, 7], [8], [9, 10, 11]],
        [[0], [1, 2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
        [[0, 1]],
    ]
    # Expected projected (start, end) spans in target-token coordinates.
    self.span_index_tgt = [
        [(0, 4), (6, 8)],
        [(0, 1), (3, 6)],
        [(0, 4), (8, 16), (8, 12), (9, 16)],
        [(0, 2)],
    ]
    aligner_fn = retokenize.get_aligner_fn("roberta-base")
    # Each aligner_fn call returns (TokenAligner, tokens); unzip into two lists.
    tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
    tas, tokens = list(tas), list(tokens)
    token_index_tgt = [
        [ta.project_tokens(idxs).tolist() for idxs in token_idxs]
        for ta, token_idxs in zip(tas, self.token_index_src)
    ]
    span_index_tgt = [
        [ta.project_span(start, end) for (start, end) in span_idxs]
        for ta, span_idxs in zip(tas, self.span_index_src)
    ]
    assert self.tokens == tokens
    assert self.token_index_tgt == token_index_tgt
    assert self.span_index_tgt == span_index_tgt
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    """Project per-token tags onto the retokenized text.

    Tokens introduced by the tokenizer (subword continuations) receive the
    sentinel tag ``len(tag_dict)``; each original token's tag lands on the
    first target token it maps to. Returns the tags as a space-joined string.
    """
    align = get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    # Sentinel id for tokenizer-introduced pieces: one past the known tags.
    filler_tag = len(tag_dict)
    aligner, retokenized = align(" ".join(text))
    projected = [filler_tag] * len(retokenized)
    for src_idx, src_tag in enumerate(current_tags):
        # First target index this source token projects to keeps the real tag.
        first_tgt = aligner.project_tokens(src_idx)[0]
        projected[first_tgt] = tag_dict[src_tag]
    return " ".join(str(tag) for tag in projected)
def retokenize_record(record, tokenizer_name):
    """Retokenize an edge probing example. Modifies in-place."""
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    aligner, pieces = aligner_fn(record["text"])
    record["text"] = " ".join(pieces)
    for tgt in record["targets"]:
        # Project whichever spans the target carries into the new tokenization.
        for key in ("span1", "span2"):
            if key in tgt:
                start, end = aligner.project_span(*tgt[key])
                tgt[key] = [int(start), int(end)]
    return record
def test_moses(self):
    """Verify Moses-style word tokenization (transfo-xl-wt103) and projections.

    Checks produced tokens, token-index projection, and span projection
    against hand-computed targets.
    """
    # Expected Moses tokens (whole words; punctuation split off).
    # NOTE(review): assumes self.text / self.token_index_src /
    # self.span_index_src are populated by a setUp defined elsewhere.
    self.tokens = [
        ["Members", "of", "the", "House", "clapped", "their", "hands"],
        ["I", "look", "at", "Sarah", "'s", "dog", ".", "It", "was", "cute", ".", "!"],
        [
            "Mr.",
            "Immelt",
            "chose",
            "to",
            "focus",
            "on",
            "the",
            "incomprehensibility",
            "of",
            "accounting",
            "rules",
            ".",
        ],
        ["What", "?"],
    ]
    # Expected target indices for each source token.
    self.token_index_tgt = [
        [[0], [1], [2], [3], [4], [5], [6]],
        [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
        [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10, 11]],
        [[0, 1]],
    ]
    # Expected projected (start, end) spans in target-token coordinates.
    self.span_index_tgt = [
        [(0, 4), (5, 7)],
        [(0, 1), (3, 7)],
        [(0, 2), (6, 12), (6, 8), (7, 12)],
        [(0, 2)],
    ]
    aligner_fn = retokenize.get_aligner_fn("transfo-xl-wt103")
    # Each call returns (TokenAligner, tokens); unzip into two lists.
    token_aligners, tokens = zip(*(aligner_fn(sent) for sent in self.text))
    token_aligners, tokens = list(token_aligners), list(tokens)
    token_index_tgt = [
        [token_aligner.project_tokens(idxs).tolist() for idxs in token_idxs]
        for token_aligner, token_idxs in zip(token_aligners, self.token_index_src)
    ]
    span_index_tgt = [
        [token_aligner.project_span(start, end) for (start, end) in span_idxs]
        for token_aligner, span_idxs in zip(token_aligners, self.span_index_src)
    ]
    assert self.tokens == tokens
    assert self.token_index_tgt == token_index_tgt
    assert self.span_index_tgt == span_index_tgt
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    """Expand per-word tags to per-subword tags, word by word.

    Following the BERT paper's wordpiece convention, only the first piece of
    each word keeps the word's tag; continuation pieces get the sentinel tag
    ``len(tag_dict)``. Returns the tags as a space-joined string.
    """
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    # Sentinel id for tokenizer-introduced pieces: one past the known tags.
    filler_tag = len(tag_dict)
    projected = []
    for word, word_tag in zip(text, current_tags):
        _, pieces = aligner_fn(word)
        # Real tag on the first piece; fillers on every continuation piece.
        projected.append(tag_dict[word_tag])
        projected.extend([filler_tag] * (len(pieces) - 1))
    # Sanity check: per-word tokenization must match whole-sentence tokenization.
    _, whole_tokenization = aligner_fn(" ".join(text))
    assert len(whole_tokenization) == len(projected)
    return " ".join(str(tag) for tag in projected)
def test_wpm(self):
    """Verify WordPiece (bert-base-cased) retokenization and projections.

    Checks produced tokens, token-index projection, and span projection
    against hand-computed targets.
    """
    # Expected WordPiece tokens; "##" marks a continuation subword.
    # NOTE(review): assumes self.text / self.token_index_src /
    # self.span_index_src are populated by a setUp defined elsewhere.
    self.tokens = [
        ["Members", "of", "the", "House", "clapped", "their", "hands"],
        ["I", "look", "at", "Sarah", "'", "s", "dog", ".", "It", "was", "cute", ".", "!"],
        [
            "Mr",
            ".",
            "I",
            "##mme",
            "##lt",
            "chose",
            "to",
            "focus",
            "on",
            "the",
            "in",
            "##com",
            "##p",
            "##re",
            "##hen",
            "##si",
            "##bility",
            "of",
            "accounting",
            "rules",
            ".",
        ],
        ["What", "?"],
    ]
    # Expected target indices for each source token.
    self.token_index_tgt = [
        [[0], [1], [2], [3], [4], [5], [6]],
        [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9], [10, 11, 12]],
        [
            [0, 1],
            [2, 3, 4],
            [5],
            [6],
            [7],
            [8],
            [9],
            [10, 11, 12, 13, 14, 15, 16],
            [17],
            [18],
            [19, 20],
        ],
        [[0, 1]],
    ]
    # Expected projected (start, end) spans in target-token coordinates.
    self.span_index_tgt = [
        [(0, 4), (5, 7)],
        [(0, 1), (3, 8)],
        [(0, 5), (9, 21), (9, 17), (10, 21)],
        [(0, 2)],
    ]
    aligner_fn = retokenize.get_aligner_fn("bert-base-cased")
    # Each call returns (TokenAligner, tokens); unzip into two lists.
    tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
    tas, tokens = list(tas), list(tokens)
    token_index_tgt = [
        [ta.project_tokens(idxs).tolist() for idxs in token_idxs]
        for ta, token_idxs in zip(tas, self.token_index_src)
    ]
    span_index_tgt = [
        [ta.project_span(start, end) for (start, end) in span_idxs]
        for ta, span_idxs in zip(tas, self.span_index_src)
    ]
    assert self.tokens == tokens
    assert self.token_index_tgt == token_index_tgt
    assert self.span_index_tgt == span_index_tgt
def test_sentencepiece(self):
    """Verify SentencePiece (xlnet-base-cased) retokenization and projections.

    Checks produced tokens, token-index projection, and span projection
    against hand-computed targets.
    """
    # Expected SentencePiece tokens; "▁" marks a token preceded by a space.
    # NOTE(review): assumes self.text / self.token_index_src /
    # self.span_index_src are populated by a setUp defined elsewhere.
    self.tokens = [
        ["▁Members", "▁of", "▁the", "▁House", "▁clapped", "▁their", "▁hands"],
        [
            "▁I",
            "▁look",
            "▁at",
            "▁Sarah",
            "'",
            "s",
            "▁dog",
            ".",
            "▁It",
            "▁was",
            "▁cute",
            ".",
            "!",
        ],
        [
            "▁Mr",
            ".",
            "▁I",
            "m",
            "mel",
            "t",
            "▁chose",
            "▁to",
            "▁focus",
            "▁on",
            "▁the",
            "▁in",
            "comp",
            "re",
            "hen",
            "s",
            "ibility",
            "▁of",
            "▁accounting",
            "▁rules",
            ".",
        ],
        ["▁What", "?"],
    ]
    # Expected target indices for each source token.
    self.token_index_tgt = [
        [[0], [1], [2], [3], [4], [5], [6]],
        [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9], [10, 11, 12]],
        [
            [0, 1],
            [2, 3, 4, 5],
            [6],
            [7],
            [8],
            [9],
            [10],
            [11, 12, 13, 14, 15, 16],
            [17],
            [18],
            [19, 20],
        ],
        [[0, 1]],
    ]
    # Expected projected (start, end) spans in target-token coordinates.
    self.span_index_tgt = [
        [(0, 4), (5, 7)],
        [(0, 1), (3, 8)],
        [(0, 6), (10, 21), (10, 17), (11, 21)],
        [(0, 2)],
    ]
    aligner_fn = retokenize.get_aligner_fn("xlnet-base-cased")
    # Each call returns (TokenAligner, tokens); unzip into two lists.
    tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
    tas, tokens = list(tas), list(tokens)
    token_index_tgt = [
        [ta.project_tokens(idxs).tolist() for idxs in token_idxs]
        for ta, token_idxs in zip(tas, self.token_index_src)
    ]
    span_index_tgt = [
        [ta.project_span(start, end) for (start, end) in span_idxs]
        for ta, span_idxs in zip(tas, self.span_index_src)
    ]
    assert self.tokens == tokens
    assert self.token_index_tgt == token_index_tgt
    assert self.span_index_tgt == span_index_tgt
def test_bpe(self):
    """Verify classic BPE (openai-gpt) retokenization and projections.

    Checks produced tokens, token-index projection, and span projection
    against hand-computed targets.
    """
    # Expected BPE tokens; text is lowercased and "</w>" marks a word boundary.
    # NOTE(review): assumes self.text / self.token_index_src /
    # self.span_index_src are populated by a setUp defined elsewhere.
    self.tokens = [
        [
            "members</w>",
            "of</w>",
            "the</w>",
            "house</w>",
            "clapped</w>",
            "their</w>",
            "hands</w>",
        ],
        [
            "i</w>",
            "look</w>",
            "at</w>",
            "sarah</w>",
            "'s</w>",
            "dog</w>",
            ".</w>",
            "it</w>",
            "was</w>",
            "cute</w>",
            ".</w>",
            "!</w>",
        ],
        [
            "mr.</w>",
            "im",
            "melt</w>",
            "chose</w>",
            "to</w>",
            "focus</w>",
            "on</w>",
            "the</w>",
            "in",
            "comprehen",
            "si",
            "bility</w>",
            "of</w>",
            "accounting</w>",
            "rules</w>",
            ".</w>",
        ],
        ["what</w>", "?</w>"],
    ]
    # Expected target indices for each source token.
    self.token_index_tgt = [
        [[0], [1], [2], [3], [4], [5], [6]],
        [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
        [[0], [1, 2], [3], [4], [5], [6], [7], [8, 9, 10, 11], [12], [13], [14, 15]],
        [[0, 1]],
    ]
    # Expected projected (start, end) spans in target-token coordinates.
    self.span_index_tgt = [
        [(0, 4), (5, 7)],
        [(0, 1), (3, 7)],
        [(0, 3), (7, 16), (7, 12), (8, 16)],
        [(0, 2)],
    ]
    aligner_fn = retokenize.get_aligner_fn("openai-gpt")
    # Each call returns (TokenAligner, tokens); unzip into two lists.
    tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
    tas, tokens = list(tas), list(tokens)
    token_index_tgt = [
        [ta.project_tokens(idxs).tolist() for idxs in token_idxs]
        for ta, token_idxs in zip(tas, self.token_index_src)
    ]
    span_index_tgt = [
        [ta.project_span(start, end) for (start, end) in span_idxs]
        for ta, span_idxs in zip(tas, self.span_index_src)
    ]
    assert self.tokens == tokens
    assert self.token_index_tgt == token_index_tgt
    assert self.span_index_tgt == span_index_tgt
def realign_spans(record, tokenizer_name):
    """
    Builds the indices alignment while also tokenizing the input piece by piece.

    Parameters
    -----------------------
    record: dict with the below fields
        text: str
        targets: list of dictionaries
            label: bool
            span1_index: int, start index of first span
            span1_text: str, text of first span
            span2_index: int, start index of second span
            span2_text: str, text of second span
    tokenizer_name: str

    Returns
    ------------------------
    record: dict with the below fields:
        text: str in tokenized form
        targets: dictionary with the below fields
            -label: bool
            -span_1: (int, int) of token indices
            -span1_text: str, the string
            -span2: (int, int) of token indices
            -span2_text: str, the string
    """
    # find span indices and text
    # NOTE(review): only the first target is processed — presumably each
    # record carries exactly one target; verify against callers.
    text = record["text"].split()
    span1 = record["targets"][0]["span1_index"]
    span1_text = record["targets"][0]["span1_text"]
    span2 = record["targets"][0]["span2_index"]
    span2_text = record["targets"][0]["span2_text"]

    # construct end spans given span text space-tokenized length
    span1 = [span1, span1 + len(span1_text.strip().split())]
    span2 = [span2, span2 + len(span2_text.strip().split())]
    indices = [span1, span2]

    # Process the spans in left-to-right order so the running tokenization
    # length gives each span's new start offset.
    sorted_indices = sorted(indices, key=lambda x: x[0])
    current_tokenization = []
    # Maps original start index -> [new start, new end] in tokenized coords.
    span_mapping = {}

    # align first span to tokenized text
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    # Tokenize the prefix before the first span; its length is the new start.
    _, new_tokens = aligner_fn(" ".join(text[: sorted_indices[0][0]]))
    current_tokenization.extend(new_tokens)
    new_span1start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[0][0] : sorted_indices[0][1]]))
    current_tokenization.extend(span_tokens)
    new_span1end = len(current_tokenization)
    span_mapping[sorted_indices[0][0]] = [new_span1start, new_span1end]

    # re-indexing second span
    # NOTE(review): assumes the two spans do not overlap — the gap between
    # the first span's end and the second span's start is tokenized
    # separately; confirm overlapping spans cannot occur upstream.
    _, new_tokens = aligner_fn(" ".join(text[sorted_indices[0][1] : sorted_indices[1][0]]))
    current_tokenization.extend(new_tokens)
    new_span2start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[1][0] : sorted_indices[1][1]]))
    current_tokenization.extend(span_tokens)
    new_span2end = len(current_tokenization)
    span_mapping[sorted_indices[1][0]] = [new_span2start, new_span2end]

    # save back into record
    # Re-tokenize the full text for the output string; spans are looked up by
    # their original start indices, which is why span_mapping is keyed on them.
    _, all_text = aligner_fn(" ".join(text))
    record["targets"][0]["span1"] = span_mapping[record["targets"][0]["span1_index"]]
    record["targets"][0]["span2"] = span_mapping[record["targets"][0]["span2_index"]]
    record["text"] = " ".join(all_text)
    return record