def extract_features_aligned_to_words(
    self, sentence: str, return_all_hiddens: bool = False
) -> torch.Tensor:
    """Extract RoBERTa features, aligned to spaCy's word-level tokenizer."""
    from fairseq.models.roberta import alignment_utils
    from spacy.tokens import Doc

    nlp = alignment_utils.spacy_nlp()
    tokenizer = alignment_utils.spacy_tokenizer()

    # tokenize both with GPT-2 BPE and spaCy
    bpe_toks = self.encode(sentence)
    spacy_toks = tokenizer(sentence)
    spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence)]
    alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws)

    # extract features and align them
    features = self.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
    features = features.squeeze(0)
    aligned_feats = alignment_utils.align_features_to_words(self, features, alignment)

    # wrap in spaCy Doc
    doc = Doc(
        nlp.vocab,
        words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"],
        spaces=[True]
        + [x.endswith(" ") for x in spacy_toks_ws[:-1]]
        + [True, False],
    )
    assert len(doc) == aligned_feats.size(0)
    doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
    return doc
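# Usage sketch for the method above: it is exposed on fairseq's RoBERTa hub
# interface, so after loading a pretrained model the aligned features are
# available as a per-token `.vector` on the returned spaCy Doc. The sentence
# below is illustrative only.
import torch

roberta = torch.hub.load("pytorch/fairseq", "roberta.large")
roberta.eval()

doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
for tok in doc:
    print("{:10}{}".format(str(tok), tok.vector[:5]))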
def extract_aligned_roberta(roberta, sentence: str, tokens: List[str],
                            return_all_hiddens=False):
    '''
    Code inspired from:
    https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Aligns RoBERTa embeddings to a given word-level tokenization of a sentence.

    Inputs:
        1. roberta: RoBERTa fairseq hub interface
        2. sentence: the sentence as a string
        3. tokens: word tokens of the sentence to which the features are aligned

    Outputs:
        Aligned RoBERTa features
    '''
    from fairseq.models.roberta import alignment_utils

    # tokenize with GPT-2 BPE and align against the given tokens
    # (the original hub interface aligns against spaCy tokens; here gold tokens are used)
    bpe_toks = roberta.encode(sentence)
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, tokens)

    # extract features and align them
    # LM heads are only used when masked_tokens are involved, not in this case
    features = roberta.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
    x = features if return_all_hiddens else None
    sent_features = features
    if return_all_hiddens:
        # a list of hidden states, one per layer; the last element is the final layer
        features = features[-1].squeeze(0)  # batch size = 1
    else:
        features = features.squeeze(0)  # batch size = 1

    # local variant of alignment_utils.align_features_to_words with a configurable tolerance
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)
    return aligned_feats[1:-1], x, sent_features  # exclude <s> and </s> tokens
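# The functions in this file call a local align_features_to_words(roberta,
# features, alignment, tol) that is not shown in this excerpt. A minimal
# sketch, assuming it mirrors
# fairseq.models.roberta.alignment_utils.align_features_to_words but with the
# hard-coded 1e-4 drift tolerance made a parameter:
from collections import Counter

import torch


def align_features_to_words(roberta, features, alignment, tol=1e-4):
    """Average-pool BPE-level features into word-level features.

    `tol` bounds the allowed drift between the summed input and output features.
    """
    assert features.dim() == 2

    bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices)
    assert bpe_counts[0] == 0  # <s> should never be aligned to a word
    denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))])
    weighted_features = features / denom.unsqueeze(-1)

    output = [weighted_features[0]]  # keep the <s> position
    largest_j = -1
    for bpe_indices in alignment:
        output.append(weighted_features[bpe_indices].sum(dim=0))
        largest_j = max(largest_j, *bpe_indices)
    for j in range(largest_j + 1, len(features)):
        output.append(weighted_features[j])  # trailing special tokens
    output = torch.stack(output)
    assert torch.all(torch.abs(output.sum() - features.sum()) < tol)
    return output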
def extract_aligned_roberta_multiple(roberta, sentence: str, sentence2,
                                     tokens: List[str], tokens2,
                                     return_all_hiddens=False):
    """Like extract_aligned_roberta, but for a sentence pair encoded jointly."""
    from fairseq.models.roberta import alignment_utils

    # combined word tokens for both segments, built by add_SEP_token
    full_tokens = add_SEP_token("<s>", tokens, tokens2, single=False)

    # tokenize the pair with GPT-2 BPE and align against the given tokens
    # (gold tokens rather than spaCy tokens)
    bpe_toks = roberta.encode(sentence, sentence2)
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, full_tokens)

    features = roberta.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
    sent_features = features
    if return_all_hiddens:
        # a list of hidden states, one per layer; the last element is the final layer
        features = features[-1].squeeze(0)  # batch size = 1
    else:
        features = features.squeeze(0)  # batch size = 1

    # local variant of alignment_utils.align_features_to_words with a configurable tolerance
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)
    return aligned_feats[1:-1], sent_features  # exclude <s> and </s> tokens
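# Why full_tokens is needed: fairseq's encode() lays a sentence pair out as
# <s> ...sentence... </s> </s> ...sentence2... </s>, so the word-level side of
# the alignment has to cover both segments (add_SEP_token, not shown in this
# excerpt, presumably handles the separator positions). A quick check of the
# layout, assuming a pretrained hub model:
import torch

roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
roberta.eval()

s1, s2 = "The cat sat.", "It was tired."
single = roberta.encode(s1)
pair = roberta.encode(s1, s2)
# the pair is the two single-sentence encodings back to back, with no extra <s>
print(len(single), len(roberta.encode(s2)), len(pair))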
def forward(self, seq_list):
    with torch.no_grad():
        seq_embeddings = []
        for seq in seq_list:
            # seq is a list of word tokens; re-join it so RoBERTa's BPE can encode it
            sent = ' '.join(seq)
            encoded = self.roberta.encode(sent)
            alignment = alignment_utils.align_bpe_to_words(self.roberta, encoded, seq)
            features = self.roberta.extract_features(encoded, return_all_hiddens=False)
            features = features.squeeze(0)  # batch size = 1
            aligned = align_features_to_words(self.roberta, features, alignment)
            seq_embeddings.append(aligned[1:-1])  # skip <s>, </s> embeddings
        # all sequences must have the same length for torch.stack to succeed
        return torch.stack(seq_embeddings, dim=0).to(self.device)
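# Hedged usage sketch for the forward() above. The enclosing module class is
# not shown, so a SimpleNamespace stands in for `self`; it assumes
# alignment_utils and align_features_to_words are available at module level
# (fairseq's version is imported here, but the local tolerance variant works
# for this 3-argument call too) and that every sequence in the batch has the
# same length.
import types

import torch
from fairseq.models.roberta import alignment_utils
from fairseq.models.roberta.alignment_utils import align_features_to_words

roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
roberta.eval()

self_like = types.SimpleNamespace(roberta=roberta, device="cpu")
batch = [["The", "cat", "sat", "."], ["A", "dog", "ran", "."]]
embeddings = forward(self_like, batch)
print(embeddings.shape)  # (2, 4, 768) for roberta.base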
def extract_aligned_roberta(roberta, sentence: str, tokens: List[str],
                            return_all_hiddens=False):
    '''
    Code inspired from:
    https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Aligns RoBERTa embeddings to a given word-level tokenization of a sentence.

    Inputs:
        1. roberta: RoBERTa fairseq hub interface
        2. sentence: the sentence as a string
        3. tokens: word tokens of the sentence to which the features are aligned

    Outputs:
        Aligned RoBERTa features
    '''
    from fairseq.models.roberta import alignment_utils

    # not used below; the alignment is done against the raw tokens
    full_tokens = add_SEP_token("<s>", tokens, single=True)

    # tokenize with GPT-2 BPE and align against the given tokens
    # (gold tokens rather than spaCy tokens)
    bpe_toks = roberta.encode(sentence)
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, tokens)

    # extract features and align them
    # LM heads are only used when masked_tokens are involved, not in this case
    features = roberta.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
    sent_features = features
    # with return_all_hiddens=False this is a single [1, T, C] tensor (len == 1);
    # with return_all_hiddens=True it is a list of hidden states, one per layer
    # (e.g. 25 for roberta.large), whose last element is the final layer
    print("Feature size: " + str(len(sent_features)))
    if return_all_hiddens:
        features = features[-1].squeeze(0)  # last layer, batch size = 1
    else:
        features = features.squeeze(0)  # batch size = 1

    # local variant of alignment_utils.align_features_to_words with a configurable tolerance
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)
    return aligned_feats[1:-1], sent_features  # exclude <s> and </s> tokens
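# Quick check of the extract_features return shapes referenced in the comments
# above, assuming a pretrained hub model (roberta.large returns 25 hidden
# states: the embedding layer plus 24 transformer layers):
import torch

roberta = torch.hub.load("pytorch/fairseq", "roberta.large")
roberta.eval()
toks = roberta.encode("Hello world!")

last = roberta.extract_features(toks, return_all_hiddens=False)
print(last.shape)            # torch.Size([1, T, 1024]) -- last layer only

all_layers = roberta.extract_features(toks, return_all_hiddens=True)
print(len(all_layers))       # 25
print(all_layers[-1].shape)  # same shape as `last`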
def extract_attention_to_words(self, sentence: str, sentence2: str,
                               features) -> torch.Tensor:
    """Align per-BPE attention scores over the second sentence to its word tokens."""
    from fairseq.models.roberta import alignment_utils

    tokenizer = alignment_utils.spacy_tokenizer()

    # tokenize the pair with GPT-2 BPE: <s> sentence </s> </s> sentence2 </s>
    bpe_toks = self.encode(sentence, sentence2)
    s1_bpe_len = len(self.encode(sentence))
    s2_bpe_len = len(self.encode(sentence2))

    # keep <s> plus the BPE tokens belonging to the second sentence,
    # together with the matching slice of the attention scores
    s2_bpe_toks = torch.cat([bpe_toks[0:1], bpe_toks[s1_bpe_len + 1:]], dim=0)
    features = torch.cat(
        [features[0:1], features[s1_bpe_len + 1:s1_bpe_len + s2_bpe_len]], dim=0)
    features = features[:, None]  # align_features_to_words expects a 2D tensor

    # spaCy word tokens (with trailing whitespace) for the second sentence
    spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence2)]

    assert features.size(0) == len(s2_bpe_toks)
    alignment = alignment_utils.align_bpe_to_words(self, s2_bpe_toks, spacy_toks_ws)
    aligned_attn = alignment_utils.align_features_to_words(self, features, alignment)

    # drop the <s>/</s> positions and renormalize the attention mass over the words
    # (a softmax over the aligned scores is an alternative normalization)
    aligned_attn = aligned_attn.squeeze()[1:-1]
    aligned_attn = aligned_attn / torch.sum(aligned_attn)
    assert aligned_attn.size(0) == len(spacy_toks_ws)
    return spacy_toks_ws, aligned_attn
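# Hedged usage sketch for extract_attention_to_words. Where the per-BPE
# attention vector comes from is not shown in this excerpt, so a uniform dummy
# tensor of the right length stands in for it; the sentences and the
# function-style call (passing the hub model as `self`) are illustrative only.
import torch

roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
roberta.eval()

s1, s2 = "The cat sat.", "It was tired."
bpe_toks = roberta.encode(s1, s2)
dummy_attn = torch.full((len(bpe_toks),), 1.0 / len(bpe_toks))

words, word_attn = extract_attention_to_words(roberta, s1, s2, dummy_attn)
print(list(zip(words, word_attn.tolist())))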