def embed_sentence(sentence: str,
                   pooling_operation,
                   layers: str = '1',
                   use_scalar_mix: bool = False) -> Sentence:
    embeddings = XLMEmbeddings(pretrained_model_name_or_path=xlm_model,
                               layers=layers,
                               pooling_operation=pooling_operation,
                               use_scalar_mix=use_scalar_mix)
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)

    return flair_sentence

def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = XLMEmbeddings(
        model=xlm_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)

    return flair_sentence

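A minimal usage sketch for the two helpers above. Note that `xlm_model`, the sample sentence, and the "first" sub-token pooling choice are assumptions for illustration, not part of the original snippets; `xlm_model` is taken to be a module-level pretrained model key such as "xlm-mlm-en-2048".

# Hypothetical usage of embed_sentence (names below are assumptions).
from flair.data import Sentence
from flair.embeddings import XLMEmbeddings

xlm_model = "xlm-mlm-en-2048"  # assumed module-level pretrained model key

embedded = embed_sentence("Berlin is a city .",
                          pooling_operation="first",
                          layers="1")
for token in embedded:
    # each token now carries an embedding tensor from the chosen XLM layer
    print(token.text, token.embedding.shape)
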
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(embeddings=self.embedding_stack)

def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            print(
                f"Corresponding flair embedding module not found for {model_name_or_path}"
            )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(embeddings=self.embedding_stack)

def _get_stacked_embeddings(self) -> StackedEmbeddings:
    layers = ",".join(str(layer) for layer in self.experiment.layers)
    pooling_operation = self.experiment.pooling_operation

    token_embeddings = []

    for embedding in self.experiment.embeddings:
        if embedding.startswith("roberta"):
            token_embeddings.append(
                RoBERTaEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif (embedding.startswith("bert")
              or embedding.startswith("distilbert")
              or embedding.startswith("spanbert")):
            token_embeddings.append(
                BertEmbeddings(
                    bert_model_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("elmo"):
            model_name = embedding.split("-")[-1]
            token_embeddings.append(ELMoEmbeddings(model=model_name))
        elif embedding.startswith("gpt2"):
            token_embeddings.append(
                OpenAIGPT2Embeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("xlm"):
            token_embeddings.append(
                XLMEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("xlnet"):
            token_embeddings.append(
                XLNetEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))

    return StackedEmbeddings(embeddings=token_embeddings)

def embed_text(
    self,
    text: Union[List[Sentence], Sentence, List[str], str],
    model_name_or_path: str = "bert-base-cased",
) -> List[Sentence]:
    """
    Produces embeddings for text

    * **text** - Text input; it can be a string or any of Flair's `Sentence` input formats
    * **model_name_or_path** - The hosted model name key or model path

    **return** - A list of Flair's `Sentence`s
    """
    # Convert into sentences
    if isinstance(text, str):
        sentences = Sentence(text)
    elif isinstance(text, list) and all(isinstance(t, str) for t in text):
        sentences = [Sentence(t) for t in text]
    else:
        sentences = text

    # Load correct Embeddings module
    if not self.models[model_name_or_path]:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.models[model_name_or_path] = BertEmbeddings(model_name_or_path)
        elif "roberta" in model_name_or_path:
            self.models[model_name_or_path] = RoBERTaEmbeddings(model_name_or_path)
        elif "gpt2" in model_name_or_path:
            self.models[model_name_or_path] = OpenAIGPT2Embeddings(model_name_or_path)
        elif "xlnet" in model_name_or_path:
            self.models[model_name_or_path] = XLNetEmbeddings(model_name_or_path)
        elif "xlm" in model_name_or_path:
            self.models[model_name_or_path] = XLMEmbeddings(model_name_or_path)
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.models[model_name_or_path] = FlairEmbeddings(model_name_or_path)
        else:
            try:
                self.models[model_name_or_path] = WordEmbeddings(model_name_or_path)
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                )

    embedding = self.models[model_name_or_path]
    return embedding.embed(sentences)

def __init__(
    self,
    *embeddings: str,
    methods: List[str] = ["rnn", "pool"],
    configs: Dict = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean"
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    },
):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Check methods
    for m in methods:
        assert m in self.__class__.__allowed_methods

    # Set configs for pooling and rnn parameters
    for k, v in configs.items():
        assert k in self.__class__.__allowed_configs
        setattr(self, k, v)

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0

    if "pool" in methods:
        self.pool_embeddings = DocumentPoolEmbeddings(self.embedding_stack,
                                                      **self.pool_configs)
        print("Pooled embedding loaded")

    if "rnn" in methods:
        self.rnn_embeddings = DocumentRNNEmbeddings(self.embedding_stack,
                                                    **self.rnn_configs)
        print("RNN embeddings loaded")

def get_xlm(model_name):
    return XLMEmbeddings(model_name)

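A hedged example of calling the small factory above; the model key and the sample sentence are illustrative assumptions rather than part of the original snippet.

# Hypothetical usage of get_xlm (model key below is an assumption).
from flair.data import Sentence

embeddings = get_xlm("xlm-mlm-en-2048")
sentence = Sentence("The grass is green .")
embeddings.embed(sentence)
# dimensionality of the first token's embedding vector
print(len(sentence[0].embedding))
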
def main():
    argparser = argparse.ArgumentParser(
        description="download embeddings for models")
    argparser.add_argument("-bert",
                           "--bert",
                           action='store_true',
                           default=False,
                           help="bert embeddings (12 layers)")
    argparser.add_argument("-roberta",
                           "--roberta",
                           action='store_true',
                           default=False,
                           help="roberta embeddings (12 layers)")
    argparser.add_argument("-gpt2",
                           "--gpt2",
                           action='store_true',
                           default=False,
                           help="gpt2 embeddings (12 layers)")
    argparser.add_argument("-xlm",
                           "--xlm",
                           action='store_true',
                           default=False,
                           help="xlm embeddings (24 layers)")
    argparser.add_argument("-local",
                           "--local",
                           action='store_true',
                           default=False,
                           help="if local")
    args = argparser.parse_args()

    # verify arguments: exactly one model flag must be set
    num_model_flags = sum([args.bert, args.roberta, args.xlm, args.gpt2])
    if num_model_flags > 1:
        print("select only one flag for model type from (bert, roberta, xlm, gpt2)")
        exit()
    if num_model_flags == 0:
        print("select at least one flag for model type from (bert, roberta, xlm, gpt2)")
        exit()

    if args.bert or args.roberta or args.gpt2:
        num_layers = 12
    if args.xlm:
        num_layers = 24

    # open sentences
    file = open("cleaned_sentencesGLM.txt", "r").read().splitlines()

    # specify model
    print("uploading model...")
    for layer in tqdm(range(num_layers)):
        print(layer)
        if args.bert:
            embeddings = BertEmbeddings("bert-base-multilingual-cased",
                                        layers="-{}".format(layer))
            model_type = "bert"
        elif args.roberta:
            embeddings = RoBERTaEmbeddings("roberta-base",
                                           layers="-{}".format(layer))
            model_type = "roberta"
        elif args.xlm:
            embeddings = XLMEmbeddings("xlm-mlm-en-2048",
                                       layers="-{}".format(layer))
            model_type = "xlm"
        elif args.gpt2:
            embeddings = TransformerWordEmbeddings("gpt2",
                                                   layers="-{}".format(layer))
            model_type = "gpt2"
        else:
            print("error on calling embeddings")
            exit()

        embed_matrix = get_embeddings(file, embeddings)

        print("aggregating types...")
        avg_sentence = process_sentence(embed_matrix, "avg")
        max_sentence = process_sentence(embed_matrix, "max")
        min_sentence = process_sentence(embed_matrix, "min")
        last_sentence = process_sentence(embed_matrix, "last")

        methods = ['avg', 'max', 'min', 'last']
        mats = [avg_sentence, max_sentence, min_sentence, last_sentence]
        bool_labels = [1] * len(file)

        print("saving files...")
        if args.local:
            file_path = '../embeddings/{}/layer{}/'.format(model_type, layer)
        else:
            file_path = "/n/shieber_lab/Lab/users/cjou/embeddings/{}/layer{}/".format(
                model_type, layer)

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for i in range(len(methods)):
            print("saving file: " + file_path + str(methods[i]) + ".p")
            pickle.dump(mats[i],
                        open(file_path + str(methods[i]) + ".p", "wb"))

    print("done.")

if __name__ == "__main__":
    from test_textsim import *
    from flair.embeddings import XLMRobertaEmbeddings, BertEmbeddings, XLNetEmbeddings, XLMEmbeddings, RoBERTaEmbeddings

    # Compare several Flair transformer embeddings on STS2017 sentence pairs
    # using Word Mover's Distance as the similarity measure.
    measures = {}
    SAME, DIFF = load_data("./data/test_STS2017en-en.txt")

    MODELS = {
        "xlmr": XLMRobertaEmbeddings(),
        "bert": BertEmbeddings(),
        "xlnet": XLNetEmbeddings(),
        "xlm": XLMEmbeddings(),
        "roberta": RoBERTaEmbeddings(),
    }

    for model in MODELS:
        print(model)
        results = run_experiment(SAME,
                                 DIFF,
                                 lambda x: flair_embed_dict(x, MODELS[model]),
                                 wmdistance,
                                 inverse=True)
        measures['{}-wmdist'.format(model)] = results
        print(score(results[0], results[1]))