from flair.data import Sentence
from flair.embeddings import XLNetEmbeddings


def embed_sentence(sentence: str,
                   pooling_operation,
                   layers: str = '1',
                   use_scalar_mix: bool = False) -> Sentence:
    # `xlnet_model` is expected to be defined at module level,
    # e.g. xlnet_model = 'xlnet-base-cased'
    embeddings = XLNetEmbeddings(pretrained_model_name_or_path=xlnet_model,
                                 layers=layers,
                                 pooling_operation=pooling_operation,
                                 use_scalar_mix=use_scalar_mix)
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)

    return flair_sentence
def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = XLNetEmbeddings(
        model=xlnet_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
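# A minimal usage sketch for the helpers above. The model key and pooling
# choice are illustrative assumptions, not values from the original snippets.
xlnet_model = "xlnet-base-cased"

embedded = embed_sentence("Berlin is a city in Germany .",
                          pooling_operation="first")

# Each token now carries its XLNet embedding.
for token in embedded:
    print(token.text, token.embedding.shape)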
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(
                OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(
                    WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path},"
                    " check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(
        embeddings=self.embedding_stack)
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(
                OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            print(
                f"Corresponding flair embedding module not found for {model_name_or_path}"
            )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(
        embeddings=self.embedding_stack)
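# Rough usage sketch for the key-matching dispatch above. The class name
# `EasyStackedEmbeddings` and the model keys are assumptions for illustration.
stacker = EasyStackedEmbeddings("bert-base-cased", "xlnet-base-cased", "glove")

sentence = Sentence("The grass is green .")
stacker.stacked_embeddings.embed(sentence)

# Token embeddings are the concatenation of the BERT, XLNet, and GloVe vectors.
print(sentence[0].embedding.shape)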
def other_embeddings(embd):
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    val_data_list = []

    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])

    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')

    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')

    print('Validation embedding Started...')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded Validation data!!')

    return train_data_list, test_data_list, val_data_list
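# Note on the function above: `tf.constant(emb).eval()` only round-trips the
# NumPy array through a TensorFlow constant and back, so appending the
# `detach().numpy()` result directly would behave the same and make the
# InteractiveSession unnecessary.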
def _get_stacked_embeddings(self) -> StackedEmbeddings:
    layers = ",".join(str(layer) for layer in self.experiment.layers)
    pooling_operation = self.experiment.pooling_operation

    token_embeddings = []

    for embedding in self.experiment.embeddings:
        if embedding.startswith("roberta"):
            token_embeddings.append(
                RoBERTaEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif (embedding.startswith("bert")
              or embedding.startswith("distilbert")
              or embedding.startswith("spanbert")):
            token_embeddings.append(
                BertEmbeddings(
                    bert_model_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("elmo"):
            model_name = embedding.split("-")[-1]
            token_embeddings.append(ELMoEmbeddings(model=model_name))
        elif embedding.startswith("gpt2"):
            token_embeddings.append(
                OpenAIGPT2Embeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("xlm"):
            token_embeddings.append(
                XLMEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))
        elif embedding.startswith("xlnet"):
            token_embeddings.append(
                XLNetEmbeddings(
                    pretrained_model_name_or_path=embedding,
                    pooling_operation=pooling_operation,
                    layers=layers,
                    use_scalar_mix=self.experiment.use_scalar_mix,
                ))

    return StackedEmbeddings(embeddings=token_embeddings)
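# One thing to watch in this dispatcher: an embedding name that matches none
# of the prefixes is silently skipped, so a typo in the experiment
# configuration shrinks the stack instead of raising an error.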
def embed_text(
    self,
    text: Union[List[Sentence], Sentence, List[str], str],
    model_name_or_path: str = "bert-base-cased",
) -> List[Sentence]:
    """
    Produces embeddings for text

    * **text** - Text input; it can be a string or any of Flair's `Sentence` input formats
    * **model_name_or_path** - The hosted model name key or model path

    **return** - A list of Flair's `Sentence`s
    """
    # Convert into sentences
    if isinstance(text, str):
        sentences = Sentence(text)
    elif isinstance(text, list) and all(isinstance(t, str) for t in text):
        sentences = [Sentence(t) for t in text]
    else:
        sentences = text

    # Load correct Embeddings module
    if not self.models[model_name_or_path]:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.models[model_name_or_path] = BertEmbeddings(
                model_name_or_path)
        elif "roberta" in model_name_or_path:
            self.models[model_name_or_path] = RoBERTaEmbeddings(
                model_name_or_path)
        elif "gpt2" in model_name_or_path:
            self.models[model_name_or_path] = OpenAIGPT2Embeddings(
                model_name_or_path)
        elif "xlnet" in model_name_or_path:
            self.models[model_name_or_path] = XLNetEmbeddings(
                model_name_or_path)
        elif "xlm" in model_name_or_path:
            self.models[model_name_or_path] = XLMEmbeddings(
                model_name_or_path)
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.models[model_name_or_path] = FlairEmbeddings(
                model_name_or_path)
        else:
            try:
                self.models[model_name_or_path] = WordEmbeddings(
                    model_name_or_path)
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path},"
                    " check documentation or custom model path to verify specified model"
                )

    embedding = self.models[model_name_or_path]
    return embedding.embed(sentences)
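# Sketch of calling embed_text. `embedder` is assumed to be an instance of
# the enclosing class, with `self.models` defaulting missing keys to a falsy
# value (e.g. a defaultdict); the inputs are illustrative.
sentences = embedder.embed_text(["I like Flair .", "XLNet works too ."],
                                model_name_or_path="xlnet-base-cased")

for s in sentences:
    # First five dimensions of the first token's embedding
    print(s[0].embedding[:5])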
def __init__(
    self,
    *embeddings: str,
    methods: List[str] = ["rnn", "pool"],
    configs: Dict = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean"
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    },
):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Check methods
    for m in methods:
        assert m in self.__class__.__allowed_methods

    # Set configs for pooling and rnn parameters
    for k, v in configs.items():
        assert k in self.__class__.__allowed_configs
        setattr(self, k, v)

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(
                OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(
                    WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path},"
                    " check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0

    if "pool" in methods:
        self.pool_embeddings = DocumentPoolEmbeddings(
            self.embedding_stack, **self.pool_configs)
        print("Pooled embedding loaded")

    if "rnn" in methods:
        self.rnn_embeddings = DocumentRNNEmbeddings(
            self.embedding_stack, **self.rnn_configs)
        print("RNN embeddings loaded")
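# Hypothetical usage of the document-embedding wrapper above; the class name
# `EasyDocumentEmbeddings`, model key, and method choice are illustrative.
docs = EasyDocumentEmbeddings("xlnet-base-cased", methods=["pool"])

sentence = Sentence("This whole sentence gets a single vector .")
docs.pool_embeddings.embed(sentence)
print(sentence.get_embedding().shape)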
def get_xlnet(model_name):
    return XLNetEmbeddings(model_name)
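# Minimal sketch of using the factory above; the model key is illustrative.
from flair.data import Sentence

embeddings = get_xlnet("xlnet-base-cased")
sentence = Sentence("Hello world .")
embeddings.embed(sentence)
print(sentence[0].embedding.shape)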
    return {token.text: token.embedding.numpy() for token in sentence}


if __name__ == "__main__":
    from test_textsim import *
    from flair.embeddings import XLMRobertaEmbeddings, BertEmbeddings, XLNetEmbeddings, XLMEmbeddings, RoBERTaEmbeddings

    measures = {}
    SAME, DIFF = load_data("./data/test_STS2017en-en.txt")

    MODELS = {
        "xlmr": XLMRobertaEmbeddings(),
        "bert": BertEmbeddings(),
        "xlnet": XLNetEmbeddings(),
        "xlm": XLMEmbeddings(),
        "roberta": RoBERTaEmbeddings(),
    }

    for model in MODELS:
        print(model)
        results = run_experiment(SAME,
                                 DIFF,
                                 lambda x: flair_embed_dict(x, MODELS[model]),
                                 wmdistance,
                                 inverse=True)
        measures['{}-wmdist'.format(model)] = results
        print(score(results[0], results[1]))
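# Note on the loop above: the lambda closes over the loop variable `model`,
# which is safe only because `run_experiment` invokes the callback within the
# same iteration. If the callbacks were stored and called later, the variable
# should be bound eagerly, e.g. `lambda x, m=model: flair_embed_dict(x, MODELS[m])`.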
    'BertLS': [
        BertEmbeddings(bert_model_or_path='bert-large-uncased',
                       layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                              "15,16,17,18,19,20,21,22,23,24",
                       use_scalar_mix=True)
    ],
    "RoBERTa": [RoBERTaEmbeddings('roberta-base')],
    "RoBERTaL": [RoBERTaEmbeddings('roberta-large')],
    "RoBERTaLS": [
        RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-large",
                          layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                                 "15,16,17,18,19,20,21,22,23,24",
                          use_scalar_mix=True)
    ],
    "XLNet":
    [XLNetEmbeddings(pretrained_model_name_or_path="xlnet-large-cased")],
    "XLNetS": [
        XLNetEmbeddings(pretrained_model_name_or_path="xlnet-large-cased",
                        layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                               "15,16,17,18,19,20,21,22,23,24",
                        use_scalar_mix=True)
    ],
    "DistilBert": [BertEmbeddings('distilbert-base-uncased')]
}

# Choose the tested embeddings
word_embeddings = model_selector[model_type]

# create an RNN
document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            hidden_size=256,
if ARGS.restore:
    ensemble_tagger = EnsembleTagger.load(model_path + "final-model.pt")
else:
    elmo_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=ELMoEmbeddings('small'),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    bert_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=BertEmbeddings(),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    xlnet_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=XLNetEmbeddings(),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)
    flair_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=StackedEmbeddings([
                                      FlairEmbeddings('news-forward'),
                                      FlairEmbeddings('news-backward')
                                  ]),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)

    models = []
    if ARGS.model == "be":
        models = [bert_tagger, elmo_tagger]
    elif ARGS.model == "bf":
END = '\033[0m'

granularity_level = "Sent"  # "Word" "Sent" "Paragraph"
dynamic = False  # Controls whether we highlight the more important words more strongly
graph = False  # ONLY WORKS WITH "Word" granularity_level
word_doc = True
html = True
word_window_size = 10  # Effectively doubled since it's bi-directional; only matters for word-level granularity
highlight_color_intensity = 175  # try values between 25 and 200

doc_embeddings = []
scores = []

stacked_embeddings = DocumentPoolEmbeddings([
    # WordEmbeddings('en'),
    XLNetEmbeddings('xlnet-large-cased', use_scalar_mix=True)
    # WordEmbeddings('glove'),
    # WordEmbeddings('extvec'),
    # ELMoEmbeddings('original'),
    # BertEmbeddings('bert-base-cased'),
    # FlairEmbeddings('news-forward-fast'),
    # FlairEmbeddings('news-backward-fast'),
    # OpenAIGPTEmbeddings(),
    # TransformerXLEmbeddings(),
])  # , mode='max')


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--card_file_path", "-c", type=str,
nlp = StanfordCoreNLP('http://localhost:9000')
properties = {'annotators': 'ssplit', 'outputFormat': 'json'}

# Getting user arguments
(mode, number, _set, load, iteration, cuda_option, save_path, log_file,
 architecture, embedding_type, loss_mode, learning_rate, score_mode,
 max_pool, args) = parse_arguments()

# Define the embeddings
if embedding_type == 1:
    selected_embedding = BertEmbeddings()
    embed_dim = 3072
elif embedding_type == 2:
    selected_embedding = FlairEmbeddings("news-forward")
    embed_dim = 2048
elif embedding_type == 3:
    selected_embedding = XLNetEmbeddings()
    embed_dim = 2048

bert = BertEmbeddings()
flair = FlairEmbeddings("news-forward")

# Load the pretrained ResNet model
resnet = models.resnet101(pretrained=True)

# Use the model object to select the desired layers (drop the final FC layer)
modules = list(resnet.children())[:-1]
resnet = nn.Sequential(*modules)
resnet.eval()

# Define the transforms for the picture
scaler = transforms.Scale((224, 224))