def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="mean")
        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()
        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None
    return embeddings
def __init__(self, pipeline):
    self.mode = pipeline.mode
    self.type = pipeline.embedding_type
    embedders = []
    for component in pipeline.embedders:
        if "forward" in component or "backward" in component:
            embedders.append(FlairEmbeddings(component))
        elif "glove" in component:
            embedders.append(WordEmbeddings(component))
        elif "bert" in component:
            embedders.append(BertEmbeddings(component))
        elif len(component) == 2:
            # see https://github.com/zalandoresearch/flair/blob/master/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md#fasttext-embeddings
            embedders.append(WordEmbeddings(component))
            embedders.append(BytePairEmbeddings(component))
        else:
            raise ValueError(f"unknown embedder: {component}")
    if self.type == "document":
        self.embedder = self._make_doc_embedder(pipeline, embedders)
    elif self.type == "word":
        self.embedder = StackedEmbeddings(embedders)
    elif self.type == "both":
        self.embedders = [
            self._make_doc_embedder(pipeline, embedders),
            StackedEmbeddings(embedders),
        ]
    else:
        raise ValueError(
            f"Inappropriate embedding type {pipeline.embedding_type}, "
            "should be 'word', 'document', or 'both'.")
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([WordEmbeddings(u'glove')]),
                         StackedEmbeddings([
                             WordEmbeddings(u'glove'),
                             FlairEmbeddings(u'news-forward'),
                             FlairEmbeddings(u'news-backward')
                         ])
                     ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)
    optimizer = SequenceTaggerParamSelector(corpus,
                                            u'ner',
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)
    shutil.rmtree(results_base_path)
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(
                RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(
                OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(
                XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(
                FlairEmbeddings(model_name_or_path))
        else:
            print(
                f"Corresponding flair embedding module not found for {model_name_or_path}"
            )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(
        embeddings=self.embedding_stack)
def __init__(self,
             num_classes: int = 2,
             bidirectional: bool = False,
             rnn_layers: int = 1,
             hidden_size: int = 256,
             rnn_type: str = 'GRU'):
    super(ATAE_LSTM, self).__init__()
    self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ])
    self.wordembeddings: StackedEmbeddings = StackedEmbeddings(
        [WordEmbeddings('glove')])
    self.embedding_dimension: int = (self.stackedembeddings.embedding_length +
                                     self.wordembeddings.embedding_length)
    self.bidirectional: bool = bidirectional
    self.rnn_layers: int = rnn_layers
    self.rnn_type: str = rnn_type
    self.num_classes: int = num_classes
    self.hidden_size: int = hidden_size
    if self.rnn_type == 'GRU':
        self.rnn = torch.nn.GRU(self.embedding_dimension,
                                self.hidden_size,
                                bidirectional=self.bidirectional,
                                num_layers=self.rnn_layers)
    else:
        self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                 self.hidden_size,
                                 bidirectional=self.bidirectional,
                                 num_layers=self.rnn_layers)
    self.attention = Attention()
def __init__(self):
    # Sequence Tagging Model
    tagger_file = self.modelpath + 'tagger.pt'
    if Path(tagger_file).is_file():
        print('loading tagger from file')
        self.tagger = SequenceTagger.load_from_file(tagger_file)
    else:
        print('downloading pretrained tagger')
        self.tagger = SequenceTagger.load('ner-ontonotes')
        self.tagger.save(tagger_file)

    # Text Embedding Model
    embeddings_file = self.modelpath + 'embeddings.pickle'
    if Path(embeddings_file).is_file():
        print('loading embedder from file')
        filestream = open(embeddings_file, 'rb')
        self.embeddings = pickle.load(filestream)
    else:
        print('downloading pretrained embedders')
        self.embeddings = [
            # WordEmbeddings('glove'),
            FlairEmbeddings('multi-forward')
            # FlairEmbeddings('multi-backward')
        ]
        filestream = open(embeddings_file, 'wb')
        pickle.dump(self.embeddings, filestream)

    self.token_embedder = StackedEmbeddings(self.embeddings)
    self.doc_embedder = DocumentPoolEmbeddings(self.embeddings)
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(
                RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(
                OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(
                XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(
                FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(
                    WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(
        embeddings=self.embedding_stack)
class Embedder(object):

    def __init__(self, embedding=None, method=None, batch_size=5):
        assert method in [None, "average"], "Bad method"
        self.method = method
        self.batch_size = batch_size
        if embedding is not None:
            self.embedding = StackedEmbeddings(embedding)
        else:
            self.embedding = StackedEmbeddings([
                # WordEmbeddings('glove'),
                # WordEmbeddings('en-news'),
                # BytePairEmbeddings('en'),
                WordEmbeddings('crawl')
            ])

    def embed_data(self, sentences):
        sentences = [Sentence(s) for s in sentences]
        self.embedding.embed(sentences)
        if self.method == "average":
            sentences = [
                torch.stack([word.embedding.detach().cpu()
                             for word in s]).mean(0) for s in sentences
            ]
        else:
            sentences = [
                torch.stack([word.embedding.detach().cpu() for word in s])
                for s in sentences
            ]
        return sentences

    def embed_dataset(self, sentences):
        sentences = self.embed_data(sentences)
        return sentences
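# Hedged usage sketch for the Embedder class above (not part of the original
# snippet; the helper name is made up): it assumes flair's 'crawl' fastText
# vectors can be downloaded and that torch is installed. With method="average"
# each input sentence is pooled into a single vector; otherwise one vector per
# token is kept.
def _embedder_usage_example():
    embedder = Embedder(method="average")
    vectors = embedder.embed_dataset([
        "Stacked embeddings concatenate token vectors.",
        "Averaging pools them into one vector per sentence."
    ])
    # each entry is a 1-D tensor of the stacked embedding length
    print(vectors[0].shape)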
class FlairEmbeddings(object):

    def __init__(self):
        self.stop_words = list(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=[flair_embedding_forward, flair_embedding_backward])

    def word_token(self, tokens, lemma=False):
        tokens = str(tokens)
        tokens = re.sub(
            r"([\w].)([\~\!\@\#\$\%\^\&\*\(\)\-\+\[\]\{\}\/\"\'\:\;])([\s\w].)",
            "\\1 \\2 \\3", tokens)
        tokens = re.sub(r"\s+", " ", tokens)
        if lemma:
            return " ".join([
                self.lemmatizer.lemmatize(token, 'v')
                for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])
        else:
            return " ".join([
                token for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])

    def cos_sim(self, a, b):
        return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

    def getFlairEmbedding(self, text):
        sentence = Sentence(text)
        self.stacked_embeddings.embed(sentence)
        return np.mean([np.array(token.embedding) for token in sentence],
                       axis=0)
def test_stacked_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    embeddings = StackedEmbeddings([glove, charlm])
    embeddings.embed(sentence)
    for token in sentence.tokens:
        assert (len(token.get_embedding()) == 1124)
        token.clear_embeddings()
        assert (len(token.get_embedding()) == 0)
class FlairPretrained(ModelBase):
    """
    Encapsulates pretrained Flair Embeddings (Zalando Flair) by conforming
    to the ModelBase interface.
    """

    def __init__(self, model=None):
        super(FlairPretrained, self).__init__()
        if model is not None:
            self.model = model
        else:
            self.model = StackedEmbeddings([
                FlairEmbeddings('news-forward-fast'),
                FlairEmbeddings('news-backward-fast'),
            ])

    def dim(self) -> int:
        """
        The dimensionality of created embeddings.

        :return: 2048 (for now, #TODO)
        """
        return 2048

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for word |word| or None. It is discouraged to
        use this method as it invalidates the purpose of Flair embeddings.
        Instead, utilize the context as well for more accurate vectorization.
        In reality, Flair embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)

    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Flair embeddings. These
        embeddings are context dependent, so this method is preferred over
        fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return list(
            map(lambda token: np.array(token.embedding), list(sentence)))

    def vectorize_context(self, words: List[str]) -> Optional[np.ndarray]:
        """
        Transforms the context into a single vector. May return None in
        extreme cases, e.g. if |words| is an empty list.

        :param words: List of tokens describing the context.
        :return: A single word vector or None.
        """
        return self.mean_of_words(self.get_word_vectors(words))
class DefaultFeaturizerForSeqTagging(ObservationFeaturizer):

    def __init__(self,
                 action_space: ActionSpace,
                 embedding_type: str = "fasttext",
                 device: str = "cpu"):
        self.device = device
        self._setup_device()
        embeddings = EmbeddingRegistry.get_embedding(embedding_type)
        self.doc_embeddings = StackedEmbeddings(embeddings).to(
            torch.device(self.device))
        self.action_space = action_space
        self._current_token_embeddings: List[torch.tensor] = None

    def _setup_device(self):
        import flair, torch
        flair.device = torch.device(self.device)

    def init_on_reset(self, input_text: Union[List[str], str]):
        sent = Sentence(input_text)
        self.doc_embeddings.embed(sent)
        self._current_token_embeddings = [
            token.embedding.cpu().detach() for token in sent
        ]
        sent.clear_embeddings()

    def featurize(self, observation: Observation) -> torch.Tensor:
        input_vector = self._featurize_input(observation.get_current_index())
        context_vector = self._featurize_context(
            observation.get_current_action_history())
        concatenated = torch.cat((input_vector, context_vector), dim=0)
        return concatenated

    def get_observation_dim(self) -> int:
        return self._get_input_dim() + self._get_context_dim()

    def _featurize_input(self, input_index: int) -> torch.Tensor:
        input_features = self._current_token_embeddings[input_index]
        return input_features

    def _featurize_context(self, context: List[str]) -> torch.Tensor:
        # consider only last action
        context_vector = torch.zeros(self.action_space.size())
        context_ = [context[-1]] if len(context) > 0 else []
        action_indices = [
            self.action_space.action_to_ix(action) for action in context_
        ]
        context_vector[action_indices] = 1.0
        return context_vector

    def _get_input_dim(self):
        sent = Sentence("A random text to get the embedding dimension")
        self.doc_embeddings.embed(sent)
        dim = sent[0].embedding.shape[0]
        sent.clear_embeddings()
        return dim

    def _get_context_dim(self):
        return self.action_space.size()
def args_init(args):
    # initialize word2vec
    args.word2vec = KeyedVectors.load_word2vec_format(
        'data/mymodel-new-5-%d' % args.model_dim, binary=True)

    # initialize contextual embedding dimensions
    if args.contextual_embedding == 'word2vec':
        args.word_dim = args.tag_dim = args.dis_dim = 50
        args.stacked_embeddings = 'word2vec'
    elif args.contextual_embedding == 'elmo':  # glove + elmo
        args.word_dim = args.tag_dim = args.dis_dim = 868
        # stacked embeddings
        # create a StackedEmbedding object that combines glove and ELMo embeddings
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'), ELMoEmbeddings('small')])
    elif args.contextual_embedding == 'bert':  # glove + bert
        args.word_dim = args.tag_dim = args.dis_dim = 3172
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'), BertEmbeddings('bert-base-uncased')])
        args.batch_size = 8
    elif args.contextual_embedding == 'flair':  # glove + flair-forward + flair-backward
        args.word_dim = args.tag_dim = args.dis_dim = 4196
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings('mix-forward', chars_per_chunk=128),
            FlairEmbeddings('mix-backward', chars_per_chunk=128)
        ])
        if args.agent_mode == 'act':
            args.batch_size = 8
        else:
            args.batch_size = 8
    elif args.contextual_embedding == 'glove':  # not tested
        args.word_dim = args.tag_dim = args.dis_dim = 100
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
        ])

    # weights loaded, set exploration rate to minimum
    if args.load_weights:
        # 1 to 0.1. decayed to minimum.
        args.exploration_rate_start = args.exploration_rate_end

    # agent mode arguments, set number of words to 100
    if args.agent_mode == 'arg':
        args.num_words = args.context_len
        args.display_training_result = 0

    args.result_dir = 'results/%s_%s_%s' % (args.domain, args.agent_mode,
                                            args.contextual_embedding)
    return args
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
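# Hedged follow-up sketch (not in the original source; the helper name is
# hypothetical): flair's ModelTrainer saves a final-model.pt into the output
# directory passed to train(), so predicting with the tagger trained above
# could look roughly like this.
def tag_toxic_spans(text: str):
    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load(
        'resources/taggers/toxic_classifier_bert/final-model.pt')
    sentence = Sentence(text)
    tagger.predict(sentence)
    # prints the sentence with its predicted 'tox' tags attached
    print(sentence.to_tagged_string())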
def __init__(self, forward, backward, use_tokenizer, *args, **kwargs):
    super(FlairEmbeddings, self).__init__(*args, **kwargs)
    self._forward = forward
    self._backward = backward
    self._use_tokenizer = use_tokenizer

    from flair.embeddings import FlairEmbeddings as FLEmbeddings
    from flair.embeddings import StackedEmbeddings
    self._embeddings = StackedEmbeddings(
        [FLEmbeddings(forward), FLEmbeddings(backward)])
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load correct Embeddings module
    for model_name_or_path in embeddings:
        self.embedding_stack.append(
            _get_embedding_model(model_name_or_path))

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(
        embeddings=self.embedding_stack)
def __init__(self, corpus, emb_path, flair=False):
    self.word2index, self.word_emb = self.get_pretrain_embeddings(
        emb_path, corpus.get_word_vocab())
    self.index2word = {i: w for w, i in self.word2index.items()}
    self.flair_words = None
    if config.if_flair or flair:
        # self.elmo = ELMoEmbeddings()
        # self.bert_embedding = BertEmbeddings('bert-base-cased')
        self.flair_forward_embedding = FlairEmbeddings('news-forward')
        self.flair_backward_embedding = FlairEmbeddings('news-backward')
        self.stacked_embeddings = StackedEmbeddings(embeddings=[
            self.flair_forward_embedding, self.flair_backward_embedding
        ])
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion",
                                         column_format={
                                             0: "text",
                                             2: "ner"
                                         })

    # define search space
    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[
            StackedEmbeddings([WordEmbeddings("glove")]),
            StackedEmbeddings([
                WordEmbeddings("glove"),
                FlairEmbeddings("news-forward-fast"),
                FlairEmbeddings("news-backward-fast"),
            ]),
        ],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus,
                                            "ner",
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
def init_emb(self):
    # init standard GloVe embedding
    flair.device = torch.device("cpu")
    glove_embedding = WordEmbeddings('glove')

    # init Flair forward and backwards embeddings
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')

    # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
    self.stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
        flair_embedding_backward,
    ])
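# Hedged companion sketch (not in the original source; the method name is
# hypothetical): uses the stacked_embeddings created in init_emb() above to
# embed a single text on CPU and return one concatenated vector per token.
def embed_text(self, text):
    from flair.data import Sentence
    sentence = Sentence(text)
    self.stacked_embeddings.embed(sentence)
    # GloVe + news-forward + news-backward, concatenated per token
    return [token.embedding for token in sentence]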
def __init__(self):
    """ initialize the word embedding and document embedding classes """
    self.word_embedding = flair.embeddings.WordEmbeddings('glove')
    self.doc_embedding = flair.embeddings.DocumentPoolEmbeddings(
        [self.word_embedding])

    # embedding
    self.flair_forward = FlairEmbeddings('news-forward-fast')
    self.backward_flair = FlairEmbeddings('news-backward-fast')

    # stacked embedding
    self.stacked_embedding = StackedEmbeddings(
        embeddings=[self.flair_forward, self.backward_flair])
class LayerFlairEmbeddings(LayerBase):
    """LayerFlairEmbeddings implements contextual string (Flair) embeddings."""

    def __init__(self, gpu):
        super(LayerFlairEmbeddings, self).__init__(gpu)
        self.gpu = gpu
        # self.flair_embeddings_dim = flair_embeddings_dim
        # self.freeze_flair_embeddings = freeze_flair_embeddings
        self.output_dim = 4096
        self.flair_embedding_forward = FlairEmbeddings(
            '/home/jlfu/flair_model/news-forward-0.4.1.pt')
        self.flair_embedding_backward = FlairEmbeddings(
            '/home/jlfu/flair_model/news-backward-0.4.1.pt')
        self.stacked_embeddings = StackedEmbeddings([
            self.flair_embedding_forward, self.flair_embedding_backward
        ])
        # self.glove_embedding = WordEmbeddings('glove')
        # self.args = args
        # if self.args.use_flair_glove:
        #     self.stacked_embeddings = StackedEmbeddings([
        #         self.glove_embedding,
        #         self.flair_embedding_forward,
        #         self.flair_embedding_backward
        #     ])
        #     self.output_dim = 4096

    def is_cuda(self):
        return self.embeddings.weight.is_cuda

    def forward(self, word_sequences):
        batch_size = len(word_sequences)
        max_seq_len = max([len(word_seq) for word_seq in word_sequences])
        flair_embedding = torch.zeros(batch_size, max_seq_len, self.output_dim)

        # create a sentence per word sequence and embed it
        for i, word_sequence in enumerate(word_sequences):
            word_seq_str = ' '.join(word_sequence)
            sentence = Sentence(word_seq_str)
            # self.flair_embedding_forward.embed(sentence)
            self.stacked_embeddings.embed(sentence)
            for j, token in enumerate(sentence):
                # print('token.embedding', token.embedding)
                flair_embedding[i][j][:] = token.embedding
            # print('flair_embedding', flair_embedding)
            # break
        return flair_embedding
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
def out_embedding(type_, model, n_layers, stacked=False):
    '''
    Create an embedding object of the requested type for later use.

    :param:
        :type_: (str) type of embedding (currently only BERT or Flair embeddings)
        :model: (str) pretrained model of the BERT embedding
        :n_layers: (int) number of last layers of the trained BERT embedding to use
        :stacked: (bool) whether this embedding is a combination of several embeddings (True/False)
    :return:
        :embedding: (BertEmbeddings / StackedEmbeddings) embedding object
    '''
    out_layers = ','.join([str(-i) for i in range(1, n_layers + 1)])
    if not stacked:
        if type_.lower() == 'bert':
            embedding = BertEmbeddings(bert_model_or_path=model,
                                       layers=out_layers)
            return embedding
        else:
            emb = WordEmbeddings('glove')
    else:
        emb = BertEmbeddings(bert_model_or_path=model, layers=out_layers)

    flair_forward = FlairEmbeddings('news-forward-fast')
    flair_backward = FlairEmbeddings('news-backward-fast')
    embedding = StackedEmbeddings(
        embeddings=[flair_forward, flair_backward, emb])
    return embedding
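# Hedged usage sketch (not part of the original source; the function name is
# made up): builds the stacked Flair + BERT embedding via out_embedding()
# above and embeds one sentence. Assumes the referenced pretrained models can
# be downloaded.
def _out_embedding_usage_example():
    from flair.data import Sentence

    embedding = out_embedding('bert', 'bert-base-uncased', n_layers=4,
                              stacked=True)
    sentence = Sentence('Stacked embeddings concatenate per-token vectors.')
    embedding.embed(sentence)
    # length = news-forward-fast + news-backward-fast + 4 BERT layers
    print(sentence[0].embedding.shape)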
def initialize_embeddings(self, fastbert=True, stackedembeddings=True):
    # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

    # initialize individual embeddings
    if fastbert:
        bert_embedding = BertEmbeddings('distilbert-base-uncased', layers='-1')
    else:
        bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

    if stackedembeddings:
        glove_embedding = WordEmbeddings('glove')

        # init Flair forward and backwards embeddings
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')

        embedding_types = [
            bert_embedding, glove_embedding, flair_embedding_forward,
            flair_embedding_backward
        ]
        embeddings = StackedEmbeddings(embeddings=embedding_types)
    else:
        embeddings = bert_embedding

    return embeddings
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbedding object that combines glove and forward flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])

    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )
    embedded_sentences = []
    count = 0.0
    for s in sentence_dataset:
        sentence = Sentence(s)
        # embed with the stacked GloVe + Flair embeddings defined above
        stacked_embeddings.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1

    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
def train():
    # column format - word postag label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")

    # read train, dev and test set
    # here test set is same as dev set
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testa")
    print(corpus)

    # create label dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # using glove embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings("glove"),
        CharacterEmbeddings()
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # create sequence tagger and trainer instance
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="ner",
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    model_path = os.path.join(path, "../models/")

    # commence training
    # model shall be saved in model_path under filename final-model.pt
    # this step takes at least 4 hours to complete, so please ensure access to GPU
    trainer.train(model_path,
                  learning_rate=0.1,
                  mini_batch_size=64,
                  max_epochs=3)