def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbedding object that combines GloVe and forward Flair embeddings
    # (note: it is built here but not used below; only the forward Flair model embeds each sentence)
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])

    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )
    embedded_sentences = []
    count = 0.0
    for s in sentence_dataset:
        sentence = Sentence(s)
        flair_embedding_forward.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1

    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
class Embedder:
    def __init__(self):
        self.embedder = FlairEmbeddings('news-forward-fast')
        self.embedding_length = self.__len__()

    def __len__(self):
        return self.embedder.embedding_length

    def __call__(self, sentences: np.ndarray):
        return self.embed(sentences)

    def embed(self, sentences: np.ndarray):
        if not isinstance(sentences, np.ndarray):
            raise TypeError(
                f'Expected numpy ndarray input, got {type(sentences)}')
        if sentences.ndim != 2:
            raise TypeError(
                'Expected numpy ndarray with 2 dims, try A.reshape(-1, 1)')
        sentences = [Sentence(sentence[0]) for sentence in sentences]
        self.embedder.embed(sentences)
        embeddings = []
        for sentence in sentences:
            embeddings.append(
                torch.stack([token.embedding.cpu() for token in sentence]))
        return embeddings
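# Hypothetical usage sketch for the Embedder class above (not part of the
# original snippet); the input strings are made up, and the only assumption is
# that numpy is available. It shows the required (-1, 1) input shape.
import numpy as np

embedder = Embedder()
lines = np.array(["user logged in", "connection reset by peer"]).reshape(-1, 1)
per_sentence = embedder(lines)  # one (num_tokens, embedding_length) tensor per sentence
print(len(per_sentence), per_sentence[0].shape)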
def flair_embeddings(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    embedder = FlairEmbeddings(
        "multi-forward")  # multilingual; you also have nl-forward; no French model though
    document_embedding = []
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))
        # create a sentence
        sentence = Sentence(" ".join(sent))
        # embed words in sentence
        embedder.embed(sentence)
        sentence_embedding = np.mean(
            [token.embedding.cpu().numpy() for token in sentence],
            axis=0)  # have to go from CUDA tensor to CPU tensor
        document_embedding.append(sentence_embedding)
        if output_file:
            for token in sentence:
                f.write(
                    token.text + "\t" +
                    "\t".join([str(num) for num in token.embedding.tolist()]) +
                    '\n')
    document_embedding = np.mean(document_embedding, axis=0)
    return document_embedding
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model,
                                                         corpus,
                                                         test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
def flair_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    flair_embedding_forward = FlairEmbeddings('news-forward')
    for i, (sent, sent_tokens) in enumerate(zip(sentences, tokenized_contents)):
        print(
            "Encoding the {}th input sentence for Flair embedding!".format(i))
        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]
        if len(tokens) != len(sent_tokens):
            raise ValueError("tokens length does not match sent_tokens length")
        # Create new empty sentence
        sentence = Sentence()
        # add our own tokens
        sentence.tokens = tokens
        flair_embedding_forward.embed(sentence)
        for token in sentence:
            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num) for num in token.embedding.tolist()]) +
                    '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num) for num in token.embedding.tolist()]) +
                      '\n')
        if output_file:
            # blank line between sentences in the output file
            f.write('\n')
class FlairEmbedding(EmbeddingBase):
    def __init__(self):
        self.forward_model = FlairEmbeddings("pl-forward")
        self.backward_model = FlairEmbeddings("pl-backward")
        self.size = 8192

    def _get_vector(self, forward: Sentence, backward: Sentence) -> np.ndarray:
        # average the concatenated forward/backward token embeddings over the sentence
        res = np.zeros(self.size, dtype=np.float32)
        for idx in range(len(forward)):
            out_fwd = np.fromiter(forward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out_bwd = np.fromiter(backward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out = np.hstack((out_fwd, out_bwd))
            res += out
        res /= len(forward)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        # fall back to a single '.' sentence for empty inputs
        batch = [
            Sentence(" ".join(sent)) if sent != [] else Sentence(".")
            for sent in batch
        ]
        embeddings = []
        outputs_forward = self.forward_model.embed(batch)
        outputs_backward = self.backward_model.embed(batch)
        for forward, backward in zip(outputs_forward, outputs_backward):
            embeddings.append(self._get_vector(forward, backward))
        embeddings = np.vstack(embeddings)
        return embeddings

    def dim(self) -> int:
        return self.size
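# Hypothetical usage sketch for the FlairEmbedding class above (not part of the
# original snippet). It assumes EmbeddingBase needs no constructor arguments;
# the SentEval-style `params` argument is unused by batcher, so None is passed,
# and the token lists are made up for illustration.
embedding = FlairEmbedding()
batch = [["this", "is", "a", "test", "."], []]
vectors = embedding.batcher(None, batch)
print(vectors.shape)  # (2, embedding.dim())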
def load_and_apply_char_lm_embeddings(emb_type):
    text = u'I love Berlin.'
    sentence = Sentence(text)
    embeddings = FlairEmbeddings(emb_type)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
class SentenceFlairEmbedderSensor(SentenceSensor):
    def __init__(self, *pres):
        super().__init__(*pres)
        self.flair_embedding_backward = FlairEmbeddings('news-backward')

    def forward(self) -> Any:
        self.flair_embedding_backward.embed(self.fetch_value(self.sentence_value))
        return None
class FlairEncoder(BaseTextEncoder):
    is_trained = True

    def __init__(self,
                 model_name: str = 'multi-forward-fast',
                 pooling_strategy: str = 'REDUCE_MEAN',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.model_name = model_name
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from flair.embeddings import FlairEmbeddings
        self._flair = FlairEmbeddings(self.model_name)

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        from flair.data import Sentence

        # tokenize text
        batch_tokens = [Sentence(sent) for sent in text]
        flair_encodes = self._flair.embed(batch_tokens)

        pooled_data = []
        for sentence in flair_encodes:
            _layer_data = np.stack([s.embedding.numpy() for s in sentence])
            _pooled = pooling_np(_layer_data, self.pooling_strategy)
            pooled_data.append(_pooled)
        return np.array(pooled_data, dtype=np.float32)
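# Hypothetical usage sketch for FlairEncoder above (not part of the original
# snippet). The BaseTextEncoder framework normally drives post_init itself; it
# is called directly here only so the sketch is self-contained, and the input
# sentences are made up.
encoder = FlairEncoder(model_name='multi-forward-fast',
                       pooling_strategy='REDUCE_MEAN')
encoder.post_init()  # loads the FlairEmbeddings model
vectors = encoder.encode(['the grass is green .', 'the sky is blue .'])
print(vectors.shape)  # (num_sentences, embedding_length)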
def get_flair_embeddings(sentence_list):
    flair_sentence_object_list = []
    for sentence_string in sentence_list:
        # append ' .' so every sentence ends with a sentence-final token
        sentence_string = str(sentence_string) + " ."
        flair_sentence_object_list.append(Sentence(sentence_string))

    numpy_embedding_list = []
    flair_embedding_forward = FlairEmbeddings(modelconstants.FLAIR_MODEL_NAME)
    for sentence_object in flair_sentence_object_list:
        flair_embedding_forward.embed(sentence_object)
        composite_vector = [
            0.0 for _ in range(flair_embedding_forward.embedding_length)
        ]
        for token in sentence_object:
            token_embedding = token.embedding.numpy()
            # running pairwise average of token embeddings (note: this weights
            # later tokens more heavily than a plain mean would)
            composite_vector = (np.array(composite_vector) +
                                np.array(token_embedding)) / 2.0
        numpy_embedding_list.append(composite_vector)
    return np.array(numpy_embedding_list)
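# Hypothetical usage sketch for get_flair_embeddings above (not part of the
# original snippet); it assumes modelconstants.FLAIR_MODEL_NAME points to a
# valid Flair model such as 'news-forward', and the input sentences are made up.
sentence_vectors = get_flair_embeddings(["the grass is green", "the sky is blue"])
print(sentence_vectors.shape)  # (2, embedding_length)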
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load(u'chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus((resources_path / u'corpora/lorem_ipsum'),
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / u'best-lm.pt'))
    sentence = Sentence(u'I love Berlin')
    char_lm_embeddings.embed(sentence)
    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100
    shutil.rmtree(results_base_path, ignore_errors=True)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run FastText embedding
fasttext_embedding_forward = WordEmbeddings('model/zh-wiki-fasttext-300d-1M')
sentence = Sentence('The grass is green .')
fasttext_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run Flair embedding
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run BERT embedding
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run ELMo embedding
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)
corpus = pickle.load(open(inputFileName, 'rb'))

# In[5]:

vectors = {}

# In[6]:

for d in corpus:
    print("processing ", d)
    totLen = len(corpus[d]['text'])
    for i, s in enumerate(corpus[d]['text']):
        if i % 10 == 0:
            print("processed {}/{} ".format(i, totLen), end='\r')
        sentence = Sentence(s)
        char_lm_embeddings.embed(sentence)
        for token in sentence:
            if token.text not in vectors:
                # store the token followed by its embedding values, space-separated
                string = token.text
                for v in token.embedding.cpu().numpy():
                    string += ' {}'.format(v)
                vectors[token.text] = string
    print("processed {}/{} ".format(i, totLen))

# In[11]:

with open(outputFileName, 'wt') as f:
    for k in vectors:
        f.write(vectors[k])
        f.write("\n")
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

# example Russian sentence for the custom-trained language model
sentence = Sentence(
    'расчете на душу населения ниже среднекраевого уровня приход на потребительский рынок г Алейска '
    'крупных торговых сетей г Барнаула Мария-Ра Аникс Новэкс Магнит Холди и др отрицательно влияет на '
    'динамику оборота розничной торговли района в районе не развито бытовое обслуживание населения '
    'отсутствуют комплексные приемные пункты')

# init embeddings from your trained LM
char_lm_embeddings = FlairEmbeddings(
    'resources/taggers/language_model/best-lm.pt')

# embed sentence
print(char_lm_embeddings.embed(sentence))
class FlairEmbedder(nn.Module, ClassNursery, BaseEmbedder):
    def __init__(
        self,
        embedding_type: str,
        datasets_manager: DatasetsManager = None,
        device: Union[str, torch.device] = "cpu",
        word_tokens_namespace: str = "tokens",
    ):
        """ Flair embeddings, commonly used for Named Entity Recognition.

        Note: this only works if your tokens are produced by splitting on whitespace.

        Parameters
        ----------
        embedding_type : str
            One of the allowed types ("en" or "news"); selects the forward and backward Flair models.
        datasets_manager : DatasetsManager
            Optional datasets manager.
        device : Union[str, torch.device]
            Device on which the embeddings are placed.
        word_tokens_namespace : str
            Namespace under which the word tokens of a line are stored.
        """
        super(FlairEmbedder, self).__init__()
        self.allowed_type = ["en", "news"]
        assert embedding_type in self.allowed_type
        self.embedder_forward = FlairEmbeddings(f"{embedding_type}-forward")
        self.embedder_backward = FlairEmbeddings(f"{embedding_type}-backward")
        self.embedder_name = f"FlairEmbedder-{embedding_type}"
        self.datasets_manager = datasets_manager
        self.device = torch.device(device) if isinstance(device, str) else device
        self.word_tokens_namespace = word_tokens_namespace

    def forward(self, lines: List[Line]):
        sentences = []
        for line in lines:
            sentence = Sentence(line.text)
            sentences.append(sentence)

        len_tokens = [len(line.tokens[self.word_tokens_namespace]) for line in lines]
        max_len = max(len_tokens)

        _ = self.embedder_forward.embed(sentences)
        _ = self.embedder_backward.embed(sentences)

        batch_embeddings = []
        for sentence in sentences:
            sentence_embeddings = []
            padding_length = max_len - len(sentence)
            for token in sentence:
                embedding = token.get_embedding()
                embedding = embedding.to(self.device)
                sentence_embeddings.append(embedding)
            # pad shorter sentences with random vectors up to max_len
            for i in range(padding_length):
                embedding = torch.randn(
                    self.get_embedding_dimension(),
                    dtype=torch.float,
                    device=self.device,
                )
                sentence_embeddings.append(embedding)
            sentence_embeddings = torch.stack(sentence_embeddings)
            batch_embeddings.append(sentence_embeddings)

        # batch_size, num_tokens, embedding_dim
        batch_embeddings = torch.stack(batch_embeddings)
        batch_embeddings = batch_embeddings.to(self.device)

        for idx, line in enumerate(lines):
            line_embeddings = batch_embeddings[idx]
            for token, emb in zip(
                line.tokens[self.word_tokens_namespace], line_embeddings
            ):
                token.set_embedding(name=self.embedder_name, value=emb)

        return batch_embeddings

    def get_embedding_dimension(self):
        return self.embedder_forward.embedding_length * 2  # for forward and backward
class FlairEmbedding(ContextualEmbedding):
    def __init__(self,
                 vocab: Vocabulary,
                 model_dir_or_name: str = 'en-base-uncased',
                 layers: str = '-1',
                 pool_method: str = 'first',
                 word_dropout=0,
                 dropout=0,
                 include_cls_sep: bool = False,
                 pooled_cls=True,
                 requires_grad: bool = True,
                 auto_truncate: bool = False,
                 **kwargs):
        super(FlairEmbedding, self).__init__(vocab,
                                             word_dropout=word_dropout,
                                             dropout=dropout)

        if word_dropout > 0:
            assert vocab.unknown is not None, "When word_dropout>0, Vocabulary must contain the unknown token."

        self._word_sep_index = -100
        if '[SEP]' in vocab:
            self._word_sep_index = vocab['[SEP]']
        self._word_cls_index = -100
        if '[CLS]' in vocab:
            self._word_cls_index = vocab['[CLS]']

        self.vocab = vocab
        self.model = FlairEmbeddings(model=model_dir_or_name, fine_tune=False)
        self.requires_grad = requires_grad
        self._embed_size = self.model.embedding_length

    def _delete_model_weights(self):
        del self.model

    def forward(self, words):
        max_length = words.shape[1]
        words = self.drop_word(words)
        words_sentences = []
        for sentence in words:
            # map word ids back to strings, skipping padding (index 0)
            words_sentences.append([
                self.vocab.idx2word[word.item()] for word in sentence
                if word.item() != 0
            ])
        words = [Sentence(' '.join(x)) for x in words_sentences]
        self.model.embed(words)
        # pad each sentence with zero vectors up to max_length
        outputs = torch.stack([
            torch.stack([x.embedding for x in y] +
                        (max_length - len(y)) *
                        [torch.zeros(self._embed_size).to(next(self.parameters()).device)])
            for y in words
        ])
        del words
        del words_sentences
        torch.cuda.empty_cache()
        return self.dropout(outputs)

    def drop_word(self, words):
        """Randomly replace non-special tokens with the unknown token, with probability word_dropout."""
        if self.word_dropout > 0 and self.training:
            with torch.no_grad():
                mask = torch.full_like(words,
                                       fill_value=self.word_dropout,
                                       dtype=torch.float,
                                       device=words.device)
                mask = torch.bernoulli(mask).eq(1)
                pad_mask = words.ne(self._word_pad_index)
                mask = pad_mask.__and__(mask)
                if self._word_sep_index != -100:
                    not_sep_mask = words.ne(self._word_sep_index)
                    mask = mask.__and__(not_sep_mask)
                if self._word_cls_index != -100:
                    not_cls_mask = words.ne(self._word_cls_index)
                    mask = mask.__and__(not_cls_mask)
                words = words.masked_fill(mask, self._word_unk_index)
        return words