Example #1
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbeddings object that combines GloVe and forward Flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])
    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )

    embedded_sentences = []
    for count, s in enumerate(sentence_dataset):
        sentence = Sentence(s)
        stacked_embeddings.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0:
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))

    # collect every token's embedding vector across all embedded sentences
    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
Example #2
class Embedder:
    def __init__(self):
        self.embedder = FlairEmbeddings('news-forward-fast')
        self.embedding_length = self.__len__()

    def __len__(self):
        return self.embedder.embedding_length

    def __call__(self, sentences: np.ndarray):
        return self.embed(sentences)

    def embed(self, sentences: np.ndarray):
        if not isinstance(sentences, np.ndarray):
            raise TypeError(
                f'Expected numpy ndarray input, got {type(sentences)}')

        if sentences.ndim != 2:
            raise TypeError(
                'Expected numpy ndarray with 2 dims; try sentences.reshape(-1, 1)')

        sentences = [Sentence(sentence[0]) for sentence in sentences]

        self.embedder.embed(sentences)

        embeddings = []
        for sentence in sentences:
            embeddings.append(
                torch.stack([token.embedding.cpu() for token in sentence]))

        return embeddings
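A minimal usage sketch for the Embedder class above, assuming flair and numpy are installed and the 'news-forward-fast' model downloads successfully; the sample sentences are illustrative only.

import numpy as np

embedder = Embedder()
sentences = np.array(['The grass is green .', 'I love Berlin .']).reshape(-1, 1)
token_embeddings = embedder(sentences)  # one (num_tokens, embedding_length) tensor per sentence
print(embedder.embedding_length, token_embeddings[0].shape)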
def flair_embeddings(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')

    embedder = FlairEmbeddings(
        "multi-forward"
    )  #multilingual; you also have nl-forward; no french model though

    document_embedding = []
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))
        # create a sentence
        sentence = Sentence(" ".join(sent))

        # embed words in sentence
        embedder.embed(sentence)
        sentence_embedding = np.mean(
            [token.embedding.cpu().numpy() for token in sentence],
            axis=0)  #have to go from CUDA tensor to cpu tensor
        document_embedding.append(sentence_embedding)

        if output_file:
            for token in sentence:
                f.write(
                    token.text + "\t" +
                    "\t".join([str(num)
                               for num in token.embedding.tolist()]) + '\n')
    document_embedding = np.mean(document_embedding, axis=0)
    if output_file:
        f.close()
    return document_embedding
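A short usage sketch for the flair_embeddings helper above: it expects pre-tokenized sentences (lists of strings) and returns their mean document vector; the token lists here are illustrative.

tokenized = [['The', 'grass', 'is', 'green', '.'],
             ['I', 'love', 'Berlin', '.']]
doc_vector = flair_embeddings(tokenized)  # no output_file, so nothing is written to disk
print(doc_vector.shape)                   # (embedding_length,)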
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #5
def flair_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    flair_embedding_forward = FlairEmbeddings('news-forward')
    for i, (sent, sent_tokens) in enumerate(zip(sentences,
                                                tokenized_contents)):
        print(
            "Encoding the {}th input sentence for Flair embedding!".format(i))
        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        if len(tokens) != len(sent_tokens):
            raise ValueError("tokens length does not match sent_tokens length")

        # Create new empty sentence
        sentence = Sentence()

        # add our own tokens
        sentence.tokens = tokens

        flair_embedding_forward.embed(sentence)

        for token in sentence:

            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')
        if output_file:
            f.write('\n')

    if output_file:
        f.close()
class FlairEmbedding(EmbeddingBase):
    def __init__(self):
        self.forward_model = FlairEmbeddings("pl-forward")
        self.backward_model = FlairEmbeddings("pl-backward")
        self.size = 8192

    def _get_vector(self, forward: Sentence, backward: Sentence) -> np.ndarray:
        res = np.zeros(self.size, dtype=np.float32)
        for idx in range(len(forward)):
            out_fwd = np.fromiter(forward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out_bwd = np.fromiter(backward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out = np.hstack((out_fwd, out_bwd))
            res += out
        res /= len(forward)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        batch = [
            Sentence(" ".join(sent)) if sent != [] else Sentence('.') for sent in batch
        ]
        embeddings = []
        outputs_forward = self.forward_model.embed(batch)
        outputs_backward = self.backward_model.embed(batch)
        for forward, backward in zip(outputs_forward, outputs_backward):
            embeddings.append(self._get_vector(forward, backward))
        embeddings = np.vstack(embeddings)
        return embeddings

    def dim(self) -> int:
        return self.size
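A sketch of driving the batcher above, e.g. from a SentEval-style loop; it assumes EmbeddingBase needs no constructor arguments, that the pl-forward/pl-backward models are available, and that self.size matches the concatenated forward+backward embedding length. params is unused by this implementation, so None is passed.

embedder = FlairEmbedding()
batch = [['Ala', 'ma', 'kota', '.'], ['To', 'jest', 'test', '.']]
vectors = embedder.batcher(params=None, batch=batch)
print(vectors.shape)  # (2, embedder.dim())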
Example #7
def load_and_apply_char_lm_embeddings(emb_type):
    text = u'I love Berlin.'
    sentence = Sentence(text)
    embeddings = FlairEmbeddings(emb_type)
    embeddings.embed(sentence)
    for token in sentence.tokens:
        assert (len(token.get_embedding()) != 0)
        token.clear_embeddings()
        assert (len(token.get_embedding()) == 0)
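The helper above takes a pre-trained Flair LM name as emb_type; a usage sketch with two of the standard model names:

load_and_apply_char_lm_embeddings('news-forward-fast')
load_and_apply_char_lm_embeddings('news-backward-fast')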
Example #8
class SentenceFlairEmbedderSensor(SentenceSensor):
    def __init__(self, *pres):
        super().__init__(*pres)
        self.flair_embedding_backward = FlairEmbeddings('news-backward')
        
    def forward(
            self,
    ) -> Any:
        self.flair_embedding_backward.embed(self.fetch_value(self.sentence_value))
        return None
Example #9
class FlairEncoder(BaseTextEncoder):
    is_trained = True

    def __init__(self, model_name: str = 'multi-forward-fast',
                 pooling_strategy: str = 'REDUCE_MEAN', *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.model_name = model_name
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from flair.embeddings import FlairEmbeddings
        self._flair = FlairEmbeddings(self.model_name)

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        from flair.data import Sentence
        # tokenize text
        batch_tokens = [Sentence(sent) for sent in text]

        flair_encodes = self._flair.embed(batch_tokens)

        pooled_data = []
        for sentence in flair_encodes:
            _layer_data = np.stack([s.embedding.numpy() for s in sentence])
            _pooled = pooling_np(_layer_data, self.pooling_strategy)
            pooled_data.append(_pooled)
        return np.array(pooled_data, dtype=np.float32)
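pooling_np comes from the surrounding framework; with the default 'REDUCE_MEAN' strategy it presumably averages the per-token vectors into one sentence vector. A plain-numpy sketch of that assumption:

# assumed behaviour of pooling_np(_layer_data, 'REDUCE_MEAN')
_pooled = _layer_data.mean(axis=0)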
def get_flair_embeddings(sentence_list):
    flair_sentence_object_list = []
    for sentence_string in sentence_list:
        sentence_string = str(sentence_string) + " ."
        flair_sentence_object_list.append(Sentence(sentence_string))
    numpy_embedding_list = []
    flair_embedding_forward = FlairEmbeddings(modelconstants.FLAIR_MODEL_NAME)
    for sentence_object in flair_sentence_object_list:
        flair_embedding_forward.embed(sentence_object)
        composite_vector = [
            0.0 for _ in range(flair_embedding_forward.embedding_length)
        ]
        for token in sentence_object:
            token_embedding = token.embedding.numpy()
            composite_vector = (np.array(composite_vector) +
                                np.array(token_embedding)) / 2.0
        numpy_embedding_list.append(composite_vector)
    return np.array(numpy_embedding_list)
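Note that the loop above builds a running pairwise average, so later tokens carry exponentially more weight than earlier ones. If a uniform mean over tokens is wanted instead, a minimal alternative sketch:

composite_vector = np.mean(
    [token.embedding.numpy() for token in sentence_object], axis=0)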
Example #11
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load(u'chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus((resources_path / u'corpora/lorem_ipsum'),
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / u'best-lm.pt'))
    sentence = Sentence(u'I love Berlin')
    char_lm_embeddings.embed(sentence)
    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #12
for token in sentence:
    print(token)
    print(token.embedding)

# load and apply FastText embeddings
fasttext_embedding_forward = WordEmbeddings('model/zh-wiki-fasttext-300d-1M')
sentence = Sentence('The grass is green .')
fasttext_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and apply Flair embeddings
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and apply BERT embeddings
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and apply ELMo embeddings
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

corpus = pickle.load(open(inputFileName, 'rb'))

vectors = {}

for d in corpus:
    print("processing ", d)
    totLen = len(corpus[d]['text'])
    for i, s in enumerate(corpus[d]['text']):
        if i % 10 == 0:
            print("processed {}/{}        ".format(i, totLen), end='\r')
        sentence = Sentence(s)
        char_lm_embeddings.embed(sentence)
        for token in sentence:
            if token.text not in vectors:
                string = token.text
                for v in token.embedding.cpu().numpy():
                    string += ' {}'.format(v)
                vectors[token.text] = string
    print("processed {}/{}        ".format(i, totLen))

with open(outputFileName, 'wt') as f:
    for k in vectors:
        f.write(vectors[k])
        f.write("\n")
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

sentence = Sentence(
    'расчете на душу населения ниже среднекраевого уровня приход на потребительский	 рынок г Алейска '
    'крупных торговых сетей г Барнаула Мария-Ра Аникс Новэкс Магнит Холди и др отрицательно влияет на '
    'динамику оборота розничной торговли района в районе не развито бытовое обслуживание населения '
    'отсутствуют комплексные приемные пункты')

# init embeddings from your trained LM
char_lm_embeddings = FlairEmbeddings(
    'resources/taggers/language_model/best-lm.pt')

# embed sentence
print(char_lm_embeddings.embed(sentence))
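FlairEmbeddings.embed returns the embedded sentence(s); the per-token vectors can then be read from the tokens. A small follow-up sketch, assuming the custom LM at the path above has been trained:

for token in sentence:
    print(token.text, token.embedding.shape)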
Example #15
class FlairEmbedder(nn.Module, ClassNursery, BaseEmbedder):
    def __init__(
        self,
        embedding_type: str,
        datasets_manager: DatasetsManager = None,
        device: Union[str, torch.device] = "cpu",
        word_tokens_namespace: str = "tokens",
    ):
        """ Flair Embeddings. These are commonly used for Named Entity Recognition. Note: this
        only works if your tokens are produced by splitting on white space.

        Parameters
        ----------
        embedding_type
            One of the allowed types ("en" or "news"); the corresponding
            "{embedding_type}-forward" and "{embedding_type}-backward" models are loaded.
        datasets_manager
            Manager holding the datasets of the experiment (optional).
        device
            Device on which the embeddings are computed and stored.
        word_tokens_namespace
            Namespace under which the word tokens of every Line are stored.
        """
        super(FlairEmbedder, self).__init__()
        self.allowed_type = ["en", "news"]
        assert embedding_type in self.allowed_type
        self.embedder_forward = FlairEmbeddings(f"{embedding_type}-forward")
        self.embedder_backward = FlairEmbeddings(f"{embedding_type}-backward")
        self.embedder_name = f"FlairEmbedder-{embedding_type}"
        self.datasets_manager = datasets_manager
        self.device = torch.device(device) if isinstance(device, str) else device
        self.word_tokens_namespace = word_tokens_namespace

    def forward(self, lines: List[Line]):
        sentences = []
        for line in lines:
            sentence = Sentence(line.text)
            sentences.append(sentence)

        len_tokens = [len(line.tokens[self.word_tokens_namespace]) for line in lines]
        max_len = max(len_tokens)

        _ = self.embedder_forward.embed(sentences)
        _ = self.embedder_backward.embed(sentences)

        batch_embeddings = []
        for sentence in sentences:
            sentence_embeddings = []
            padding_length = max_len - len(sentence)
            for token in sentence:
                embedding = token.get_embedding()
                embedding = embedding.to(self.device)
                sentence_embeddings.append(embedding)
            # pad shorter sentences with random vectors up to the batch max length
            for i in range(padding_length):
                embedding = torch.randn(
                    self.get_embedding_dimension(),
                    dtype=torch.float,
                    device=self.device,
                )
                sentence_embeddings.append(embedding)

            sentence_embeddings = torch.stack(sentence_embeddings)
            batch_embeddings.append(sentence_embeddings)

        # batch_size, num_tokens, embedding_dim
        batch_embeddings = torch.stack(batch_embeddings)
        batch_embeddings = batch_embeddings.to(self.device)

        for idx, line in enumerate(lines):
            line_embeddings = batch_embeddings[idx]
            for token, emb in zip(
                line.tokens[self.word_tokens_namespace], line_embeddings
            ):
                token.set_embedding(name=self.embedder_name, value=emb)

        return batch_embeddings

    def get_embedding_dimension(self):
        return self.embedder_forward.embedding_length * 2  # for forward and backward
Example #16
class FlairEmbedding(ContextualEmbedding):
    def __init__(self,
                 vocab: Vocabulary,
                 model_dir_or_name: str = 'en-base-uncased',
                 layers: str = '-1',
                 pool_method: str = 'first',
                 word_dropout=0,
                 dropout=0,
                 include_cls_sep: bool = False,
                 pooled_cls=True,
                 requires_grad: bool = True,
                 auto_truncate: bool = False,
                 **kwargs):

        super(FlairEmbedding, self).__init__(vocab,
                                             word_dropout=word_dropout,
                                             dropout=dropout)

        if word_dropout > 0:
            assert vocab.unknown is not None, "When word_dropout > 0, Vocabulary must contain the unknown token."

        self._word_sep_index = -100
        if '[SEP]' in vocab:
            self._word_sep_index = vocab['[SEP]']
        self._word_cls_index = -100
        if '[CLS]' in vocab:
            self._word_cls_index = vocab['[CLS]']

        self.vocab = vocab

        self.model = FlairEmbeddings(model=model_dir_or_name, fine_tune=False)

        self.requires_grad = requires_grad
        self._embed_size = self.model.embedding_length

    def _delete_model_weights(self):
        del self.model

    def forward(self, words):

        max_length = words.shape[1]
        words = self.drop_word(words)
        words_sentences = []
        for sentence in words:
            words_sentences.append([
                self.vocab.idx2word[word.item()] for word in sentence
                if word.item() != 0
            ])

        words = [Sentence(' '.join(x)) for x in words_sentences]
        self.model.embed(words)

        # pad every sentence with zero vectors up to max_length and stack into a batch
        outputs = torch.stack([
            torch.stack([x.embedding for x in y] + (max_length - len(y)) *
                        [torch.zeros(self._embed_size).to(next(self.parameters()).device)])
            for y in words
        ])

        del words
        del words_sentences
        torch.cuda.empty_cache()

        return self.dropout(outputs)

    def drop_word(self, words):

        if self.word_dropout > 0 and self.training:
            with torch.no_grad():
                mask = torch.full_like(words,
                                       fill_value=self.word_dropout,
                                       dtype=torch.float,
                                       device=words.device)
                mask = torch.bernoulli(mask).eq(1)
                pad_mask = words.ne(self._word_pad_index)
                mask = pad_mask.__and__(mask)
                if self._word_sep_index != -100:
                    not_sep_mask = words.ne(self._word_sep_index)
                    mask = mask.__and__(not_sep_mask)
                if self._word_cls_index != -100:
                    not_cls_mask = words.ne(self._word_cls_index)
                    mask = mask.__and__(not_cls_mask)
                words = words.masked_fill(mask, self._word_unk_index)
        return words