def get_flair_embeddings():
    jw_forward: FlairEmbeddings = FlairEmbeddings("multi-forward", chars_per_chunk=128)
    jw_backward: FlairEmbeddings = FlairEmbeddings("multi-backward", chars_per_chunk=128)
    embeddings: list = [jw_forward, jw_backward]
    return embeddings
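# A minimal usage sketch for get_flair_embeddings(), assuming flair is installed;
# "multi-forward"/"multi-backward" are the pretrained multilingual character LMs,
# and the sentence text is illustrative only:
from flair.data import Sentence
from flair.embeddings import StackedEmbeddings

stacked = StackedEmbeddings(embeddings=get_flair_embeddings())
sentence = Sentence("George Washington est allé à Washington .")
stacked.embed(sentence)  # each Token now carries the concatenated forward+backward vector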
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
def train():
    # load training data in FastText format
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='./data/test.txt',
        train_file='./data/train.txt')

    # Combine different embeddings:
    # GloVe word embeddings + Flair contextual string embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]

    # use an LSTM-based method for combining the different embeddings
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./models', max_epochs=10)
def build_train_sequence_tagger(corpus, tag_dictionary, params: Params, TAG_TYPE="ner"):
    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings("glove"),
        FlairEmbeddings("news-forward"),
        FlairEmbeddings("news-backward"),
    ])
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=TAG_TYPE,
    )
    # rebuild the corpus with an empty test split so training never touches it
    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        "flair_checkpoints",
        train_with_dev=False,
        max_epochs=params.max_epochs,
        save_final_model=False,
    )  # original
    return tagger
def download_flair_models():
    # instantiating each embedding triggers a one-time download into the local cache
    w = WordEmbeddings("en-crawl")
    w = WordEmbeddings("news")
    w = FlairEmbeddings("news-forward-fast")
    w = FlairEmbeddings("news-backward-fast")
    w = FlairEmbeddings("mix-forward")
    w = BertEmbeddings("bert-base-uncased")
def get_scibert_flair_embeddings():
    return [
        TransformerWordEmbeddings(model="allenai/scibert_scivocab_uncased", fine_tune=True),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
def main():
    args = parse_args()

    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(args.data_dir, columns,
                                  train_file=args.train_file,
                                  dev_file=args.dev_file,
                                  test_file=args.test_file)
    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('crawl'),
        FlairEmbeddings(args.forward_flair_embeddings),
        FlairEmbeddings(args.backward_flair_embeddings),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        print('***** Plotting learning rate')
        # 7a. Find learning rate
        learning_rate_tsv = trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)
    else:
        print('***** Running train')
        # 7b. Run training
        trainer.train(
            'temp',
            learning_rate=0.1,
            mini_batch_size=MINI_BATCH_SIZE,
            # it's a big dataset so maybe set embeddings_in_memory to False
            embeddings_storage_mode='none')
        tag_and_output(corpus.test, tagger,
                       os.path.join(args.data_dir, args.test_output_file),
                       tag_type)
def flair_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')

    # init embedding
    flair_embedding_forward = FlairEmbeddings('news-forward')

    for i, (sent, sent_tokens) in enumerate(zip(sentences, tokenized_contents)):
        print("Encoding the {}th input sentence for Flair embedding!".format(i))

        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]
        if len(tokens) != len(sent_tokens):
            raise ValueError("tokens length does not match sent_tokens length")

        # Create a new empty sentence and add our own tokens
        sentence = Sentence()
        sentence.tokens = tokens

        flair_embedding_forward.embed(sentence)
        for token in sentence:
            if output_file:
                f.write(token.text + " " +
                        " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
        if output_file:  # only write the sentence separator when a file is open
            f.write('\n')
    if output_file:
        f.close()
def generate_topics_on_series(series):
    """Embed each text in `series` with pooled BERT + Flair document embeddings.

    Adapted from https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        np.ndarray: one 7168-dim embedding per row of the input series.
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor (3072 BERT dims + 2 x 2048 Flair dims = 7168)
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill tensor with embeddings
    i = 0
    for text in tqdm(series):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        embedding = sentence.get_embedding()
        X[i] = embedding
        i += 1

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()
    return X
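# A minimal usage sketch for generate_topics_on_series. It assumes a CUDA device,
# since the tensor above is allocated with .cuda(), and that validate_text comes
# from the surrounding module; the example texts are illustrative only:
import pandas as pd

texts = pd.Series(["hospitals report new cases", "vaccine trials continue"])
X = generate_topics_on_series(texts)  # numpy array of shape (2, 7168)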
def __init__(
    self,
    embedding_type: str,
    datasets_manager: DatasetsManager = None,
    device: Union[str, torch.device] = "cpu",
    word_tokens_namespace: str = "tokens",
):
    """Flair embeddings, typically used for Named Entity Recognition.

    Note: this only works if your tokens are produced by splitting on whitespace.

    Parameters
    ----------
    embedding_type : str
        One of the allowed Flair model prefixes ("en" or "news").
    datasets_manager : DatasetsManager
        Manager providing the datasets to embed.
    device : Union[str, torch.device]
        Device on which the embeddings are computed.
    word_tokens_namespace : str
        Namespace under which word tokens are stored.
    """
    super(FlairEmbedder, self).__init__()
    self.allowed_type = ["en", "news"]
    assert embedding_type in self.allowed_type
    self.embedder_forward = FlairEmbeddings(f"{embedding_type}-forward")
    self.embedder_backward = FlairEmbeddings(f"{embedding_type}-backward")
    self.embedder_name = f"FlairEmbedder-{embedding_type}"
    self.datasets_manager = datasets_manager
    self.device = torch.device(device) if isinstance(device, str) else device
    self.word_tokens_namespace = word_tokens_namespace
def test_visualize(resources_path):
    with open(resources_path / 'visual/snippet.txt') as f:
        sentences = [x for x in f.read().split('\n') if x]
    sentences = [Sentence(x) for x in sentences]

    embeddings = FlairEmbeddings('news-forward')
    visualizer = Visualizer()
    X_forward = visualizer.prepare_char_embeddings(embeddings, sentences)

    embeddings = FlairEmbeddings('news-backward')
    X_backward = visualizer.prepare_char_embeddings(embeddings, sentences)

    X = numpy.concatenate([X_forward, X_backward], axis=1)
    contexts = visualizer.char_contexts(sentences)

    trans_ = tSNE()
    reduced = trans_.fit(X)
    visualizer.visualize(reduced, contexts,
                         str(resources_path / 'visual/char_embeddings.html'))

    # clean up directory
    (resources_path / 'visual/char_embeddings.html').unlink()
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased',
             layers: str = '-1', pool_method: str = 'first', word_dropout=0,
             dropout=0, include_cls_sep: bool = False, pooled_cls=True,
             requires_grad: bool = True, auto_truncate: bool = False, **kwargs):
    super(FlairEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

    if word_dropout > 0:
        assert vocab.unknown is not None, "When word_dropout > 0, Vocabulary must contain the unknown token."

    self._word_sep_index = -100
    if '[SEP]' in vocab:
        self._word_sep_index = vocab['[SEP]']
    self._word_cls_index = -100
    if '[CLS]' in vocab:
        self._word_cls_index = vocab['[CLS]']

    self.vocab = vocab
    self.model = FlairEmbeddings(model=model_dir_or_name, fine_tune=False)
    self.requires_grad = requires_grad
    self._embed_size = self.model.embedding_length
class FlairEmbedding(EmbeddingBase):
    def __init__(self):
        self.forward_model = FlairEmbeddings("pl-forward")
        self.backward_model = FlairEmbeddings("pl-backward")
        self.size = 8192

    def _get_vector(self, forward: Sentence, backward: Sentence) -> np.ndarray:
        # mean-pool the concatenated forward/backward token embeddings
        res = np.zeros(self.size, dtype=np.float32)
        for idx in range(len(forward)):
            out_fwd = np.fromiter(forward.tokens[idx].embedding.tolist(), dtype=np.float32)
            out_bwd = np.fromiter(backward.tokens[idx].embedding.tolist(), dtype=np.float32)
            out = np.hstack((out_fwd, out_bwd))
            res += out
        res /= len(forward)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        texts = [" ".join(sent) if sent else "." for sent in batch]
        # embed the two models on separate Sentence objects: Flair accumulates
        # embeddings on Token instances, so reusing one batch would concatenate
        # forward and backward vectors onto the same tokens
        forward_batch = [Sentence(text) for text in texts]
        backward_batch = [Sentence(text) for text in texts]
        self.forward_model.embed(forward_batch)
        self.backward_model.embed(backward_batch)
        embeddings = [self._get_vector(fwd, bwd)
                      for fwd, bwd in zip(forward_batch, backward_batch)]
        return np.vstack(embeddings)

    def dim(self) -> int:
        return self.size
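# A minimal usage sketch for the SentEval-style batcher above (params is unused by
# this implementation, so passing None is fine; the Polish tokens are illustrative):
embedder = FlairEmbedding()
vectors = embedder.batcher(params=None, batch=[["Ala", "ma", "kota"], []])
assert vectors.shape == (2, embedder.dim())  # one 8192-dim vector per input sentence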
def train():
    corpus: Corpus = ClassificationCorpus(sst_folder,
                                          test_file='test.csv',
                                          dev_file='dev.csv',
                                          train_file='sst_dev.csv')
    label_dict = corpus.make_label_dictionary()

    stacked_embedding = WordEmbeddings('glove')

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(filter(None, [
        stacked_embedding,
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(model_path, max_epochs=10, train_with_dev=False)
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)

    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
        StackedEmbeddings([WordEmbeddings('glove')]),
        StackedEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ])
    ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    optimizer = SequenceTaggerParamSelector(corpus, 'ner', results_base_path, max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    shutil.rmtree(results_base_path)
def train(self):
    path = "./src/tmp/"
    self.training_data = self.convert_format(self.training_data)
    corpus: Corpus = ColumnCorpus(".", {0: 'text', 1: 'ner'},
                                  train_file=self.training_data)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=True)
    self.trainer = ModelTrainer(tagger, corpus)
    save_path = path + self.model_name
    self.trainer.train(save_path,
                       learning_rate=self.learning_rate,
                       mini_batch_size=self.batch_size,
                       max_epochs=self.nb_iter,
                       embeddings_storage_mode=self.mode)
    self.is_ready = 1
class Embedder:
    def __init__(self):
        self.embedder = FlairEmbeddings('news-forward-fast')
        self.embedding_length = self.__len__()

    def __len__(self):
        return self.embedder.embedding_length

    def __call__(self, sentences: np.ndarray):
        return self.embed(sentences)

    def embed(self, sentences: np.ndarray):
        if not isinstance(sentences, np.ndarray):
            raise TypeError(f'Expected numpy ndarray input, got {type(sentences)}')
        if sentences.ndim != 2:
            raise TypeError('Expected numpy ndarray with 2 dims; try A.reshape(-1, 1)')
        sentences = [Sentence(sentence[0]) for sentence in sentences]
        self.embedder.embed(sentences)
        embeddings = []
        for sentence in sentences:
            embeddings.append(torch.stack([token.embedding.cpu() for token in sentence]))
        return embeddings
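# A minimal usage sketch for Embedder (the reshape mirrors the error message above;
# the example sentences are illustrative only):
import numpy as np

embedder = Embedder()
sentences = np.array(["a first sentence", "a second one"]).reshape(-1, 1)
token_embeddings = embedder(sentences)  # one (num_tokens, embedding_length) tensor per row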
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
def test_generate_text_with_small_temperatures():
    from flair.embeddings import FlairEmbeddings

    language_model = FlairEmbeddings('news-forward-fast').lm
    text, likelihood = language_model.generate_text(temperature=0.01,
                                                    number_of_characters=100)
    assert text is not None
    assert len(text) >= 100
def train():
    columns = {0: 'text', 1: 'pos'}

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(args.model, learning_rate=0.1, mini_batch_size=32, max_epochs=150)
def _get_embedding_model(
    model_name_or_path: Union[str, HFModelResult, FlairModelResult]
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings, Sentence]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if isinstance(model_name_or_path, FlairModelResult):
        # drop the 'flairNLP/' org prefix if present (str.strip would remove
        # individual characters, not the prefix)
        nm = model_name_or_path.name.removeprefix('flairNLP/')
        try:
            return WordEmbeddings(nm)
        except Exception:
            return FlairEmbeddings(nm)
    elif isinstance(model_name_or_path, HFModelResult):
        return TransformerWordEmbeddings(model_name_or_path.name)
    else:
        res = _flair_hub.search_model_by_name(model_name_or_path, user_uploaded=True)
        if len(res) < 1:
            # No Flair models found, fall back to the HuggingFace hub
            res = _hf_hub.search_model_by_name(model_name_or_path, user_uploaded=True)
            if len(res) < 1:
                raise ValueError(
                    f'Embeddings not found for the model key: {model_name_or_path}, '
                    'check documentation or custom model path to verify specified model'
                )
            else:
                # Returning the first should always be the non-fast option
                return TransformerWordEmbeddings(res[0].name)
        else:
            nm = res[0].name.removeprefix('flairNLP/')
            try:
                return WordEmbeddings(nm)
            except Exception:
                return FlairEmbeddings(nm)
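# A minimal usage sketch for _get_embedding_model: a plain string key falls through
# to the hub searches above (the model name is illustrative only):
embeddings = _get_embedding_model('bert-base-cased')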
def __init__(self, parent_dir_dataset: str, dataset_name: str, parent_dir_data: str,
             parent_dir_model: str, sentiment_model_dir: str,
             word_embeddings: list = None):
    self.parent_dir_dataset = parent_dir_dataset
    self.dataset_name = dataset_name
    self.parent_dir_data = parent_dir_data
    self.parent_dir_model = parent_dir_model
    self.sentiment_model_dir = sentiment_model_dir
    self.dataset_filepath = os.path.join(self.parent_dir_dataset, self.dataset_name)
    self.model_filepath = os.path.join(self.parent_dir_model, self.sentiment_model_dir)
    self.train_filename = os.path.join(self.parent_dir_data, "train.csv")
    self.test_filename = os.path.join(self.parent_dir_data, "test.csv")
    self.dev_filename = os.path.join(self.parent_dir_data, "dev.csv")
    self.column_name_map = {}
    if word_embeddings is None:
        self.word_embeddings = [
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ]
    else:
        self.word_embeddings = word_embeddings
    self.corpus = None
    self.document_RNNEmbeddings = None
    self.label = gv.label_sarcasm
    self.renamed_columns = gv.sarcasm_renamed_columns
    if gv.logger is None:
        gv.init_logger_object()
def initialize_embeddings(self, fastbert=True, stackedembeddings=True):
    # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

    # initialize individual embeddings
    if fastbert:
        bert_embedding = BertEmbeddings('distilbert-base-uncased', layers='-1')
    else:
        bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

    if stackedembeddings:
        glove_embedding = WordEmbeddings('glove')
        # init Flair forward and backward embeddings
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')
        embedding_types = [
            bert_embedding,
            glove_embedding,
            flair_embedding_forward,
            flair_embedding_backward
        ]
        embeddings = StackedEmbeddings(embeddings=embedding_types)
    else:
        embeddings = bert_embedding
    return embeddings
def create_embeddings(self) -> StackedEmbeddings:
    embedding_types: List[TokenEmbeddings] = []
    if self.config['use_word_embeddings']:
        embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))
    if self.config['use_char_embeddings']:
        embedding_types.append(CharacterEmbeddings())
    if self.config['use_flair_embeddings']:
        embedding_types.append(FlairEmbeddings('es-clinical-forward'))
        embedding_types.append(FlairEmbeddings('es-clinical-backward'))
    if self.config['use_beto_embeddings']:
        embedding_types.append(
            TransformerWordEmbeddings(
                'dccuchile/bert-base-spanish-wwm-cased',
                layers=self.config['layers'],
                layer_mean=self.config['layer_mean'],
                subtoken_pooling=self.config['subtoken_pooling']))
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    return embeddings
def out_embedding(type_, model, n_layers, stacked=False):
    """Create an embedding object for later use.

    :param type_: (str) type of embedding (currently only BERT or Flair embeddings)
    :param model: (str) pretrained model for the BERT embedding
    :param n_layers: (int) number of last layers of the trained BERT embedding to use
    :param stacked: (bool) whether this embedding is a combination of several embeddings
    :return: (BertEmbeddings / StackedEmbeddings) embedding object
    """
    out_layers = ','.join([str(-i) for i in range(1, n_layers + 1)])
    if not stacked:
        if type_.lower() == 'bert':
            embedding = BertEmbeddings(bert_model_or_path=model, layers=out_layers)
            return embedding
        else:
            emb = WordEmbeddings('glove')
    else:
        emb = BertEmbeddings(bert_model_or_path=model, layers=out_layers)
    flair_forward = FlairEmbeddings('news-forward-fast')
    flair_backward = FlairEmbeddings('news-backward-fast')
    embedding = StackedEmbeddings(
        embeddings=[flair_forward, flair_backward, emb])
    return embedding
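# A minimal usage sketch for out_embedding, exercising the plain (non-stacked) BERT
# branch (the model name is illustrative only):
embedding = out_embedding(type_='bert', model='bert-base-cased', n_layers=4)
# -> BertEmbeddings reading layers '-1,-2,-3,-4'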
def flair_embeddings(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')

    # multilingual; there is also nl-forward, but no dedicated French model
    embedder = FlairEmbeddings("multi-forward")

    document_embedding = []
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))

        # create a sentence
        sentence = Sentence(" ".join(sent))
        # embed words in sentence
        embedder.embed(sentence)
        # have to move from CUDA tensor to CPU tensor before converting to numpy
        sentence_embedding = np.mean(
            [token.embedding.cpu().numpy() for token in sentence], axis=0)
        document_embedding.append(sentence_embedding)

        if output_file:
            for token in sentence:
                f.write(token.text + "\t" +
                        "\t".join([str(num) for num in token.embedding.tolist()]) + '\n')
    if output_file:
        f.close()

    document_embedding = np.mean(document_embedding, axis=0)
    return document_embedding
def __init__(self, num_classes: int = 2, bidirectional: bool = False,
             rnn_layers: int = 1, hidden_size: int = 256, rnn_type: str = 'GRU'):
    super(ATAE_LSTM, self).__init__()
    self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ])
    self.wordembeddings: StackedEmbeddings = StackedEmbeddings([WordEmbeddings('glove')])
    self.embedding_dimension: int = (self.stackedembeddings.embedding_length +
                                     self.wordembeddings.embedding_length)
    self.bidirectional: bool = bidirectional
    self.rnn_layers: int = rnn_layers
    self.rnn_type: str = rnn_type
    self.num_classes: int = num_classes
    self.hidden_size: int = hidden_size
    if self.rnn_type == 'GRU':
        self.rnn = torch.nn.GRU(self.embedding_dimension,
                                self.hidden_size,
                                bidirectional=self.bidirectional,
                                num_layers=self.rnn_layers)
    else:
        self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                 self.hidden_size,
                                 bidirectional=self.bidirectional,
                                 num_layers=self.rnn_layers)
    self.attention = Attention()
def load_context_embeddings_with_flair(direction='bi', word_embeddings=True,
                                       cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Load Danish Flair embeddings, optionally stacked with fastText word embeddings.

    :param direction: 'fwd', 'bwd' or 'bi' for forward, backward or both
    :param word_embeddings: whether to include the 'da' fastText word embeddings
    :param cache_dir: directory for cached model downloads
    :param verbose: print download progress
    """
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import WordEmbeddings
    from flair.embeddings import StackedEmbeddings

    embeddings = []

    if word_embeddings:
        fasttext_embedding = WordEmbeddings('da')
        embeddings.append(fasttext_embedding)

    if direction == 'bi' or direction == 'fwd':
        fwd_weight_path = download_model('flair.fwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(fwd_weight_path))

    if direction == 'bi' or direction == 'bwd':
        bwd_weight_path = download_model('flair.bwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(bwd_weight_path))

    if len(embeddings) == 1:
        return embeddings[0]

    return StackedEmbeddings(embeddings=embeddings)
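# A minimal usage sketch for the loader above (assumes the danlp-style
# download_model and _unzip_process_func helpers exist in the surrounding package):
from flair.data import Sentence

embeddings = load_context_embeddings_with_flair(direction='bi')
sentence = Sentence('Det er en dansk sætning .')
embeddings.embed(sentence)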
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])

    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )
    embedded_sentences = []
    count = 0
    for s in sentence_dataset:
        sentence = Sentence(s)
        # note: only the forward Flair embedding is applied here; the stacked
        # object built above is left unused
        flair_embedding_forward.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1

    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
def optimize_lr():
    corpus, label_dictionary = load_corpus()
    embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256,
                                                bidirectional=True)
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dictionary,
                                multi_label=False)
    trainer = ModelTrainer(classifier, corpus)

    # 7. find learning rate
    learning_rate_tsv = trainer.find_learning_rate('resources/classifiers/',
                                                   'learning_rate.tsv')

    # 8. plot the learning rate finder curve
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_learning_rate(learning_rate_tsv)