def post_init(self):
    import flair
    flair.device = self.device
    from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
        DocumentPoolEmbeddings

    embeddings_list = []
    for e in self.embeddings:
        model_name, model_id = e.split(':', maxsplit=1)
        emb = None
        try:
            if model_name == 'flair':
                emb = FlairEmbeddings(model_id)
            elif model_name == 'pooledflair':
                emb = PooledFlairEmbeddings(model_id)
            elif model_name == 'word':
                emb = WordEmbeddings(model_id)
            elif model_name == 'byte-pair':
                emb = BytePairEmbeddings(model_id)
        except ValueError:
            self.logger.error(f'embedding not found: {e}')
            continue
        if emb is not None:
            embeddings_list.append(emb)
    if embeddings_list:
        self.model = DocumentPoolEmbeddings(embeddings_list, pooling=self.pooling_strategy)
        self.logger.info(f'flair encoder initialized with embeddings: {self.embeddings}')
    else:
        self.logger.error('flair encoder initialization failed.')
def post_init(self):
    from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
        DocumentPoolEmbeddings

    if self.model is not None:
        return
    embeddings_list = []
    for e in self.embeddings:
        model_name, model_id = e.split(':', maxsplit=1)
        emb = None
        try:
            if model_name == 'flair':
                emb = FlairEmbeddings(model_id)
            elif model_name == 'pooledflair':
                emb = PooledFlairEmbeddings(model_id)
            elif model_name == 'word':
                emb = WordEmbeddings(model_id)
            elif model_name == 'byte-pair':
                emb = BytePairEmbeddings(model_id)
        except ValueError:
            self.logger.error('embedding not found: {}'.format(e))
            continue
        if emb is not None:
            embeddings_list.append(emb)
    if embeddings_list:
        self.model = DocumentPoolEmbeddings(embeddings_list, pooling=self.pooling_strategy)
        self.logger.info('initialize flair encoder with embeddings: {}'.format(self.embeddings))
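# A minimal, hypothetical configuration sketch for the two post_init() variants
# above: each entry in self.embeddings is a '<type>:<model-id>' spec string that
# gets split on the first ':' (the exact model ids here are illustrative, not
# taken from the original).
embeddings_spec = ['word:glove', 'flair:news-forward', 'byte-pair:en']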
def __init__(self, pipeline):
    self.mode = pipeline.mode
    self.type = pipeline.embedding_type
    embedders = []
    for component in pipeline.embedders:
        if "forward" in component or "backward" in component:
            embedders.append(FlairEmbeddings(component))
        elif "glove" in component:
            embedders.append(WordEmbeddings(component))
        elif "bert" in component:
            embedders.append(BertEmbeddings(component))
        elif len(component) == 2:
            # two-letter language codes get both fastText and byte-pair embeddings, see
            # https://github.com/zalandoresearch/flair/blob/master/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md#fasttext-embeddings
            embedders.append(WordEmbeddings(component))
            embedders.append(BytePairEmbeddings(component))
        else:
            raise ValueError(f"unknown embedder: {component}")
    if self.type == "document":
        self.embedder = self._make_doc_embedder(pipeline, embedders)
    elif self.type == "word":
        self.embedder = StackedEmbeddings(embedders)
    elif self.type == "both":
        self.embedders = [
            self._make_doc_embedder(pipeline, embedders),
            StackedEmbeddings(embedders),
        ]
    else:
        raise ValueError(
            f"Inappropriate embedding type {pipeline.embedding_type}, "
            "should be 'word', 'document', or 'both'.")
def build_embedding(self, lang, embedding_codes: List[str]) -> None:
    self.tic = time.time()
    self.embedding_name: str = "-".join(embedding_codes)
    self.lang = lang
    embedding_types: List[TokenEmbeddings] = []
    for code in embedding_codes:
        code = code.lower()
        assert code in [
            "bpe", "bert", "flair", "ft", "char", "ohe", "elmo",
        ], f"{code} - Invalid embedding code"
        if code == "ohe":
            embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
        elif code == "ft":
            embedding_types.append(WordEmbeddings(self.lang))
        elif code == "bpe":
            embedding_types.append(BytePairEmbeddings(self.lang))
        elif code == "bert":
            embedding_types.append(
                TransformerWordEmbeddings(
                    model=self.huggingface_ref[self.lang],
                    pooling_operation="first",
                    layers="-1",
                    fine_tune=False,
                )
            )
        elif code == "char":
            embedding_types.append(CharacterEmbeddings())
        elif code == "flair":
            embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
            embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
        elif code == "elmo":
            embedding_types.append(
                ELMoEmbeddings(model="large", embedding_mode="all")
            )
    self.embedding: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types
    )
    self.tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=self.embedding,
        tag_dictionary=self.tag_dictionary,
        tag_type=self.tag_type,
        use_crf=True,
    )
    self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
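# A minimal, hypothetical call of build_embedding above (assumes `exp` is an
# instance of the surrounding experiment class with corpus, tag_dictionary and
# tag_type already prepared; the output path and epoch count are illustrative):
# stack English fastText word embeddings with forward/backward Flair LMs, then train.
exp.build_embedding("en", ["ft", "flair"])
exp.trainer.train("taggers/en-ft-flair", max_epochs=10)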
def load_model():
    """Load word embeddings model."""
    fasttext_embedding = WordEmbeddings('en-crawl')
    byte_embedding = BytePairEmbeddings('en')
    flair_embedding_forward = FlairEmbeddings('en-forward')
    flair_embedding_backward = FlairEmbeddings('en-backward')
    document_embeddings = DocumentPoolEmbeddings([fasttext_embedding,
                                                  byte_embedding,
                                                  flair_embedding_backward,
                                                  flair_embedding_forward,
                                                  ],
                                                 fine_tune_mode='nonlinear')
    return document_embeddings
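# A minimal usage sketch for load_model() above (assumes the same flair imports
# are in scope; the example sentence is illustrative):
from flair.data import Sentence

document_embeddings = load_model()
sentence = Sentence('The quick brown fox jumps over the lazy dog .')
document_embeddings.embed(sentence)
vector = sentence.get_embedding()  # one pooled vector for the whole sentence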
def load_doc_embeddings():
    """Load word embeddings model."""
    fasttext_embedding = WordEmbeddings('en-crawl')
    byte_embedding = BytePairEmbeddings('en')
    flair_embedding_forward = FlairEmbeddings('en-forward')
    flair_embedding_backward = FlairEmbeddings('en-backward')
    document_embeddings = DocumentPoolEmbeddings([fasttext_embedding,
                                                  byte_embedding,
                                                  flair_embedding_backward,
                                                  flair_embedding_forward
                                                  # note: a trailing comma here would also be valid Python
                                                  ],
                                                 fine_tune_mode='nonlinear')
    return document_embeddings
def get_embeddings(embeddings: List[str], character: bool, lang: str,
                   bpe_size: int) -> StackedEmbeddings:
    """Construct and return a stacked embedding model."""
    stack = []
    for e in embeddings:
        if e != '':
            if 'forward' in e or 'backward' in e:
                stack.append(FlairEmbeddings(e))
            else:
                stack.append(WordEmbeddings(e))
    if character:
        stack.append(CharacterEmbeddings())
    if bpe_size > 0:
        stack.append(BytePairEmbeddings(language=lang, dim=bpe_size))
    return StackedEmbeddings(embeddings=stack)
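# A minimal, illustrative call of get_embeddings above (model names and sizes are
# assumptions, not from the original): GloVe plus forward/backward Flair LMs,
# with character and 50-dimensional English byte-pair embeddings appended.
stacked = get_embeddings(['glove', 'news-forward', 'news-backward'],
                         character=True, lang='en', bpe_size=50)
print(stacked.embedding_length)  # total per-token dimensionality of the stack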
def __init__(self, config):
    """Load pretrained language model."""
    super(LanguageModel, self).__init__()
    embeddings_stack = []
    transformers = config.get("language_model", "transformers")
    if transformers != "":
        transformers = transformers.split(";")
        for model in transformers:
            embeddings_stack.append(
                TransformerWordEmbeddings(
                    model,
                    layers="-1",
                    pooling_operation='mean',
                    # use_scalar_mix=True,
                    fine_tune=True))
    word_embeddings = config.get("language_model", "word_embeddings")
    if word_embeddings != "":
        word_embeddings = word_embeddings.split(";")
        for model in word_embeddings:
            embeddings_stack.append(WordEmbeddings(model))
    flair_embeddings = config.get("language_model", "flair_embeddings")
    if flair_embeddings != "":
        flair_embeddings = flair_embeddings.split(";")
        for model in flair_embeddings:
            embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))
    character_embeddings = config.get("language_model", "character_embeddigs")
    if character_embeddings.lower() == "yes":
        embeddings_stack.append(CharacterEmbeddings())
    bytepair_embeddings = config.get("language_model", "bytepair_embeddings")
    if bytepair_embeddings.lower() == "yes":
        # note: BytePairEmbeddings normally expects a language argument
        embeddings_stack.append(BytePairEmbeddings())
    custom_embeddings = config.get("language_model", "custom_embeddings")
    if custom_embeddings != "":
        custom_embeddings = custom_embeddings.split(";")
        for path in custom_embeddings:
            embeddings_stack.append(WordEmbeddings(path))
    self.lm = StackedEmbeddings(embeddings_stack)
    self.embedding_dim = self.lm.embedding_length
    self.dropout = torch.nn.Dropout(
        float(config.get("language_model", "dropout")))
    self.classify = torch.nn.Linear(self.embedding_dim, 2)
    if config.get("language_model", "relu") == "yes":
        self.relu = torch.nn.ReLU()
def get_embeddings(pooling_op='min'):
    return StackedEmbeddings(embeddings=[
        # pre-trained embeddings
        PooledFlairEmbeddings('es-forward', pooling=pooling_op),
        PooledFlairEmbeddings('es-backward', pooling=pooling_op),
        BytePairEmbeddings(language='es', dim=300),
        # self-trained embeddings
        SpanishHealthCorpusEmbeddings('wang2vec'),
        # SpanishHealthCorpusEmbeddings('fastText'),
    ])
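# A minimal usage sketch for the Spanish stack above (the example text is
# illustrative and the custom SpanishHealthCorpusEmbeddings class is assumed to
# be importable): stacked token embeddings concatenate, so each token gets one
# vector whose length is the sum of the component embedding lengths.
from flair.data import Sentence

stack = get_embeddings(pooling_op='min')
sentence = Sentence('El paciente presenta fiebre alta .')
stack.embed(sentence)
for token in sentence:
    print(token.text, token.embedding.shape)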
def __init__(self, device="cpu"): super(RankNetWithEmbeddings, self).__init__() self._device = device fasttext_embedding = WordEmbeddings('en-news') # flair_embedding_forward = FlairEmbeddings('news-forward') # flair_embedding_backward = FlairEmbeddings('news-backward') byte_pair_embedding = BytePairEmbeddings('en') glove_embeddings = WordEmbeddings('glove') character_embedding = CharacterEmbeddings() self._mention_embedding = DocumentPoolEmbeddings([fasttext_embedding]) self._label_embedding = DocumentPoolEmbeddings([ fasttext_embedding, ]) self._context_embedding = DocumentPoolEmbeddings([fasttext_embedding]) self._description_embedding = DocumentPoolEmbeddings([ fasttext_embedding, ]) input_length = self._mention_embedding.embedding_length \ + self._context_embedding.embedding_length \ + self._label_embedding.embedding_length \ + self._description_embedding.embedding_length self.model = nn.Sequential( nn.Linear(input_length, 256), nn.ReLU(), # nn.Dropout(0.2), nn.Linear(256, 64), nn.ReLU(), # nn.Dropout(0.2), nn.Linear(64, 1), nn.Tanh(), ) self.output_sig = nn.Sigmoid() self.to(device)
def __init__(self, make_unit_length=True):
    super().__init__(n_dims=200, make_unit_length=make_unit_length)
    embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
    self.embeddings = DocumentPoolEmbeddings(embeddings)
    self.log = getLogger(type(self).__name__)
def __init__(self, lang, embeddings_dim, embedding_weights, hidden_dim, hidden_layers,
             dropout, output_layers=["embed_wsd"], lemma2synsets=None, synsets2id={},
             pos_tags={}, entity_tags={}, use_flair=False, combine_WN_FN=False):
    super(WSDModel, self).__init__()
    self.use_flair = use_flair
    self.combine_WN_FN = combine_WN_FN
    self.output_layers = output_layers
    self.hidden_layers = hidden_layers
    self.hidden_dim = hidden_dim
    self.num_wsd_classes = 0
    self.synsets2id = synsets2id
    output_emb_dim = embeddings_dim
    if use_flair is True:
        if lang == "Bulgarian":
            # BG embeddings
            self.word_embeddings = StackedEmbeddings([
                WordEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV.gensim'),
                # WordEmbeddings('bg'),
                # FastTextEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV'),
                # byte pair embeddings for Bulgarian
                BytePairEmbeddings('bg'),
                FlairEmbeddings('bg-forward-fast'),
                FlairEmbeddings('bg-backward-fast'),
                CharacterEmbeddings()
            ])
        elif lang == "English":
            # EN embeddings
            self.word_embeddings = StackedEmbeddings([
                WordEmbeddings('/home/lenovo/dev/word-embeddings/glove.6B/glove.6B.300d_MOD.gensim'),
                WordEmbeddings('/home/lenovo/dev/word-embeddings/lemma_sense_embeddings/'
                               'WN30WN30glConOne-C15I7S7N5_200M_syn_and_lemma_WikipediaLemmatized_FILTERED.gensim'),
                # byte pair embeddings for English
                BytePairEmbeddings('en'),
                FlairEmbeddings('en-forward-fast'),
                FlairEmbeddings('en-backward-fast'),
                CharacterEmbeddings()
            ])
        else:
            print("Unknown language!")
            exit(1)
        embeddings_dim = self.word_embeddings.embedding_length
    else:
        self.word_embeddings = nn.Embedding.from_pretrained(embedding_weights, freeze=True)
    self.lstm = nn.LSTM(embeddings_dim, hidden_dim, hidden_layers,
                        bidirectional=True, batch_first=True, dropout=dropout)
    if "embed_wsd" in self.output_layers:
        # we want output with the size of the lemma & synset embeddings
        self.emb_relu = nn.ReLU()
        self.output_emb = nn.Linear(2 * hidden_dim, output_emb_dim)
    if "embed_frameID" in self.output_layers:
        self.emb_relu_frames = nn.ReLU()
        self.output_emb_frames = nn.Linear(2 * hidden_dim, output_emb_dim)
    if "classify_wsd" in self.output_layers:
        if len(self.synsets2id) > 0:
            self.output_classify = nn.Linear(2 * hidden_dim, len(self.synsets2id))
            self.num_wsd_classes = len(self.synsets2id)
        else:
            lemma2layers = collections.OrderedDict()
            for lemma, synsets in lemma2synsets.items():
                lemma2layers[lemma] = nn.Linear(2 * hidden_dim, len(synsets))
                if len(synsets) > self.num_wsd_classes:
                    self.num_wsd_classes = len(synsets)
            self.classifiers = nn.Sequential(lemma2layers)
    if "pos_tagger" in self.output_layers:
        self.pos_tags = nn.Linear(2 * hidden_dim, len(pos_tags))
    if "ner" in self.output_layers:
        self.ner = nn.Linear(2 * hidden_dim, len(entity_tags))
    self.dropout = nn.Dropout(dropout)
def train_all(self):
    config_file = open(self.config, "r")
    if self.config.split('.')[-1] == "yml":
        datastore = yaml.load(config_file, Loader=yaml.SafeLoader)
    elif self.config.split('.')[-1] == "json":
        datastore = json.loads(config_file.read())
    else:
        print("Need a json or yaml file as config")
        sys.exit(1)
    columns = {
        int(datastore["dataset_reader"]["position_text"]): "text",
        int(datastore["dataset_reader"]["position_ner"]): "ner",
    }
    # focus_on = datastore["dataset_reader"]["focus_on"]
    if bool(datastore["dataset_reader"]["only_train"]):
        log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))
        all_corpus = ColumnCorpusTrain(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
        )
        tag_type = "ner"
        tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)
    else:
        iobes_corpus = ColumnCorpus(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
            dev_file=datastore["dataset_reader"]["dev_name"],
            test_file=datastore["dataset_reader"]["test_name"],
        )
        tag_type = "ner"
        tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)
        try:
            train_ratio = float(datastore["dataset_reader"]["train_ratio"])
            iobes_corpus = Corpus(iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                                  iobes_corpus.dev, iobes_corpus.test)
            log.info("Using only {}% of the train dataset".format(train_ratio * 100))
        except (KeyError, ValueError):
            pass

    embed_list = []
    word_char = []
    char_word = []
    for embed in datastore["embeddings"]["embeddings_list"]:
        if embed == "bpe":
            embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "fasttext":
            embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
            embed_list.append(FlairEmbeddings("news-forward"))
            embed_list.append(FlairEmbeddings("news-backward"))
        elif embed == "bert-base-uncased":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings("bert-base-uncased"))
        elif embed == "bert-base-cased":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings("bert-base-cased"))
        elif embed == "bert-large-uncased":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings("bert-large-uncased"))
        elif embed == "bert-large-cased":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings("bert-large-cased"))
        elif embed == "elmo-small":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(ELMoEmbeddings("small"))
        elif embed == "elmo-medium":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(ELMoEmbeddings("medium"))
        elif embed == "elmo-original":
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(ELMoEmbeddings("original"))
        elif embed == "bert-base-chinese":
            if datastore["embeddings"]["lang"] == "zh":
                embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
        else:
            split_name = embed.split(".")
            ext = split_name[-1]
            kind = split_name[-2]
            if ext == "pt":  # Flair-type language model
                extra_index = 0
                try:
                    extra_index = int(datastore["embeddings"]["extra_index"])
                except (KeyError, ValueError):
                    pass
                if kind == "char":
                    embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                elif kind == "char-seg":
                    embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))
            if ext == "vec":  # word-vector file
                if kind == "char-seg":
                    embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                elif kind == "char":
                    embed_list.append(emb.WordEmbeddingsVecFirst(embed))
elif kind == "word": embed_list.append(emb.WordEmbeddingsVecWord(embed)) elif kind == "bichar": embed_list.append(emb.WordEmbeddingsVecBichar(embed)) if ext == "bin": if kind == "word": embed_list.append(emb.WordEmbeddingsBinWord(embed)) elif kind == "bichar": embed_list.append(emb.WordEmbeddingsBinBichar(embed)) try: if bool(datastore["embeddings"]["ner_embed"]) == True: print("Generate NER embeddings..") embed_list.append( emb.nerEmbedding( generateNerEmbFromTrain( iobes_corpus.train, tag_dictionary.get_items() ) ) ) except: pass try: if bool(datastore["embeddings"]["one_hot"]) == True: print("Generate one hot embeddings..") embed_list.append(emb.OneHotEmbeddings(iobes_corpus)) except: pass try: if datastore["embeddings"]["embeddings_ngram_list"] != None: embed_list.append( emb.WordEmbeddingsVecNGramList( datastore["embeddings"]["embeddings_ngram_list"] ) ) except: pass if len(word_char) == 1 and len(char_word) == 1: embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0])) embedding_types: List[TokenEmbeddings] = embed_list embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew( embeddings=embedding_types ) if bool(datastore["dataset_reader"]["only_train"]): score = [] for i in range(len(all_corpus)): tagger: SequenceTagger = SequenceTagger( hidden_size=int(datastore["model"]["hidden_size"]), embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=bool(datastore["model"]["use_crf"]), dropout=float(datastore["model"]["dropout"]), word_dropout=float(datastore["model"]["word_dropout"]), locked_dropout=float(datastore["model"]["locked_dropout"]), rnn_layers=int(datastore["model"]["rnn_layers"]), ) folder = datastore["train_config"]["folder"] + "/" + str(i) best = Path(folder + "/checkpoint.pt") iobes_corpus = all_corpus[i] if not best.exists(): best = Path(folder + "/best-model.pt") if best.exists(): trainer = ModelTrainer.load_checkpoint( tagger.load_checkpoint(best), iobes_corpus ) else: trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus) # 7. 
            result = trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )
            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                curve = folder + "/loss.tsv"
                plotter.plot_training_curves(curve)
            if bool(datastore["train_config"]["save_plot_weights"]):
                weight = folder + "/weights.txt"
                plotter.plot_weights(weight)
            score.append(result["test_score"])
        print(score, "\nAverage:", round(sum(score) / len(score), 2))
    else:
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=int(datastore["model"]["hidden_size"]),
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=bool(datastore["model"]["use_crf"]),
            dropout=float(datastore["model"]["dropout"]),
            word_dropout=float(datastore["model"]["word_dropout"]),
            locked_dropout=float(datastore["model"]["locked_dropout"]),
            rnn_layers=int(datastore["model"]["rnn_layers"]),
        )
        folder = datastore["train_config"]["folder"]
        best = Path(folder + "/checkpoint.pt")
        if not best.exists():
            best = Path(folder + "/best-model.pt")
        if best.exists():
            trainer = ModelTrainer.load_checkpoint(tagger.load_checkpoint(best), iobes_corpus)
        else:
            trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)
        # 7. start training
        trainer.train(
            folder,
            learning_rate=float(datastore["train_config"]["learning_rate"]),
            anneal_factor=float(datastore["train_config"]["anneal_factor"]),
            min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
            mini_batch_size=int(datastore["train_config"]["batch_size"]),
            max_epochs=int(datastore["train_config"]["epoch"]),
            save_final_model=bool(datastore["train_config"]["save_final_model"]),
            checkpoint=bool(datastore["train_config"]["checkpoint"]),
            param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
            patience=int(datastore["train_config"]["patience"]),
            monitor_test=bool(datastore["train_config"]["monitor_test"]),
            embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
            shuffle=bool(datastore["train_config"]["shuffle"]),
        )
        plotter = Plotter()
        if bool(datastore["train_config"]["save_plot_training_curve"]):
            curve = folder + "/loss.tsv"
            plotter.plot_training_curves(curve)
        if bool(datastore["train_config"]["save_plot_weights"]):
            weight = folder + "/weights.txt"
            plotter.plot_weights(weight)
                              test_file='flair_test_small.txt',
                              dev_file='flair_dev_small.txt')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings (e.g. bert-base-chinese)
embedding_types = [
    WordEmbeddings('zh'),
    BytePairEmbeddings('multi'),
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),
    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings
    along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )
    # document_embeddings = DocumentPoolEmbeddings([
    #     stacked_embedding,
    #     FlairEmbeddings('it-forward'),
    #     FlairEmbeddings('it-backward')], pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If a checkpoint file is defined, resume training
        # checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training later, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
1: "ner" }, tag_to_bioes="ner", skip_first_line=True) print(corpus) # 2. what tag do we want to predict? tag_type = "ner" # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) print(tag_dictionary.idx2item) # initialize embeddings embedding_types: List[TokenEmbeddings] = [ BytePairEmbeddings(language="multi", dim=300, syllables=1000000) ] embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) # initialize sequence tagger from flair.models import SequenceTagger tagger: SequenceTagger = SequenceTagger( hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True, )
def __init__(self, n_dims=200, make_unit_length=True):
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, BytePairEmbeddings
    super().__init__(n_dims=n_dims, make_unit_length=make_unit_length)
    embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
    self.embeddings = DocumentPoolEmbeddings(embeddings, fine_tune_mode='none')
    self.log = getLogger(type(self).__name__)
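# A minimal standalone sketch of the same pooled embedder (the dimension math is
# my reading of the flair defaults: GloVe is 100-dim and BytePairEmbeddings('en')
# contributes 2 x 50 dims, which would account for n_dims=200):
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, BytePairEmbeddings, DocumentPoolEmbeddings

pool = DocumentPoolEmbeddings([WordEmbeddings('glove'), BytePairEmbeddings('en')],
                              fine_tune_mode='none')
sentence = Sentence('A quick usage example .')
pool.embed(sentence)
print(sentence.get_embedding().shape)  # expected: torch.Size([200])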
def __create_models(self):
    models = []
    models_fit = []
    # for _params in self.model_params:
    _params = {}
    for k, v in self.params.items():
        if k.startswith('_'):
            continue
        _params[k] = v
    self.textModels = dict(
        mtc=TextModel(_params).fit(self.train),
        # charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
        # charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings(self.lang)]),
        # charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(), BytePairEmbeddings('multi')]),
        langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
        charLangMultiEmb=DocumentPoolEmbeddings([
            CharacterEmbeddings(),
            BytePairEmbeddings(self.lang),
            BytePairEmbeddings('multi')
        ]),
        langMultiEmb=DocumentPoolEmbeddings(
            [BytePairEmbeddings(self.lang), BytePairEmbeddings('multi')]),
        bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
        # flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
        # flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
        # bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
    )
    for km, tmodel in self.textModels.items():
        models.append({'name': km})
        models_fit.append({'name': km})
        if km == 'mtc':
            xt = tmodel.transform(self.train)
            xv = tmodel.transform(self.validation)
            X = tmodel.transform(self.data)
        else:
            sentences_train = [Sentence(txt) for txt in self.train]
            tmodel.embed(sentences_train)
            xt = np.array([e.get_embedding().cpu().detach().numpy() for e in sentences_train])
            sentences_val = [Sentence(txt) for txt in self.validation]
            tmodel.embed(sentences_val)
            xv = np.array([e.get_embedding().cpu().detach().numpy() for e in sentences_val])
            sentences = [Sentence(txt) for txt in self.data]
            tmodel.embed(sentences)
            X = np.array([e.get_embedding().cpu().detach().numpy() for e in sentences])
        models[-1]['xv'] = xv
        models[-1]['xt'] = xt
        models_fit[-1]['xt'] = X
        # max_iter = 5000
        # if km == 'mtc': max_iter = 1000
        # if km == 'langMulti': max_iter = 5000
        # self.models[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(xt, self.yt)
        # yp = self.models[-1]['clf'].decision_function(xv)
        # scaler = Normalizer().fit(yp)
        # self.models[-1]['macroF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['weightedF1'] = f1_score(self.yv, np.argmax(scaler.transform(yp), axis=1), average='weighted')
        # self.models[-1]['score'] = f1_score(self.yv, np.argmax(yp, axis=1), average='weighted')
        # self.models[-1]['probas'] = scaler.transform(yp)
        # ### Fit model with all available data
        # self.models_fit[-1]['clf'] = LinearSVC(max_iter=max_iter).fit(X, self.y)
    print('Fitting Ensemble')
    # self.models = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
    # self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
    self.models, self.models_fit = [], []
    for md, mdf in zip(models, models_fit):
        self.models.append(self._train_model(md))
        self.models_fit.append(self._train_model(mdf))
def perform_text_classification(data, info_need, condition, resample=False,
                                remove_stopwords=False, classifier="SVM", verbose=1,
                                data_source="cooking", random_state=42,
                                warning_on_off="off"):
    # =====================================================================
    # ================= data preparation and other things =================
    # =====================================================================
    nltk.download('stopwords')
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3"  # make GPUs 0-3 visible

    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    fast_text_embeddings = StackedEmbeddings(
        [WordEmbeddings('de'), BytePairEmbeddings('de')])
    document_embeddings = DocumentPoolEmbeddings([fast_text_embeddings])

    if remove_stopwords:
        print("Remove stopwords...")
        data['utterance'] = data['utterance'].apply(remove_stop_words)

    label_encoder = LabelEncoder()
    y = data['level_1']

    # Control warnings
    if warning_on_off == "off":
        warnings.filterwarnings("ignore")
    elif warning_on_off == "on":
        warnings.simplefilter('always')

    # =================================================
    # ================= featurization =================
    # =================================================
    if verbose == 1:
        print("=== Starting embedding process...")
    word_embedding_corpus = convert_to_embeddings(data['utterance'], document_embeddings)

    # ========================================================
    # ================= building classifiers ================
    # ========================================================
    scoring = {
        'f1_infoneed': make_scorer(f1_score, pos_label=info_need),
        'f1_other': make_scorer(f1_score, pos_label="Other"),
        'recall_infoneed': make_scorer(recall_score, pos_label=info_need),
        'precision_infoneed': make_scorer(precision_score, pos_label=info_need),
        'recall_other': make_scorer(recall_score, pos_label="Other"),
        'precision_other': make_scorer(precision_score, pos_label="Other"),
        'f1_micro': 'f1_micro',
        'f1_macro': 'f1_macro',
        'f1_weighted': 'f1_weighted',
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'precision_weighted': 'precision_weighted',
        'recall_micro': 'recall_micro',
        'recall_macro': 'recall_macro',
        'recall_weighted': 'recall_weighted'
    }

    if resample:
        print("Resample data...")
        smote_enn = SMOTEENN(random_state=42)
        word_embedding_corpus, y = smote_enn.fit_resample(word_embedding_corpus, y)
        print(sorted(Counter(y).items()))

    # ================================ NAIVE BAYES =====================================
    if classifier == "NB":
        if verbose == 1:
            print("=== Building Classifier:", classifier)
        clf = GaussianNB()
        clf_scores = cross_validate(clf, word_embedding_corpus, y, cv=10, scoring=scoring)
        print(
            f"Classifier: NB -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )
    # =========================== SUPPORT VECTOR MACHINE ================================
    elif classifier == "SVM":
        if verbose == 1:
            print("=== Building Classifier:", classifier)
        clf = SVC(gamma='scale')
        clf_scores = cross_validate(clf, word_embedding_corpus, y, cv=10, scoring=scoring)
        print(
            f"Classifier: SVM -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )
    # ================================ RANDOM FOREST =====================================
    elif classifier == "RF":
        if verbose == 1:
            print("=== Building Classifier:", classifier)
        clf = RandomForestClassifier(class_weight="balanced")
        clf_scores = cross_validate(clf, word_embedding_corpus, y, cv=10, scoring=scoring)
        print(
            f"Classifier: RF -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )

    df_classification_report = pd.DataFrame(columns=[
        'loss', 'task_name', 'info_need', 'model', 'num_epochs', 'condition', 'acc',
        'f1_other', 'f1_infoneed', 'precision_infoneed', 'recall_infoneed',
        'recall_other', 'precision_other', 'recall_macro', 'precision_macro',
        'recall_micro', 'precision_micro', 'recall_weighted', 'precision_weighted',
        'f1_weighted', 'f1_macro', 'f1_micro', 'mcc', 'report', 'preds', 'labels',
        'stopwords_removed', 'resampled'
    ])
    for fold in range(10):
        df_classification_report.loc[fold] = [
            'None', 'text_classification', info_need, classifier, 'None', condition,
            'None',
            clf_scores['test_f1_other'][fold],
            clf_scores['test_f1_infoneed'][fold],
            clf_scores['test_precision_infoneed'][fold],
            clf_scores['test_recall_infoneed'][fold],
            clf_scores['test_recall_other'][fold],
            clf_scores['test_precision_other'][fold],
            clf_scores['test_recall_macro'][fold],
            clf_scores['test_precision_macro'][fold],
            clf_scores['test_recall_micro'][fold],
            clf_scores['test_precision_micro'][fold],
            clf_scores['test_recall_weighted'][fold],
            clf_scores['test_precision_weighted'][fold],
            clf_scores['test_f1_weighted'][fold],
            clf_scores['test_f1_macro'][fold],
            clf_scores['test_f1_micro'][fold],
            "None", "None", "None", "None",
            remove_stopwords, resample
        ]
    return df_classification_report
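# A hypothetical sketch of the convert_to_embeddings helper used above (its real
# implementation is not shown in this snippet): embed each utterance with the
# pooled flair embeddings and return one dense vector per row.
import numpy as np
from flair.data import Sentence

def convert_to_embeddings(utterances, document_embeddings):
    vectors = []
    for text in utterances:
        sentence = Sentence(str(text))
        document_embeddings.embed(sentence)
        vectors.append(sentence.get_embedding().cpu().detach().numpy())
    return np.array(vectors)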
if cmd_args.emb:
    embeddings = list()
    for type_emb in cmd_args.emb.split(":"):
        if type_emb == 'flair':
            embeddings.append(FlairEmbeddings('spanish-forward-fast'))
            embeddings.append(FlairEmbeddings('spanish-backward-fast'))
        elif type_emb == 'bert':
            embeddings.append(BertEmbeddings('bert-base-multilingual-cased'))
        elif type_emb == 'glove':
            embeddings.append(WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec'))
        elif type_emb == 'word2vec':
            embeddings.append(WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec'))
        elif type_emb == 'elmo':
            embeddings.append(ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/'))
        elif type_emb == 'bpe':
            embeddings.append(BytePairEmbeddings(language='es'))
        elif type_emb == 'wiki':
            embeddings.append(WordEmbeddings('../../../../Data/Models/FastText/wiki.es.gensim.vec'))
        elif type_emb == 'chars':
            embeddings.append(WordEmbeddings('../../../../Data/Models/Chars/lemma_lowercased_estenten11_freeling_v4_virt.gensim.vec'))
        else:
            print('ERROR: embedding type not accepted: ' + cmd_args.emb +
                  '. Options: flair, bert, glove, word2vec, elmo, bpe, wiki, chars')
            exit()
    prefix_model_output_dir = '_'.join(cmd_args.emb.split(":"))
    if cmd_args.pooling != 'mean':
        prefix_model_output_dir += '_' + cmd_args.pooling
    document_embeddings = DocumentPoolEmbeddings(embeddings,
                                                 pooling=cmd_args.pooling,
                                                 fine_tune_mode='linear')

if cmd_args.btest:
    bTestPhase = True
import logging
import os
import random
from collections import defaultdict
from pathlib import Path

import torch
from torch.utils.data.sampler import Sampler

import flair
import flair.datasets
from flair.data import FlairDataset
from flair.embeddings import (WordEmbeddings, BytePairEmbeddings, FlairEmbeddings,
                              ELMoEmbeddings, BertEmbeddings)

corpus: flair.data.Corpus = flair.datasets.ClassificationCorpus(
    Path(os.path.join(path2[i])),
    test_file='test_.tsv',
    dev_file='dev.tsv',
    train_file='train.tsv')

# way to select language model
model_selector = {
    "Glove": [WordEmbeddings('glove')],
    "FastText": [WordEmbeddings('en-news')],
    "BPE": [BytePairEmbeddings('en')],
    "FlairFast": [
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ],
    "FlairNews": [FlairEmbeddings('news-forward'),
                  FlairEmbeddings('news-backward')],
    "ElmoOriginal": [ELMoEmbeddings('original')],
    'Bert': [BertEmbeddings('bert-large-uncased')],
    'BertLS': [
        BertEmbeddings(bert_model_or_path='bert-large-uncased',
                       layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                              "15,16,17,18,19,20,21,22,23,24",
                       use_scalar_mix=True)
    ],
    buffer.readinto(content)
    return content.decode("utf-8")


embeddingList = []
for i in range(1, len(sys.argv)):
    arg = sys.argv[i]
    typeAndPath = arg.split(":")
    emb_type = typeAndPath[0]
    path = typeAndPath[1]
    if emb_type == "word":
        embeddingList.append(WordEmbeddings(path))
    elif emb_type == "char":
        embeddingList.append(CharacterEmbeddings(path))
    elif emb_type == "bytepair":
        embeddingList.append(BytePairEmbeddings(path))
    elif emb_type == "flair":
        embeddingList.append(FlairEmbeddings(path))
    elif emb_type == "bert":
        embeddingList.append(BertEmbeddings(path))
    elif emb_type == "elmo":
        embeddingList.append(ELMoEmbeddings(path))

if len(embeddingList) > 1:
    embeddings = StackedEmbeddings(embeddings=embeddingList)
else:
    embeddings = embeddingList[0]

stdbuffer = sys.stdin.buffer
print("Script is ready")
while True:
    line = decodeString(stdbuffer)
def __init__(self):
    # initialize the word embeddings
    self.glove_embedding = WordEmbeddings('pt')
    self.bpe_embedding = BytePairEmbeddings('pt')
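# A minimal, illustrative sketch of combining the two Portuguese embeddings above
# into a document embedder (the pooling setup and example text are assumptions,
# not taken from the original class):
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, BytePairEmbeddings, DocumentPoolEmbeddings

pool = DocumentPoolEmbeddings([WordEmbeddings('pt'), BytePairEmbeddings('pt')])
sentence = Sentence('O tempo está ótimo hoje .')
pool.embed(sentence)
vector = sentence.get_embedding()  # single pooled document vector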
from typing import List

import flair
import torch
from flair.datasets import WIKINER_GERMAN
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, \
    BytePairEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

flair.device = torch.device("cuda:0")

corpus = WIKINER_GERMAN(in_memory=False)

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de-wiki'),
    BytePairEmbeddings('de', 100, 5000),
    CharacterEmbeddings(),
    FlairEmbeddings('de-forward'),
    FlairEmbeddings('de-backward')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True
)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
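# A minimal, illustrative continuation (the output path and hyperparameters are
# assumptions, not taken from the original): start training with flair's
# standard ModelTrainer API.
trainer.train('resources/taggers/wikiner-german',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)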
1: "ner" }, tag_to_bioes="ner", skip_first_line=True) print(corpus) # 2. what tag do we want to predict? tag_type = "ner" # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) print(tag_dictionary.idx2item) # initialize embeddings embedding_types: List[TokenEmbeddings] = [ BytePairEmbeddings(language="de", dim=300, syllables=200000) ] embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) # initialize sequence tagger from flair.models import SequenceTagger tagger: SequenceTagger = SequenceTagger( hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True, )
                              train_file='datatrain.txt',
                              test_file='datatest.txt',
                              dev_file='datadev.txt')
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    # WordEmbeddings('glove'),
    BytePairEmbeddings('en')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer
def train_model():
    # global corpus
    # define columns
    columns = {0: "text", 1: "ner"}
    # columns = {0: "text", 1: "pos", 2: "ner"}
    # columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
    data_folder = training_path
    print("data folder path", data_folder)

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train.txt',
                                  dev_file='dev.txt',
                                  test_file='test.txt')
    max_tokens = 250
    corpus._train = [x for x in corpus.train if len(x) < max_tokens]
    corpus._dev = [x for x in corpus.dev if len(x) < max_tokens]
    corpus._test = [x for x in corpus.test if len(x) < max_tokens]
    print("Finished data standardization.........")

    # # 1. get the corpus
    # corpus: Corpus = WIKINER_ENGLISH().downsample(0.1)
    # print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)
    # print("path is", f'{data_folder}/albert-base-v2')

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        # WordEmbeddings('/home/Balaram_bhukya/PycharmProjects/Flair_NER/nerData/wordembeddings/FT.50D.gensim'),
        # WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        BytePairEmbeddings('en'),
        # TransformerXLEmbeddings(),
        # comment in these lines to use flair embeddings
        FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-forward-fast', pooling='min'),
        FlairEmbeddings('news-backward'),
        # ELMoEmbeddings(),
        # BertEmbeddings(bert_model_or_path=f'{data_folder}/albert-base-v2')
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    return trainer, corpus