def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="mean")
        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()
        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None
    return embeddings
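
# Usage sketch (not from the source): create_embeddings only reads the two keys
# below; the BERT checkpoint name is an assumption — any model accepted by
# BertEmbeddings works here.
example_params = {
    "embedding_type": "bert",
    "bert_model_dirpath_or_name": "bert-base-multilingual-cased",
}
stacked = create_embeddings(example_params)
print(stacked.embedding_length)  # total dimensionality of the stacked embedding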
def train():
    columns = {0: 'text', 1: 'pos'}
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')
    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(args.model,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
def train():
    # column format - word postag label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")
    # read train, dev and test set
    # here the test set is the same as the dev set
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testa")
    print(corpus)
    # create label dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)
    # use GloVe word embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [WordEmbeddings("glove"),
                                              CharacterEmbeddings()]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    # create sequence tagger and trainer instance
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="ner",
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    model_path = os.path.join(path, "../models/")
    # commence training
    # the model will be saved in model_path under the filename final-model.pt
    # this step takes at least 4 hours to complete, so please ensure access to a GPU
    trainer.train(model_path,
                  learning_rate=0.1,
                  mini_batch_size=64,
                  max_epochs=3)
def create_embeddings(self) -> StackedEmbeddings:
    # the list holds several embedding classes, so annotate it with the common
    # base class TokenEmbeddings rather than FlairEmbeddings
    embedding_types: List[TokenEmbeddings] = []
    if self.config['use_word_embeddings']:
        embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))
    if self.config['use_char_embeddings']:
        embedding_types.append(CharacterEmbeddings())
    if self.config['use_flair_embeddings']:
        embedding_types.append(FlairEmbeddings('es-clinical-forward'))
        embedding_types.append(FlairEmbeddings('es-clinical-backward'))
    if self.config['use_beto_embeddings']:
        embedding_types.append(
            TransformerWordEmbeddings(
                'dccuchile/bert-base-spanish-wwm-cased',
                layers=self.config['layers'],
                layer_mean=self.config['layer_mean'],
                subtoken_pooling=self.config['subtoken_pooling']))
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    return embeddings
def train(self, training_dir=None):
    from flair.trainers import ModelTrainer

    if training_dir is None:
        training_dir = flair_splitter_dep_dir
    # define columns
    columns = {0: "text", 1: "ner"}
    # this is the folder in which train, test and dev files reside
    data_folder = flair_splitter_dep_dir + "data"
    # init a corpus using column format, data folder and the names of the train, dev and test files
    # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file="sent_train.txt",
        test_file="sent_test.txt",
        dev_file="sent_dev.txt",
        document_separator_token="-DOCSTART-",
    )
    print(corpus)
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)
    # initialize embeddings
    embedding_types = [
        # WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings("news-forward"),
        # FlairEmbeddings("news-backward"),
        # BertEmbeddings('distilbert-base-cased')
        TransformerWordEmbeddings('google/electra-base-discriminator')
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(training_dir,
                  learning_rate=0.1,
                  mini_batch_size=16,
                  max_epochs=50)
    self.model = tagger
def build_embedding(self, lang, embedding_codes: List[str]) -> None:
    self.tic = time.time()
    self.embedding_name: str = "-".join(embedding_codes)
    self.lang = lang
    embedding_types: List[TokenEmbeddings] = []
    for code in embedding_codes:
        code = code.lower()
        assert code in [
            "bpe",
            "bert",
            "flair",
            "ft",
            "char",
            "ohe",
            "elmo",
        ], f"{code} - Invalid embedding code"
        if code == "ohe":
            embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
        elif code == "ft":
            embedding_types.append(WordEmbeddings(self.lang))
        elif code == "bpe":
            embedding_types.append(BytePairEmbeddings(self.lang))
        elif code == "bert":
            embedding_types.append(
                TransformerWordEmbeddings(
                    model=self.huggingface_ref[self.lang],
                    pooling_operation="first",
                    layers="-1",
                    fine_tune=False,
                )
            )
        elif code == "char":
            embedding_types.append(CharacterEmbeddings())
        elif code == "flair":
            embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
            embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
        elif code == "elmo":
            embedding_types.append(
                ELMoEmbeddings(model="large", embedding_mode="all")
            )
    self.embedding: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types
    )
    self.tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=self.embedding,
        tag_dictionary=self.tag_dictionary,
        tag_type=self.tag_type,
        use_crf=True,
    )
    self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
def get_embeddings(self):
    embeddings = [
        PolyglotEmbeddings(self.args.lang),
        CharacterEmbeddings()
    ]
    if self.args.lang not in self.embeds_unsupported_langs:
        embeddings.append(WordEmbeddings(self.args.lang))
    return StackedEmbeddings(embeddings=embeddings)
def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="first")
        if params["bert_model_dirpath_or_name"] == "dbmdz/bert-base-turkish-cased":
            from transformers import AutoModel, AutoTokenizer
            bert_embedding.tokenizer = AutoTokenizer.from_pretrained(
                params["bert_model_dirpath_or_name"])
            bert_embedding.model = AutoModel.from_pretrained(
                params["bert_model_dirpath_or_name"])
        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()
        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None
    return embeddings
def fit(self, X, y):
    """
    Build feature vectors and train a FLAIR model.

    Parameters
    ----------
    X : list(list(str))
        list of sentences. Sentences are tokenized into lists of words.
    y : list(list(str))
        list of lists of BIO tags.

    Returns
    -------
    self
    """
    log.info("Creating FLAIR corpus...")
    Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
    sents_train = self._convert_to_flair(Xtrain, ytrain)
    sents_val = self._convert_to_flair(Xval, yval)
    corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")
    tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")
    if self.embeddings is None:
        embedding_types = [
            WordEmbeddings("glove"),
            CharacterEmbeddings()
        ]
        self.embeddings = StackedEmbeddings(embeddings=embedding_types)
    log.info("Building FLAIR NER...")
    self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
                                 embeddings=self.embeddings,
                                 tag_dictionary=tag_dict,
                                 tag_type="ner",
                                 use_crf=self.use_crf,
                                 use_rnn=self.use_rnn,
                                 rnn_layers=self.num_rnn_layers,
                                 dropout=self.dropout,
                                 word_dropout=self.word_dropout,
                                 locked_dropout=self.locked_dropout)
    log.info("Training FLAIR NER...")
    opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
    trainer = ModelTrainer(self.model_, corpus_train, opt)
    trainer.train(base_path=self.basedir,
                  learning_rate=self.learning_rate,
                  mini_batch_size=self.batch_size,
                  max_epochs=self.max_iter)
    return self
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()
    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [tr_embedding, char_embedding])
    tweetTensors = []
    for tweet in tweetList:
        # print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
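
# Usage sketch (not from the source): embed_tweet returns one mean-pooled tensor
# per tweet. norm_tweet is assumed to be defined elsewhere in the original
# module, and the tweets below are made-up examples.
tweets = ["bugün hava çok güzel", "maçı kazandık"]
tweet_vectors = embed_tweet(tweets)
print(len(tweet_vectors), tweet_vectors[0].shape)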
def get_embeddings(embeddings: List[str], character: bool, lang: str,
                   bpe_size: int) -> StackedEmbeddings:
    """Construct and return a stacked embedding model."""
    stack = []
    for e in embeddings:
        if e != '':
            if 'forward' in e or 'backward' in e:
                stack.append(FlairEmbeddings(e))
            else:
                stack.append(WordEmbeddings(e))
    if character:
        stack.append(CharacterEmbeddings())
    if bpe_size > 0:
        stack.append(BytePairEmbeddings(language=lang, dim=bpe_size))
    return StackedEmbeddings(embeddings=stack)
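
# Usage sketch (not from the source): stack forward/backward Flair embeddings
# with character and 100-dim byte-pair embeddings; the argument values are
# illustrative only.
emb = get_embeddings(['news-forward', 'news-backward'],
                     character=True, lang='en', bpe_size=100)
print(emb.embedding_length)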
def __init__(self, config):
    """ Load pretrained language model """
    super(LanguageModel, self).__init__()
    embeddings_stack = []

    # note: the original compared strings with "is"/"is not", which checks
    # object identity, not equality; "=="/"!=" is the correct test
    transformers = config.get("language_model", "transformers")
    if transformers != "":
        transformers = transformers.split(";")
        for model in transformers:
            embeddings_stack.append(
                TransformerWordEmbeddings(
                    model,
                    layers="-1",
                    pooling_operation='mean',
                    # use_scalar_mix=True,
                    fine_tune=True))

    word_embeddings = config.get("language_model", "word_embeddings")
    if word_embeddings != "":
        word_embeddings = word_embeddings.split(";")
        for model in word_embeddings:
            embeddings_stack.append(WordEmbeddings(model))

    flair_embeddings = config.get("language_model", "flair_embeddings")
    if flair_embeddings != "":
        flair_embeddings = flair_embeddings.split(";")
        for model in flair_embeddings:
            embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))

    character_embeddings = config.get("language_model", "character_embeddings")
    if character_embeddings.lower() == "yes":
        embeddings_stack.append(CharacterEmbeddings())

    bytepair_embeddings = config.get("language_model", "bytepair_embeddings")
    if bytepair_embeddings.lower() == "yes":
        embeddings_stack.append(BytePairEmbeddings())

    custom_embeddings = config.get("language_model", "custom_embeddings")
    if custom_embeddings != "":
        custom_embeddings = custom_embeddings.split(";")
        for path in custom_embeddings:
            embeddings_stack.append(WordEmbeddings(path))

    self.lm = StackedEmbeddings(embeddings_stack)
    self.embedding_dim = self.lm.embedding_length
    self.dropout = torch.nn.Dropout(
        float(config.get("language_model", "dropout")))
    self.classify = torch.nn.Linear(self.embedding_dim, 2)
    if config.get("language_model", "relu") == "yes":
        self.relu = torch.nn.ReLU()
def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 50,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 50,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2):
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')
    logging.info(f'Training config: {locals().items()}')

    if dataset_format not in ['macss', 'semeval']:
        raise ValueError(f"Dataset format '{dataset_format}' not supported.")

    corpus: TaggedCorpus = dataset_loader[dataset_format](data_dir, dev_size, seed)
    label_dictionary = corpus.make_label_dictionary()

    logging.info(f'Corpus: {corpus}')
    corpus.print_statistics()
    logging.info(f'Size of label dictionary: {len(label_dictionary)}')
    logging.info(f'Labels: {label_dictionary.get_items()}')

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(word_embeddings),
        RelativeOffsetEmbeddings('offset_e1', max_len=200,
                                 embedding_dim=offset_embedding_dim),
        RelativeOffsetEmbeddings('offset_e2', max_len=200,
                                 embedding_dim=offset_embedding_dim),
    ]
    if use_char_embeddings:
        # the original used "embedding_types += CharacterEmbeddings()", which
        # fails because an embedding module is not iterable
        embedding_types.append(CharacterEmbeddings())

    document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
        embedding_types, num_filters=num_filters, dropout=dropout)
    classifier: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dictionary,
        multi_label=False)
    trainer: TextClassifierTrainer = TextClassifierTrainer(
        classifier, corpus, label_dictionary)
    trainer.train(model_dir,
                  learning_rate=learning_rate,
                  mini_batch_size=batch_size,
                  max_epochs=max_epochs)
def main():
    params, config = parse_arguments()
    print(config)
    print(params)
    print("Constructing data loaders...")
    myvlbert = ResNetVLBERT(config)
    pre_model = BertRel(params, myvlbert)
    dl = DataLoader(params)
    dlbb = DLbb(params)
    evaluator = Evaluator(params, dl)
    print("Constructing data loaders...[OK]")
    if params.mode == 0:
        print("Training...")
        t = Trainer(params, config, dl, dlbb, evaluator, pre_model)
        t.train()
        print("Training...[OK]")
    elif params.mode == 1:
        print("Loading rpbert...")
        embedding_types = [
            WordEmbeddings(
                '/media/iot538/a73dbfc5-a8a0-4021-a841-3b7d7f3fd964/mnt/xj/wnut17_advanced/pretrain/en-fasttext-crawl-300d-1M'
            ),
            CharacterEmbeddings(
                '/home/iot538/.flair/datasets/common_characters_large'),
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        model = MNER(params, embeddings, pre_model)
        model_file_path = params.model_file_name
        model.load_state_dict(torch.load(model_file_path))
        if torch.cuda.is_available():
            model = model.cuda()
        print("Loading rpbert...[OK]")
        print("Evaluating rpbert on test set...")
        with torch.no_grad():
            acc, f1, prec, rec = evaluator.get_accuracy(model, 'test')
        print("Accuracy : {}".format(acc))
        print("F1 : {}".format(f1))
        print("Precision : {}".format(prec))
        print("Recall : {}".format(rec))
        print("Evaluating rpbert on test set...[OK]")
def EmbeddingFactory(parameters, corpus):
    from flair.embeddings import FlairEmbeddings, StackedEmbeddings, \
        WordEmbeddings, OneHotEmbeddings, CharacterEmbeddings, TransformerWordEmbeddings

    stack = []
    for emb in parameters.embedding.split():
        if any((spec in emb) for spec in ("bert", "gpt", "xlnet")):
            stack.append(
                TransformerWordEmbeddings(
                    model=pretrainedstr(emb, parameters.language),
                    fine_tune=parameters.tune_embedding))
        elif emb == "flair":
            stack += [
                FlairEmbeddings(f"{parameters.language}-forward",
                                fine_tune=parameters.tune_embedding),
                FlairEmbeddings(f"{parameters.language}-backward",
                                fine_tune=parameters.tune_embedding)
            ]
        elif emb == "pos":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="pos",
                                 embedding_length=parameters.pos_embedding_dim,
                                 min_freq=1))
        elif emb == "fasttext":
            stack.append(WordEmbeddings(parameters.language))
        elif emb == "word":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="text",
                                 embedding_length=parameters.word_embedding_dim,
                                 min_freq=parameters.word_minfreq))
        elif emb == "char":
            stack.append(
                CharacterEmbeddings(
                    char_embedding_dim=parameters.char_embedding_dim,
                    hidden_size_char=parameters.char_bilstm_dim))
        else:
            raise NotImplementedError()
    return StackedEmbeddings(stack)
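
# Usage sketch (not from the source): a hypothetical parameters namespace with
# just the attributes EmbeddingFactory reads for a "fasttext char" stack.
from argparse import Namespace

example_parameters = Namespace(embedding="fasttext char",
                               language="en",
                               tune_embedding=False,
                               char_embedding_dim=25,
                               char_bilstm_dim=25)
emb = EmbeddingFactory(example_parameters, corpus=None)  # corpus is only used for "pos"/"word"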
def get_flair_vectors(vocab):
    print("Looking for flair vectors")
    # import flair embeddings!
    # we can change here to use different embeddings
    glove_embedding = WordEmbeddings('glove')
    twitter_embedding = WordEmbeddings('en-twitter')
    character_embeddings = CharacterEmbeddings()
    stacked_embeddings = StackedEmbeddings(
        embeddings=[glove_embedding, character_embeddings, twitter_embedding])
    flair_vectors = {}
    found = 0
    for word in vocab:
        wt = Sentence(word)
        stacked_embeddings.embed(wt)
        vector = wt[0].embedding.detach().numpy()
        # if the word is not in the embedding dict, the vector will be all zero
        if np.sum(np.abs(vector)) > 0:
            flair_vectors[word] = vector
            found += 1
    print('\n')
    print('Found %d words in the stacked embeddings' % found)
    return flair_vectors
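
# Usage sketch (not from the source): a toy vocabulary; the embeddings are
# downloaded on first use. Note that because CharacterEmbeddings are randomly
# initialized, the stacked vector is almost never all-zero, so the check above
# will rarely filter anything out.
vectors = get_flair_vectors(["cat", "dog", "xyzzy"])
print(len(vectors))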
def __init__(self, device="cpu"):
    super(RankNetWithEmbeddings, self).__init__()
    self._device = device

    fasttext_embedding = WordEmbeddings('en-news')
    # flair_embedding_forward = FlairEmbeddings('news-forward')
    # flair_embedding_backward = FlairEmbeddings('news-backward')
    byte_pair_embedding = BytePairEmbeddings('en')
    glove_embeddings = WordEmbeddings('glove')
    character_embedding = CharacterEmbeddings()

    self._mention_embedding = DocumentPoolEmbeddings([fasttext_embedding])
    self._label_embedding = DocumentPoolEmbeddings([
        fasttext_embedding,
    ])
    self._context_embedding = DocumentPoolEmbeddings([fasttext_embedding])
    self._description_embedding = DocumentPoolEmbeddings([
        fasttext_embedding,
    ])

    input_length = self._mention_embedding.embedding_length \
        + self._context_embedding.embedding_length \
        + self._label_embedding.embedding_length \
        + self._description_embedding.embedding_length

    self.model = nn.Sequential(
        nn.Linear(input_length, 256),
        nn.ReLU(),
        # nn.Dropout(0.2),
        nn.Linear(256, 64),
        nn.ReLU(),
        # nn.Dropout(0.2),
        nn.Linear(64, 1),
        nn.Tanh(),
    )
    self.output_sig = nn.Sigmoid()
    self.to(device)
def hyper_opt(corpus):
    print("hyper_opt is started")
    # define your search space
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
        StackedEmbeddings([
            WordEmbeddings('en'),
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            ELMoEmbeddings()
        ])
    ])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    # search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    # search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])
    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        # '/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)
    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import DocumentPoolEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.data import Sentence

# The Flair library supports combinations of word embeddings generated by various base models.
# Constructing all the models once instead of loading a new one at every function call
# saves a lot of RAM and is faster.
# We have access to all the transformers present in the Hugging Face library.
bert_embedding = TransformerWordEmbeddings('bert-base-cased')
roberta_embedding = TransformerWordEmbeddings('roberta-base')
glove_embedding = WordEmbeddings('glove')
character_embeddings = CharacterEmbeddings()
flair_forward = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')


def vectorize(string: str = None, selected_base_models: list = None):
    # 'vectorizes' the input string using one or a combination of word embeddings - if
    # 'vector representation' is being selected at Algorithms construction time.
    """
    :param string: input string
    :param selected_base_models: list of the models we want to use in order to create word embeddings
    :return: embedding
    """
    if not selected_base_models:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> Provide at least one base model: ['bert',"
                         f"'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")
    # minimal completion sketch (assumption): stack the selected pre-built
    # embeddings and embed the input string
    base_models = {
        'bert': bert_embedding,
        'roberta': roberta_embedding,
        'glove': glove_embedding,
        'character': character_embeddings,
        'flair_forward': flair_forward,
        'flair_backward': flair_backward,
    }
    stacked = StackedEmbeddings([base_models[name] for name in selected_base_models])
    sentence = Sentence(string)
    stacked.embed(sentence)
    return sentence
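
# Usage sketch (not from the source, and relying on the completion above):
# embed a sentence with GloVe plus character embeddings.
embedded = vectorize("Flair stacks embeddings cleanly.", ["glove", "character"])
print(embedded[0].embedding.shape)  # per-token stacked vector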
def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss_tdt',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 100,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 1,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2,
          concept_embedding_dim: int = 100):
    all_data = open('all_data.txt', encoding='utf8').read().split("\n")
    test_dev_percent = math.floor((len(all_data) * 25) / 100)
    k_folds = math.floor(len(all_data) / test_dev_percent)
    random.shuffle(all_data)
    config_name = '1_Some_Setting_Name'

    for i in range(k_folds):
        data_path = 'resources/' + config_name + '/' + str(i + 1)
        test_dev_set = all_data[(test_dev_percent * (i + 1)) - test_dev_percent:
                                test_dev_percent * (i + 1)]
        train = all_data[0:(test_dev_percent * (i + 1)) - test_dev_percent] + \
            all_data[test_dev_percent * (i + 1):len(all_data)]
        random.shuffle(test_dev_set)
        test_perc = math.floor((len(test_dev_set) * 60) / 100)
        test = test_dev_set[0:test_perc]
        dev = test_dev_set[test_perc:len(test_dev_set)]
        os.makedirs(data_path, exist_ok=True)
        train_txt = open(data_path + '/train.txt', 'w+')
        test_txt = open(data_path + '/test.txt', 'w+')
        dev_txt = open(data_path + '/dev.txt', 'w+')
        os.system('cp -r ./Data/vocabulary/ ' + data_path)
        train_txt.write('\n'.join(train))
        test_txt.write('\n'.join(test))
        dev_txt.write('\n'.join(dev))
        train_txt.close()
        test_txt.close()
        dev_txt.close()

        # print("Train Directory: ", data_dir, dev_size, seed, "\n")
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%d-%b-%y %H:%M:%S')
        if dataset_format not in ['macss_tdt']:
            raise ValueError(
                f"Dataset format '{dataset_format}' not supported.")
        corpus: TaggedCorpus = dataset_loader[dataset_format](data_path,
                                                              'train.txt',
                                                              'dev.txt',
                                                              'test.txt')
        label_dictionary = corpus.make_label_dictionary()  # rel-type

        # Comment out the embeddings that you don't need
        embedding_types: List[TokenEmbeddings] = [
            # mEx Fine-Tuned Word Embeddings
            # WordEmbeddings('../../Resources/mex-ft-wiki-de-finetuned-biomedical.gensim'),
            # Default German FastText Word Embeddings
            # WordEmbeddings('../../Resources/ft-wiki-de.gensim'),
            # Relative Offset Embeddings
            RelativeOffsetEmbeddings('offset_e1', max_len=200,
                                     embedding_dim=offset_embedding_dim),
            RelativeOffsetEmbeddings('offset_e2', max_len=200,
                                     embedding_dim=offset_embedding_dim),
            # Concept Embeddings
            ConceptEmbeddings('concept_1', max_len=200,
                              embedding_dim=concept_embedding_dim),
            ConceptEmbeddings('concept_2', max_len=200,
                              embedding_dim=concept_embedding_dim),
        ]
        if use_char_embeddings:
            # the original used "embedding_types += CharacterEmbeddings()",
            # which fails because an embedding module is not iterable
            embedding_types.append(CharacterEmbeddings())

        document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
            embedding_types, num_filters=num_filters, dropout=dropout)
        classifier: TextClassifier = TextClassifier(
            document_embeddings=document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False)
        trainer: TextClassifierTrainer = TextClassifierTrainer(
            classifier, corpus, label_dictionary)
        trainer.train(data_path,
                      learning_rate=learning_rate,
                      mini_batch_size=batch_size,
                      max_epochs=3,
                      use_tensorboard=False,
                      embeddings_in_memory=False)
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size, skf_split_no):
    """
    Trains the sequence labeling model (by default the model uses one RNN layer).
    The model is trained to predict the part-of-speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of a separator before the token,
    - proposed tags for the given token.
    It is trained with Stacked Embeddings, which combine different embeddings. Words are
    embedded using a concatenation of three vector embeddings:
    - WordEmbeddings - classic word embeddings. These embeddings are static and word-level,
      meaning that each distinct word gets exactly one pre-computed embedding. Here FastText
      embeddings trained over Polish Wikipedia are used.
    - CharacterEmbeddings - add character-level word embeddings during model training. These
      embeddings are randomly initialized when the class is instantiated, so they are not
      meaningful unless they are trained on a specific downstream task. For instance, the
      standard sequence labeling architecture used by Lample et al. (2016) is a combination of
      classic word embeddings with task-trained character features. Normally this would require
      implementing a hierarchical embedding architecture in which character-level embeddings for
      each word are computed using an RNN and then concatenated with word embeddings. In Flair,
      this is simplified by treating CharacterEmbeddings just like any other embedding class. To
      reproduce the Lample architecture, one only needs to combine them with standard
      WordEmbeddings in an embedding stack.
    - OneHotEmbeddings - embeddings that encode each word in a vocabulary as a one-hot vector,
      followed by an embedding layer. These embeddings thus do not encode any prior knowledge as
      most other embeddings do. They also differ in that they require seeing a Corpus during
      instantiation, so that they can build a vocabulary consisting of the most common words seen
      in the corpus, plus an UNK token for all rare words. Two OneHotEmbeddings are used in
      training: to embed information about proposed tags (concatenated with a ';') and the
      appearance of a separator before each token.
    Model training is based on the stratified 10-fold cross-validation split indicated by the
    skf_split_no argument. The model and training logs are saved in the
    resources_ex_3/taggers/example-pos/it-<skf_split_no> directory (where <skf_split_no> is the
    number of the stratified 10-fold cross-validation split used to train the model).

    :param data_folder: folder where files with the column corpus split are stored. Those columns
        are used to initialize the ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number that indicates one of the stratified 10-fold cross-validation
        splits (from range 1 to 10) used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    local_model_path = use_scratch_dir_if_available(
        'resources/polish_FastText_embeddings')
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(local_model_path)
        if os.path.exists(local_model_path) else WordEmbeddings('pl'),
        CharacterEmbeddings(
            use_scratch_dir_if_available('resources/polish_letters_dict')),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1) ** 0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))
# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path='./optimisation/elmo',  # hypothetical path; the original value was truncated
)
def train(review_category,
          params,
          update_model=False,
          learning_rate=0.01,
          embeddings_storage_mode='gpu',
          checkpoint=True,
          batch_growth_annealing=True,
          weight_decay=1e-4,
          shuffle=True,
          train_with_dev=True,
          mini_batch_size=2,
          maxi_batch_size=128,
          anneal_factor=0.5,
          patience=2,
          max_epochs=150):
    review_category = str(review_category)
    print('loading training corpus from %s' % (params.data_folder))
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                                          train_file=review_category + '_train.txt',
                                          test_file=review_category + '_test.txt',
                                          dev_file=review_category + '_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ', label_dict)

    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...' %
              (params.transformer_pretrain_lm))
        # 3. initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(
            params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings = [
            WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            # RoBERTaEmbeddings(),
            # XLNetEmbeddings()
        ]
        # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # continue the trainer at a later point
        checkpoint_path = params.checkpoint_dir + '/%s/checkpoint.pt' % (review_category)
        print('loading checkpoint from %s' % (checkpoint_path))
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)

    ####### training the model
    print("training the review_category: %s model ..." % (review_category))
    try:
        trainer.train(params.checkpoint_dir + '/%s' % (review_category),
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs)
    except RuntimeError:  # typically CUDA out of memory; retry with batch chunking
        print('chunking batch ... by %d' % (params.mini_batch_chunk_size))
        trainer.train(params.checkpoint_dir + '/%s' % (review_category),
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      mini_batch_chunk_size=params.mini_batch_chunk_size)
# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="test.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]),
    StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
search_space.add(Parameter.USE_RNN, hp.choice, options=[True])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path='./optimisation/flair',  # hypothetical path; the original value was truncated
)
def init_embeddings(corpus_name, embedding_type):
    """
    Initializes embeddings for a given corpus.

    Parameters:
        corpus_name (str): name of the corpus used to load the proper embeddings
        embedding_type (str): type of embeddings (e.g. flair, elmo, bert, word+char)

    Returns:
        tuple(StackedEmbeddings, bool): the loaded embeddings and the embeddings_in_memory flag
    """
    from typing import List
    from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import BertEmbeddings, ELMoEmbeddings
    from flair.embeddings import CharacterEmbeddings

    embedding_types: List[TokenEmbeddings] = []

    if corpus_name in ['conll03_en']:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(FlairEmbeddings('news-forward'))
            embedding_types.append(FlairEmbeddings('news-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'bert':
            embedding_types.append(
                BertEmbeddings(bert_model_or_path='bert-base-cased'))
            # embedding_types.append(BertEmbeddings(bert_model_or_path='bert-large-cased'))
            embeddings_in_memory = True
        elif embedding_type == 'elmo':
            embedding_types.append(ELMoEmbeddings())
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    elif corpus_name in ["conll03_de", "germeval"]:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(FlairEmbeddings('german-forward'))
            embedding_types.append(FlairEmbeddings('german-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    else:
        log.error(f"unknown corpus or embeddings '{corpus_name}'!")
        exit(EXIT_FAILURE)

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))
    return embeddings, embeddings_in_memory
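
# Usage sketch (not from the source): the Lample-style word+char stack for
# English CoNLL-03.
embeddings, embeddings_in_memory = init_embeddings('conll03_en', 'word+char')
print(embeddings.embedding_length, embeddings_in_memory)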
# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased')
    ]),
    StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path="Optimisation_evals/stacks/flair_BERT/dev",
)
def train(args, tag_type):
    '''
    Training script to be run for training the NER model.

    Parameters:
    -----------
    args: arguments passed to the parser on the CLI
    '''
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={
                              0: 'text',
                              1: 'ner'
                          },
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)
    # print(corpus.train[0])
    # print(corpus)
    # tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    if args.character_embeddings:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]
    else:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)
    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.train_or_predict == "continue_train":
        print("continue training")
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')
    if args.predict_file:
        # join with os.path.join so a missing trailing slash does not break the path
        with open(os.path.join(data_dir, args.predict_file), 'r') as f:
            str_file = f.read()
        sentence = Sentence(str_file)
        model.predict(sentence)
        print(sentence.to_tagged_string())
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

cachedir = Path(
    '/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/nishant/embeddings'
)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    # WordEmbeddings('glove'),
    # comment in this line to use character embeddings
    CharacterEmbeddings(
        path_to_char_dict=
        "/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/abs/flair-custom/custom_dict.pkl"
    ),
    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # CharLMEmbeddings('news-forward', use_cache=True),
    ELMoEmbeddings('elmo-small'),
    # BertEmbeddings(),
    # FlairEmbeddings('news-backward-fast'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',  # inferred from the NER search space above
    base_path='./optimisation/elmo_flair_bert',  # hypothetical path; the original value was truncated
)
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, \
    BertEmbeddings, CharacterEmbeddings, ELMoEmbeddings

model_name = "glove"
runs = 1
use_glove = True
use_cui2vec = False
use_flair = False
use_elmo = False
use_bert = False
mini_batch_size = 32

word_embeddings = []
if use_glove:
    word_embeddings.append(WordEmbeddings('glove'))
    word_embeddings.append(CharacterEmbeddings())
if use_cui2vec:
    word_embeddings.append(WordEmbeddings('./cui2vec_embed_vectors.bin'))
if use_flair:
    word_embeddings.append(FlairEmbeddings('./forward-lm.pt'))
    word_embeddings.append(FlairEmbeddings('./backward-lm.pt'))
if use_elmo:
    word_embeddings.append(ELMoEmbeddings('pubmed'))
if use_bert:
    word_embeddings.append(BertEmbeddings('./bert-base-clinical-cased'))
    mini_batch_size = 8

stacked_word_embeddings = StackedEmbeddings(word_embeddings)

from flair.embeddings import DocumentRNNEmbeddings
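
# A possible continuation (sketch, not from the source): wrap the token
# embeddings selected above in a document-level RNN embedding, as the import
# suggests; hidden_size and reproject_words are illustrative choices.
document_rnn_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=256,
                                                reproject_words=True)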