def _get_embedding_model(
    model_name_or_path: Union[str, HFModelResult, FlairModelResult]
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if isinstance(model_name_or_path, FlairModelResult):
        # `str.strip` removes a character set, not a prefix, so drop the namespace explicitly
        nm = model_name_or_path.name.split('/')[-1]
        try:
            return WordEmbeddings(nm)
        except Exception:
            return FlairEmbeddings(nm)
    elif isinstance(model_name_or_path, HFModelResult):
        return TransformerWordEmbeddings(model_name_or_path.name)
    else:
        res = _flair_hub.search_model_by_name(model_name_or_path, user_uploaded=True)
        if len(res) < 1:
            # No Flair models found, fall back to the HuggingFace hub
            res = _hf_hub.search_model_by_name(model_name_or_path, user_uploaded=True)
            if len(res) < 1:
                raise ValueError(
                    f'Embeddings not found for the model key: {model_name_or_path}, '
                    'check documentation or custom model path to verify specified model')
            # Returning the first result should always be the non-fast option
            return TransformerWordEmbeddings(res[0].name)
        else:
            nm = res[0].name.split('/')[-1]
            try:
                return WordEmbeddings(nm)
            except Exception:
                return FlairEmbeddings(nm)
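A minimal usage sketch for the helper above, assuming the surrounding module's hub-search globals (`_flair_hub`, `_hf_hub`) are initialized; the model key and sentence are illustrative:

# Hypothetical call; a plain string key falls through to the hub-search branch
emb = _get_embedding_model("bert-base-cased")
sent = Sentence("The helper picks the right embedding class.")
emb.embed(sent)
print(len(sent[0].embedding))  # per-token vector length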
def dump_bert_vecs(df, dump_dir):
    print("Getting BERT vectors...")
    embedding = TransformerWordEmbeddings('bert-base-uncased')
    word_counter = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    stop_words.add("would")
    except_counter = 0

    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished sentences: " + str(index) + " out of " + str(len(df)))
        line = row["sentence"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            # Retry with a progressively truncated sentence until embedding succeeds
            flag = 0
            i = 0
            sentence = None
            while flag == 0:
                sentence = Sentence(sent[:(len(sent) - i * 100)], use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                    flag = 1
                except Exception as e:
                    except_counter += 1
                    print("Length of sentence: ", len(sent) - i * 100)
                    print("Exception Counter while getting BERT: ", except_counter, sentence_ind, index, e)
                    i += 1
            if sentence is None or len(sentence) == 0:
                print("Length of sentence is 0: ", index)
                continue  # nothing embeddable; skip
            for token_ind, token in enumerate(sentence):
                word = token.text
                word = word.translate(str.maketrans('', '', string.punctuation))
                if word in stop_words or "/" in word or len(word) == 0:
                    continue
                word_dump_dir = dump_dir + word
                os.makedirs(word_dump_dir, exist_ok=True)
                fname = word_dump_dir + "/" + str(word_counter[word]) + ".pkl"
                word_counter[word] += 1
                vec = token.embedding.cpu().numpy()
                try:
                    with open(fname, "wb") as handler:
                        pickle.dump(vec, handler)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while dumping BERT: ", except_counter, sentence_ind, index, word, e)
def dump_bert_vecs(df, dump_dir):
    print("Getting BERT vectors...")
    embedding = TransformerWordEmbeddings('roberta-base', layers='-1')
    word_counter = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    stop_words.add("would")
    except_counter = 0
    key = set(word_cnt)  # set membership is O(1); `word_cnt` is a module-level counter

    for index, row in df.iterrows():
        # Record progress so long runs can be resumed after a crash
        with open("progress.txt", "w+") as file1:
            file1.write(str(index))
        print(index)
        if index % 100 == 0:
            print("Finished sentences: " + str(index) + " out of " + str(len(df)))
        line = row["news"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except Exception as e:
                except_counter += 1
                print("Exception Counter while getting BERT: ", except_counter, sentence_ind, index, e)
                continue
            for token_ind, token in enumerate(sentence):
                word = token.text
                word = word.translate(str.maketrans('', '', string.punctuation))
                # Skip stopwords, slashed tokens, empty strings, and rare words (< 10 occurrences)
                if word in stop_words or "/" in word or len(word) == 0 or (
                        word not in key) or word_cnt[word] < 10:
                    continue
                word_dump_dir = dump_dir + word
                os.makedirs(word_dump_dir, exist_ok=True)
                fname = word_dump_dir + "/" + str(word_counter[word]) + ".pkl"
                word_counter[word] += 1
                vec = token.embedding.cpu().numpy()
                try:
                    with open(fname, "wb") as handler:
                        pickle.dump(vec, handler)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while dumping BERT: ", except_counter, sentence_ind, index, word, e)
def __init__(self, args, name, asp_word2idx, selected_idx=None, need_neg_senti=False):
    self.asp_word2idx = asp_word2idx
    self.need_neg_senti = need_neg_senti
    self.args = args
    self.embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1')
    if name == 'train':
        self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.train),
                                        'train', selected_idx,
                                        filter_null=args.unsupervised)
    elif name == 'dev':
        self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.dev),
                                        'dev', selected_idx)
    elif name == 'test':
        self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.test),
                                        'test', selected_idx)
    else:
        raise NotImplementedError
    self.len = len(self.corpus_y)
def get_scibert_flair_embeddings():
    return [
        TransformerWordEmbeddings(model="allenai/scibert_scivocab_uncased", fine_tune=True),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward"),
    ]
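The list above is meant to be stacked before use; a minimal sketch (the sentence is illustrative) wrapping it in Flair's `StackedEmbeddings`:

# Sketch: stack SciBERT with the PubMed Flair LMs and embed one sentence
stacked = StackedEmbeddings(embeddings=get_scibert_flair_embeddings())
sentence = Sentence("EGFR mutations were detected in the tumor samples.")
stacked.embed(sentence)
# each token now carries the concatenated SciBERT + forward/backward Flair vectors
print(len(sentence[0].embedding))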
def create_embeddings(self) -> StackedEmbeddings:
    # Mixed embedding types, so annotate with the common base class
    embedding_types: List[TokenEmbeddings] = []
    if self.config['use_word_embeddings']:
        embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))
    if self.config['use_char_embeddings']:
        embedding_types.append(CharacterEmbeddings())
    if self.config['use_flair_embeddings']:
        embedding_types.append(FlairEmbeddings('es-clinical-forward'))
        embedding_types.append(FlairEmbeddings('es-clinical-backward'))
    if self.config['use_beto_embeddings']:
        embedding_types.append(
            TransformerWordEmbeddings('dccuchile/bert-base-spanish-wwm-cased',
                                      layers=self.config['layers'],
                                      layer_mean=self.config['layer_mean'],
                                      subtoken_pooling=self.config['subtoken_pooling']))
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    return embeddings
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # Load the correct Embeddings module for each model key
    for model_name_or_path in embeddings:
        if ("flair" in model_name_or_path
                or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(WordEmbeddings(model_name_or_path))
            except ValueError:
                try:
                    self.embedding_stack.append(TransformerWordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, "
                        "check documentation or custom model path to verify specified model")

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(embeddings=self.embedding_stack)
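The snippet only shows the constructor; a hedged usage sketch, with the class name `EasyStackedEmbeddings` assumed purely for illustration:

# `EasyStackedEmbeddings` is an assumed name; only __init__ appears above
stacker = EasyStackedEmbeddings("glove", "bert-base-cased")
sentence = Sentence("Stacked embeddings concatenate per-token vectors.")
stacker.stacked_embeddings.embed(sentence)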
def train():
    generate_datasets()
    DATA_FOLDER = '../content/data'
    # MAX_TOKENS = 500
    columns = {0: 'text', 1: 'pos', 2: 'tag'}
    data_folder = DATA_FOLDER
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train-labelled.txt',
                                  test_file='dev-labelled.txt',
                                  in_memory=False)
    # corpus._train = [x for x in corpus.train if len(x) < MAX_TOKENS]
    # corpus._test = [x for x in corpus.test if len(x) < MAX_TOKENS]
    tag_type = 'tag'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    embeddings = TransformerWordEmbeddings('roberta-base', layers='-4', fine_tune=True)
    tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            # dropout=0.3334816033039888,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train('resources/taggers/task-TC',
                  learning_rate=0.2,
                  mini_batch_size=64,
                  max_epochs=100,
                  embeddings_storage_mode='gpu')
def build_embedding(self, lang, embedding_codes: List[str]) -> None:
    self.tic = time.time()
    self.embedding_name: str = "-".join(embedding_codes)
    self.lang = lang
    embedding_types: List[TokenEmbeddings] = []
    for code in embedding_codes:
        code = code.lower()
        assert code in ["bpe", "bert", "flair", "ft", "char", "ohe", "elmo"], \
            f"{code} - Invalid embedding code"
        if code == "ohe":
            embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
        elif code == "ft":
            embedding_types.append(WordEmbeddings(self.lang))
        elif code == "bpe":
            embedding_types.append(BytePairEmbeddings(self.lang))
        elif code == "bert":
            embedding_types.append(
                TransformerWordEmbeddings(
                    model=self.huggingface_ref[self.lang],
                    pooling_operation="first",
                    layers="-1",
                    fine_tune=False,
                ))
        elif code == "char":
            embedding_types.append(CharacterEmbeddings())
        elif code == "flair":
            embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
            embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
        elif code == "elmo":
            embedding_types.append(ELMoEmbeddings(model="large", embedding_mode="all"))

    self.embedding: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    self.tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=self.embedding,
        tag_dictionary=self.tag_dictionary,
        tag_type=self.tag_type,
        use_crf=True,
    )
    self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
def train(self, training_dir=None):
    from flair.trainers import ModelTrainer

    if training_dir is None:
        training_dir = flair_splitter_dep_dir

    # define columns
    columns = {0: "text", 1: "ner"}

    # this is the folder in which train, test and dev files reside
    data_folder = flair_splitter_dep_dir + "data"

    # init a corpus using column format, data folder and the names of the train, dev and test files
    # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file="sent_train.txt",
        test_file="sent_test.txt",
        dev_file="sent_dev.txt",
        document_separator_token="-DOCSTART-",
    )
    print(corpus)
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    # initialize embeddings
    embedding_types = [
        # WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings("news-forward"),
        # FlairEmbeddings("news-backward"),
        # BertEmbeddings('distilbert-base-cased')
        TransformerWordEmbeddings('google/electra-base-discriminator')
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(training_dir, learning_rate=0.1, mini_batch_size=16, max_epochs=50)
    self.model = tagger
class WeVectorizer:
    def __init__(self, op_relations, vectorizer='spacy'):
        if vectorizer == 'spacy':
            self.vectorizer = en_core_web_md.load()
            # spaCy path: build sentence + entity vectors from token vectors
            self.vectors = self._vectorizer_data(op_relations)
        else:
            self.vectorizer = TransformerWordEmbeddings('roberta-base')
            # Flair path: embed the sentence and keep the first token's vector
            self.vectors = self.vectorizer_data(op_relations)

    def _vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            org = org_cand['text']
            per = per_cand['text']
            sent_clean = sent.replace(org, "").replace(per, "")
            vecs.append(self.vec_sent(sent_clean, per, org))
        vecs = np.array(vecs)
        return vecs

    def vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            sent = Sentence(sent)
            self.vectorizer.embed(sent)
            vecs.append(sent[0].embedding.cpu().detach().numpy())
        vecs = np.array(vecs)
        return vecs

    def vec_sent(self, sent, per_candidate, org_candidate):
        toks = [
            t for t in self.vectorizer(sent)
            if not any([t.is_space, t.is_punct, t.is_stop, t.is_currency]) and t.has_vector
        ]
        sent_vecs = np.array([t.vector for t in toks]).mean(axis=0)
        per_vec = self.vectorize_ent(per_candidate)
        org_vec = self.vectorize_ent(org_candidate)
        res = np.concatenate([sent_vecs, per_vec, org_vec])
        return res

    def vectorize_ent(self, org_candidate):
        return np.array([t.vector for t in self.vectorizer(org_candidate)]).mean(axis=0)
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file="train.conllup",
        dev_file="train.conllup",
        test_file="train.conllup",
        column_format={1: "text", 2: "pos", 3: "ner"},
    )
    relation_label_dict = corpus.make_label_dictionary(label_type="relation")
    embeddings = TransformerWordEmbeddings()
    model: RelationExtractor = RelationExtractor(
        embeddings=embeddings,
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )
    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(results_base_path / "final-model.pt")
    loaded_model.train_on_gold_pairs_only = False

    sentence = Sentence(["Apple", "was", "founded", "by", "Steve", "Jobs", "."])
    for token, tag in zip(sentence.tokens, ["B-ORG", "O", "O", "O", "B-PER", "I-PER", "O"]):
        token.set_label("ner", tag)

    loaded_model.predict(sentence)
    assert "founded_by" == sentence.get_labels("relation")[0].value

    # loaded_model.predict([sentence, sentence_empty])
    # loaded_model.predict([sentence_empty])
    del loaded_model
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)]
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
def test_transformer_word_embeddings_forward_language_ids():
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-10)

    sent_en = Sentence(["This", "is", "a", "sentence"], language_code="en")
    sent_de = Sentence(["Das", "ist", "ein", "Satz"], language_code="de")

    embeddings = TransformerWordEmbeddings("xlm-mlm-ende-1024", allow_long_sentences=False)
    embeddings.embed([sent_de, sent_en])

    expected_similarities = [
        0.7102344036102295,
        0.7598986625671387,
        0.7437312602996826,
        0.5584433674812317,
    ]

    for (token_de, token_en, exp_sim) in zip(sent_de, sent_en, expected_similarities):
        sim = cos(token_de.embedding, token_en.embedding).item()
        assert abs(exp_sim - sim) < 1e-5
def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={0: "text", 1: "ner"},
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger without CRF
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased", fine_tune=True),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # train
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans("ner")]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def trainNER(data_dir, model_dir):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default='bert-base-cased', type=str, required=True,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model
    # pdb.set_trace()

    try:
        corpus: Corpus = CONLL_03(base_path=data_dir + '/')
    except FileNotFoundError:
        columns = {0: 'text', 1: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()

    tag_type = 'ner'
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE', 'B-APPLICATION',
    #  'I-APPLICATION', 'B-LOCATION', 'I-LOCATION', '<START>', '<STOP>']
    # pdb.set_trace()

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # biLSTM + CRF
    # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
    #                                         embeddings=embeddings,
    #                                         tag_dictionary=tag_dictionary,
    #                                         tag_type=tag_type)
    model_path = '/home/carolyn/Projects/mygit/Flair-NER/exprmt-20201120/conll_frac/10ptdata/models-5e-20201124/final-model.pt'
    tagger: SequenceTagger = SequenceTagger.load(model_path)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_dir, train_with_dev=False, max_epochs=10)  # 150
def test_transformers_keep_tokenizer_when_saving(results_base_path):
    embeddings = TransformerWordEmbeddings("sentence-transformers/paraphrase-albert-small-v2")
    results_base_path.mkdir(exist_ok=True, parents=True)
    initial_tagger_path = results_base_path / "initial_tokenizer.pk"
    reloaded_tagger_path = results_base_path / "reloaded_tokenizer.pk"

    initial_tagger = SequenceTagger(embeddings, Dictionary(), "ner")

    initial_tagger.save(initial_tagger_path)
    reloaded_tagger = SequenceTagger.load(initial_tagger_path)

    reloaded_tagger.save(reloaded_tagger_path)
def main(directory, embeddings, strategy):
    # 1. find corpora in data directory
    corpora = {"train": None, "dev": None, "test": None}
    for labelset in corpora:
        for file in sorted(os.listdir(directory)):
            if infer_split(file) == labelset:
                corpora[labelset] = pd.read_csv(
                    os.path.join(directory, file),
                    sep="\t",
                    names=["text", "pos", "lemma", "label"],
                    engine="python",
                    error_bad_lines=False,
                    quoting=csv.QUOTE_NONE).fillna("")
                break

    # 2. pick the word-level embedder
    if embeddings == "elmo":
        embedder = ELMoEmbeddings("original")
    elif embeddings == "flair":
        embedder = FlairEmbeddings("news-forward")
    elif embeddings == "bert":
        embedder = TransformerWordEmbeddings('bert-base-cased')

    embeddings_dir = os.path.join(directory, embeddings + "_embeddings")
    if not os.path.exists(embeddings_dir):
        os.makedirs(embeddings_dir, exist_ok=True)

    # 3. pooling strategy for multi-token entries
    strategy = np.mean if strategy == "mean" else np.max if strategy == "max" else np.sum if strategy == "sum" else None

    for labelset, corpus in corpora.items():
        if corpus is None:
            print(f"empty corpus: {labelset}")
            continue
        voc = sorted(corpus["text"].unique())
        print(f"Unique tokens: {len(voc)}")
        with open(os.path.join(embeddings_dir, labelset + ".w2v"), "w") as f:
            for word in voc:
                sentence = Sentence(word)
                if len(sentence) == 0:
                    continue
                embedder.embed(sentence)
                token_embedding = strategy(
                    [token.embedding.cpu().numpy() for token in sentence], axis=0)
                f.write(word + " " +
                        " ".join([str(num) for num in token_embedding.tolist()]) + '\n')
def __init__(self, config):
    """ Load pretrained language model """
    super(LanguageModel, self).__init__()
    embeddings_stack = []

    # `is not ""` compares identity, not equality; use != / == for string checks
    transformers = config.get("language_model", "transformers")
    if transformers != "":
        for model in transformers.split(";"):
            embeddings_stack.append(
                TransformerWordEmbeddings(
                    model,
                    layers="-1",
                    pooling_operation='mean',
                    # use_scalar_mix=True,
                    fine_tune=True))

    word_embeddings = config.get("language_model", "word_embeddings")
    if word_embeddings != "":
        for model in word_embeddings.split(";"):
            embeddings_stack.append(WordEmbeddings(model))

    flair_embeddings = config.get("language_model", "flair_embeddings")
    if flair_embeddings != "":
        for model in flair_embeddings.split(";"):
            embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))

    character_embeddings = config.get("language_model", "character_embeddigs")
    if character_embeddings.lower() == "yes":
        # CharacterEmbeddings takes no model key; the config value is only a flag
        embeddings_stack.append(CharacterEmbeddings())

    bytepair_embeddings = config.get("language_model", "bytepair_embeddings")
    if bytepair_embeddings.lower() == "yes":
        embeddings_stack.append(BytePairEmbeddings())

    custom_embeddings = config.get("language_model", "custom_embeddings")
    if custom_embeddings != "":
        for path in custom_embeddings.split(";"):
            embeddings_stack.append(WordEmbeddings(path))

    self.lm = StackedEmbeddings(embeddings_stack)
    self.embedding_dim = self.lm.embedding_length
    self.dropout = torch.nn.Dropout(float(config.get("language_model", "dropout")))
    self.classify = torch.nn.Linear(self.embedding_dim, 2)
    if config.get("language_model", "relu") == "yes":
        self.relu = torch.nn.ReLU()
def trainNER(data_dir, model_dir):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default='bert-base-cased', type=str, required=True,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model

    columns = {0: 'text', 1: 'ner'}
    # pdb.set_trace()
    # print(data_dir + '/eng.train')
    corpus: Corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()

    tag_type = 'ner'
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE', 'B-APPLICATION',
    #  'I-APPLICATION', 'B-LOCATION', 'I-LOCATION', '<START>', '<STOP>']
    # pdb.set_trace()

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # biLSTM + CRF
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_dir, train_with_dev=True, max_epochs=10)  # 150
def _get_embedding_model(
    model_name_or_path: str
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if ("flair" in model_name_or_path
            or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
        return FlairEmbeddings(model_name_or_path)
    else:
        try:
            return WordEmbeddings(model_name_or_path)
        except ValueError:
            try:
                return TransformerWordEmbeddings(model_name_or_path)
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, "
                    "check documentation or custom model path to verify specified model")
def handle(self, *args, **options):
    file = options.get('file') or 'annotated_sentences'
    model_folder = options.get('model_folder') or 'model-var'
    columns = {0: 'text', 1: 'var'}
    data_folder = 'data/txt'
    corpus = ColumnCorpus(data_folder, columns, train_file=f'{file}.txt')
    tag_type = 'var'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    embedding_types = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        TransformerWordEmbeddings('bert-base-uncased'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(f'data/models/taggers/{model_folder}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
    self.stdout.write(self.style.SUCCESS('Successfully trained model on dataset file.'))
def EmbeddingFactory(parameters, corpus):
    from flair.embeddings import FlairEmbeddings, StackedEmbeddings, \
        WordEmbeddings, OneHotEmbeddings, CharacterEmbeddings, TransformerWordEmbeddings

    stack = []
    for emb in parameters.embedding.split():
        if any((spec in emb) for spec in ("bert", "gpt", "xlnet")):
            stack.append(
                TransformerWordEmbeddings(
                    model=pretrainedstr(emb, parameters.language),
                    fine_tune=parameters.tune_embedding))
        elif emb == "flair":
            stack += [
                FlairEmbeddings(f"{parameters.language}-forward",
                                fine_tune=parameters.tune_embedding),
                FlairEmbeddings(f"{parameters.language}-backward",
                                fine_tune=parameters.tune_embedding)
            ]
        elif emb == "pos":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="pos",
                                 embedding_length=parameters.pos_embedding_dim,
                                 min_freq=1))
        elif emb == "fasttext":
            stack.append(WordEmbeddings(parameters.language))
        elif emb == "word":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="text",
                                 embedding_length=parameters.word_embedding_dim,
                                 min_freq=parameters.word_minfreq))
        elif emb == "char":
            stack.append(
                CharacterEmbeddings(
                    char_embedding_dim=parameters.char_embedding_dim,
                    hidden_size_char=parameters.char_bilstm_dim))
        else:
            raise NotImplementedError()
    return StackedEmbeddings(stack)
def embed_text(
    self,
    text: Union[List[Sentence], Sentence, List[str], str],
    model_name_or_path: str = "bert-base-cased",
) -> List[Sentence]:
    """Produces embeddings for text

    * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
    * **model_name_or_path** - The hosted model name key or model path
    **return** - A list of Flair's `Sentence`s
    """
    # Convert into sentences
    if isinstance(text, str):
        sentences = Sentence(text)
    elif isinstance(text, list) and all(isinstance(t, str) for t in text):
        sentences = [Sentence(t) for t in text]
    else:
        sentences = text

    # Load correct Embeddings module, caching it under its model key
    if not self.models[model_name_or_path]:
        if ("flair" in model_name_or_path
                or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.models[model_name_or_path] = FlairEmbeddings(model_name_or_path)
        else:
            try:
                self.models[model_name_or_path] = WordEmbeddings(model_name_or_path)
            except ValueError:
                try:
                    self.models[model_name_or_path] = TransformerWordEmbeddings(model_name_or_path)
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, "
                        "check documentation or custom model path to verify specified model")

    embedding = self.models[model_name_or_path]
    return embedding.embed(sentences)
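A hedged sketch of calling `embed_text`; the wrapper instance `embedder` is assumed, and `self.models` must behave like a `defaultdict` for the cache lookup above not to raise `KeyError`:

# `embedder` is a hypothetical instance of the class this method belongs to
sentences = embedder.embed_text(["First sentence.", "Second one."],
                                model_name_or_path="bert-base-cased")
for sent in sentences:
    print(sent[0].embedding[:5])  # first five dimensions of the first token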
def test_transformer_weird_sentences():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           use_scalar_mix=True)

    sentence = Sentence("Hybrid mesons , qq ̄ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768
def load_model(bert=None, document=False, flair=False):
    """Load word embeddings model."""
    if bert == 'bio':
        # https://github.com/flairNLP/flair/issues/1085
        # also see readme for instructions
        bertpath = './bert/bert-base-biobert-cased'
    elif bert == 'sci':
        # https://github.com/flairNLP/flair/issues/744
        # https://github.com/flairNLP/flair/issues/1239
        bertpath = './bert/scibert_scivocab_uncased'
    else:
        bertpath = 'bert-base-uncased'

    if document and not flair:
        bert_embedding = TransformerDocumentEmbeddings(model=bertpath, batch_size=4)
        return bert_embedding

    bert_embedding = TransformerWordEmbeddings(model=bertpath,
                                               pooling_operation='first',
                                               batch_size=4)
    if flair:
        flair_embedding_forward = FlairEmbeddings('en-forward')
        flair_embedding_backward = FlairEmbeddings('en-backward')
        embed_arr = [
            bert_embedding,
            flair_embedding_backward,
            flair_embedding_forward,
        ]
    else:
        embed_arr = [bert_embedding]

    if document:
        document_embeddings = DocumentPoolEmbeddings(embed_arr, fine_tune_mode='nonlinear')
    else:
        document_embeddings = StackedEmbeddings(embed_arr)
    return document_embeddings
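Usage follows directly from the flags above; for example, word-level embeddings versus a pooled document embedding (the 'bio'/'sci' paths require the local checkpoints noted in the comments, so the default BERT is used here):

word_embeddings = load_model(bert=None, flair=True)    # StackedEmbeddings over tokens
doc_embeddings = load_model(bert=None, document=True)  # TransformerDocumentEmbeddings

sentence = Sentence("Kinase inhibitors block downstream signaling.")
word_embeddings.embed(sentence)
print(len(sentence[0].embedding))  # per-token vector length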
def train(self, training_dir=None):
    from flair.trainers import ModelTrainer

    if training_dir is None:
        training_dir = script_dir + "flair" + os.sep

    # define columns
    columns = {0: "text", 1: "ner"}

    # this is the folder in which train, test and dev files reside
    data_folder = training_dir + "data"

    # init a corpus using column format, data folder and the names of the train, dev and test files
    # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file="sent_train.txt",
        test_file="sent_test.txt",
        dev_file="sent_dev.txt",
    )
    print(corpus)
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    # initialize embeddings
    embeddings: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base')
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(training_dir, learning_rate=0.1, mini_batch_size=32, max_epochs=50)
    self.model = tagger
def test_transformer_word_embeddings():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='-1,-2,-3,-4',
                                           layer_mean=False)

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 3072
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=False)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 5376
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0
    del embeddings

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 768
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0
    del embeddings
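The expected sizes follow from DistilBERT's geometry: `'-1,-2,-3,-4'` concatenates four 768-dimensional layers (4 × 768 = 3072); `'all'` with `layer_mean=False` concatenates the embedding layer plus the six transformer layers (7 × 768 = 5376); `layer_mean=True` averages those layers back down to 768.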
def test_transformer_weird_sentences():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)

    sentence = Sentence("Hybrid mesons , qq ̄ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟 🤟 🤟 hüllo")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    sentence_2 = Sentence("second sentence")
    embeddings.embed([sentence, sentence_2])
    for token in sentence:
        assert len(token.get_embedding()) == 768
    for token in sentence_2:
        assert len(token.get_embedding()) == 768
from preprocessing.normalize import normalize
from utility.frequency_loader import load_frequencies, load_doc_frequencies
from utility.run_experiment import run_experiment
import os

if not os.path.exists(IMAGE_PATH):
    os.makedirs(IMAGE_PATH)

sick_all, sick_train, sick_test, sick_dev = download_and_load_sick_dataset()
print('Downloaded data')

frequency = load_frequencies("data/frequencies/frequencies.tsv")
doc_frequency = load_doc_frequencies("data/frequencies/doc_frequencies.tsv")
word2vec = load_word2vec(w2v_path)
elmo = ELMoEmbeddings('large')
bert = TransformerWordEmbeddings('bert-large-cased')
flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])
elmo_bert = StackedEmbeddings([elmo, bert])
print("Loaded Resources")

benchmarks = [("AVG-W2V", ft.partial(run_avg_benchmark, model=word2vec, use_stoplist=False)),
              ("AVG-ELMO", ft.partial(run_context_avg_benchmark,