def test_tagged_corpus_make_label_dictionary_string():
    """Check the label dictionary built from string-labelled sentences.

    Three training sentences carry two distinct labels; the dictionary must
    contain exactly those two labels and no ``<unk>`` entry.
    """
    labelled_texts = (
        (u'sentence 1', u'class_1'),
        (u'sentence 2', u'class_2'),
        (u'sentence 3', u'class_1'),
    )
    train_sentences = [
        Sentence(text, labels=[label]) for text, label in labelled_texts
    ]

    # Only the training split is populated; dev and test stay empty.
    corpus = TaggedCorpus(train_sentences, [], [])
    label_dict = corpus.make_label_dictionary()

    assert len(label_dict) == 2
    assert u'<unk>' not in label_dict.get_items()
    for expected_label in (u'class_1', u'class_2'):
        assert expected_label in label_dict.get_items()
def train(self, intent_fst) -> None:
    """Train the flair intent classifier and per-intent NER taggers.

    Training sentences are either sampled from the per-intent FSTs listed in
    the intent map (profile key ``intent.flair.do_sampling``, default True)
    or generated exhaustively from ``intent_fst``. Each intent's examples are
    repeated so that all intents contribute a comparable number of examples.
    A text classifier is trained when more than one intent has sentences,
    and one sequence tagger is trained for every intent that has at least
    one sentence with entities.

    Args:
        intent_fst: Intent FST passed to ``make_sentences_by_intent`` when
            sampling is disabled; unused when sampling is enabled.

    Raises:
        AssertionError: If the profile lists no word embeddings under
            ``intent.flair.embeddings``.
    """
    from flair.data import Sentence, Token
    from flair.models import SequenceTagger, TextClassifier
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )
    from flair.data import TaggedCorpus
    from flair.trainers import ModelTrainer

    # Directory to look for downloaded embeddings
    cache_dir = self.profile.read_path(
        self.profile.get("intent.flair.cache_dir", "flair/cache")
    )
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models; any previous models are discarded
    data_dir = self.profile.write_path(
        self.profile.get("intent.flair.data_dir", "flair/data")
    )
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = self.profile.get("intent.flair.embeddings", [])
    if not self.embeddings:
        # Raised explicitly (instead of via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        raise AssertionError("No word embeddings")

    # Create directories to write models to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    os.makedirs(class_data_dir, exist_ok=True)
    os.makedirs(ner_data_dir, exist_ok=True)

    # { intent: [ { 'tokens': [...], 'entities': [...] }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    do_sampling = self.profile.get("intent.flair.do_sampling", True)
    start_time = time.time()

    if do_sampling:
        # Sample from each intent FST
        num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
        intent_map_path = self.profile.read_path(
            self.profile.get("training.intent.intent_map", "intent_map.json")
        )

        with open(intent_map_path, "r", encoding="utf-8") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = self.profile.write_dir(
            self.profile.get("speech_to_text.fsts_dir")
        )
        intent_fst_paths = {
            intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
            for intent_id in intent_map
        }

        # Generate samples
        self._logger.debug(
            f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
        )
        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples
        )
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)"
        )
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

    # An intent without sentences contributes no examples and would make the
    # balancing division below fail with ZeroDivisionError, so leave it out.
    populated_intents = {}
    for intent_name, intent_sents in sentences_by_intent.items():
        if len(intent_sents) > 0:
            populated_intents[intent_name] = intent_sents
        else:
            self._logger.warning(
                f"No sentences generated for intent {intent_name}; skipping"
            )

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = (
        lcm(*(len(sents) for sents in populated_intents.values()))
        if populated_intents
        else 1
    )

    # Only train an intent classifier if there's more than one intent
    train_classifier = len(populated_intents) > 1

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in populated_intents.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            tokens = intent_sent["tokens"]
            entities = intent_sent["entities"]

            if train_classifier:
                # Add balanced copies
                for _ in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in tokens:
                        class_sent.add_token(Token(word))

                    class_sentences.append(class_sent)

            if not entities:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example.
            # token_idx advances by len(word) + 1 per token, so it is compared
            # against entity "start"/"end" as an offset into the tokens joined
            # by single separators.
            # NOTE(review): presumably character offsets into the
            # space-joined sentence; confirm against the sentence generator.
            token_idx = 0
            entity_start = {ev["start"]: ev for ev in entities}
            entity_end = {ev["end"]: ev for ev in entities}
            entity = None

            word_tags = []
            for word in tokens:
                # B- on the token where an entity starts, I- while inside
                # one, O otherwise
                tag = "O" if not entity else f"I-{entity}"
                if token_idx in entity_start:
                    entity = entity_start[token_idx]["entity"]
                    tag = f"B-{entity}"

                word_tags.append((word, tag))

                token_idx += len(word) + 1

                # The entity ends where the token just consumed ends
                if (token_idx - 1) in entity_end:
                    entity = None

            # Add balanced copies
            for _ in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    token = Token(word)
                    token.add_tag("ner", tag)
                    ner_sent.add_token(token)

                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug(f"Loading word embeddings from {cache_dir}")
    word_embeddings = [
        FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
        for e in self.embeddings
    ]

    if class_sentences:
        self._logger.debug("Training intent classifier")

        # Train/dev/test split is delegated to self._split_data
        class_train, class_dev, class_test = self._split_data(class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            f"Intent classifier has {len(class_sentences)} example(s)"
        )
        trainer = ModelTrainer(classifier, class_corpus)
        trainer.train(class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if ner_sentences:
        self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

        # Named entity recognition: one tagger per intent, shared embeddings
        stacked_embeddings = StackedEmbeddings(word_embeddings)

        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
            )
            trainer = ModelTrainer(tagger, ner_corpus)
            trainer.train(ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
def main(args):
    """Cross-validate a flair text classifier with the chosen embeddings.

    Builds the word embeddings selected by ``args.method``, reads the
    classification file ``args.data_file``, splits it into
    ``args.num_folds`` folds and, for each fold, trains a classifier on 90%
    of the non-test data (the remaining 10% is the dev set) and accumulates
    the reported ``test_score``. Progress and the mean score are printed.

    Args:
        args: Ignored; see the review note on the first statement.

    Raises:
        ValueError: If ``args.method`` is not one of the known options.
    """
    # NOTE(review): the ``args`` parameter is immediately replaced by the
    # module-level parser's view of the process command line. Kept as in the
    # original because the callers are not visible from here.
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    if args.method == 'glove':
        word_embeddings = [WordEmbeddings('glove')]
    elif args.method == 'flair':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]
    elif args.method == 'cui_svd':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy'))
        ]
    elif args.method == 'cui_proj':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim'))
        ]
    elif args.method == 'mimic':
        word_embeddings = [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim')
        ]
    elif args.method == 'cui2vec':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                ))
        ]
    elif args.method == 'mimic_lm':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt'),
        ]
    else:
        # ValueError is still caught by callers that catch ``Exception``.
        raise ValueError(
            "Received option for method %s that cannot be interpreted." %
            (args.method))

    # Multi-label mode is inferred from the data file name.
    multi = 'bg' in args.data_file
    if multi:
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))

    # 1. get the corpus
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    # The folds are contiguous and unshuffled, exactly as before. The
    # original also passed ``random_state=719``, which has no effect without
    # ``shuffle=True`` and is rejected by current scikit-learn.
    # NOTE(review): use ``shuffle=True, random_state=719`` here if shuffled
    # folds were intended.
    num_folds = args.num_folds
    kf = KFold(n_splits=num_folds)

    # 4. iterate over folds:
    examples = corpus.train
    total_acc = 0
    for fold, (train_index, test_index) in enumerate(kf.split(examples),
                                                     start=1):
        # 4a. build the per-fold corpus
        # Plain list indexing: wrapping Sentence objects in a NumPy array is
        # fragile because NumPy treats each sentence as a nested sequence.
        split_traindev = [examples[i] for i in train_index]
        train_dev_splitpoint = int(0.9 * len(split_traindev))
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]
        split_test = [examples[i] for i in test_index]
        split_corpus = TaggedCorpus(split_train,
                                    dev=split_dev,
                                    test=split_test)

        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training; the model files are discarded after each fold
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)

            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)

        fold_acc = results['test_score']
        total_acc += fold_acc
        print(f"Finished fold {fold} with accuracy {fold_acc}")

    total_acc /= num_folds
    print("Finished with total cross-fold accuracy of %f" % (total_acc))
from typing import List

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import (
    WordEmbeddings,
    CharLMEmbeddings,
    DocumentLSTMEmbeddings,
)
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

# 1. load the train / dev / test splits (read in that order) and build the corpus
sentences_train: List[Sentence]
sentences_dev: List[Sentence]
sentences_test: List[Sentence]
sentences_train, sentences_dev, sentences_test = (
    NLPTaskDataFetcher.read_text_classification_file(split_file)
    for split_file in (
        'training.preprocessed.txt',
        'dev.preprocessed.txt',
        'test.preprocessed.txt',
    )
)
corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [
    WordEmbeddings('de-fasttext'),
    CharLMEmbeddings('german-forward'),
    CharLMEmbeddings('german-backward'),
]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_states=32,
)

# 5. create the text classifier
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    multi_label=False,
)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)