Example #1
from flair.data import Sentence, TaggedCorpus

def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence(u'sentence 1', labels=[u'class_1'])
    sentence_2 = Sentence(u'sentence 2', labels=[u'class_2'])
    sentence_3 = Sentence(u'sentence 3', labels=[u'class_1'])
    corpus = TaggedCorpus([sentence_1, sentence_2, sentence_3], [], [])
    label_dict = corpus.make_label_dictionary()
    assert (2 == len(label_dict))
    assert (u'<unk>' not in label_dict.get_items())
    assert (u'class_1' in label_dict.get_items())
    assert (u'class_2' in label_dict.get_items())
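
The dictionary returned by make_label_dictionary() is a flair Dictionary mapping each label string to an integer index. A minimal follow-up sketch, assuming the standard Dictionary API (get_idx_for_item); the exact index values are an assumption:

# With two labels and no <unk> entry, the assigned indices are 0 and 1
# (insertion order is not guaranteed here).
assert label_dict.get_idx_for_item(u'class_1') in (0, 1)
assert label_dict.get_idx_for_item(u'class_2') in (0, 1)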
Example #2
    def train(self, intent_fst) -> None:
        from flair.data import Sentence, Token
        from flair.models import SequenceTagger, TextClassifier
        from flair.embeddings import (
            FlairEmbeddings,
            StackedEmbeddings,
            DocumentRNNEmbeddings,
        )
        from flair.data import TaggedCorpus
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        class_data_path = os.path.join(class_data_dir, "train.txt")
        ner_data_path = os.path.join(ner_data_dir, "train.txt")

        # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

        # Get least common multiple in order to balance sentences by intent
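        # Worked example: intents with 2, 3 and 4 sentences give an LCM of 12,
        # so their sentences are repeated 6x, 4x and 3x respectively below.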
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for i in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
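                # entity_start / entity_end are keyed by *character* offsets
                # into the sentence text, which token_idx tracks below.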
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # Advance the character offset past this word and the
                    # following space, matching the entity start/end offsets.
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for i in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug(f"Loading word embeddings from {cache_dir}")
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                f"Intent classifier has {len(class_sentences)} example(s)"
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example #3
import tempfile
from typing import List

import numpy as np
from sklearn.model_selection import KFold

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# BackOffEmbeddings is a project-specific embedding class, not part of flair.

def main(args):
    # parser is assumed to be an argparse.ArgumentParser defined at module level.
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    if args.method == 'glove':
        word_embeddings = [WordEmbeddings('glove')]
    elif args.method == 'flair':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ]
    elif args.method == 'cui_svd':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy'))
        ]
    elif args.method == 'cui_proj':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim'))
        ]
    elif args.method == 'mimic':
        word_embeddings = [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim')
        ]
    elif args.method == 'cui2vec':
        word_embeddings = [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                ))
        ]
    elif args.method == 'mimic_lm':
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt')
        ]
    else:
        raise Exception(
            "Received option for method %s that cannot be interpreted." %
            (args.method))

    if 'bg' in args.data_file:
        multi = True
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))
    else:
        multi = False

    # 1. get the corpus
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    num_folds = args.num_folds
    seed = 719
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    # 4. iterate over folds:
    total_acc = 0
    fold = 1
    for train_index, test_index in kf.split(corpus.train):
        # 4a. initialize the text classifier trainer
        split_traindev = np.array(corpus.train)[train_index].tolist()
        traindev_size = len(split_traindev)
        train_dev_splitpoint = int(0.9 * traindev_size)
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]

        split_test = np.array(corpus.train)[test_index].tolist()
        split_corpus = TaggedCorpus(split_train,
                                    dev=split_dev,
                                    test=split_test)

        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training:
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)

            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)

        fold_acc = results['test_score']
        total_acc += fold_acc
        print(f"Finished fold {fold} with accuracy {fold_acc}")
        fold += 1
    total_acc /= num_folds

    print("Finished with total cross-fold accuracy of %f" % (total_acc))
Example #4
from typing import List

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

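# 1. get the corpus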
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('training.preprocessed.txt')
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('dev.preprocessed.txt')
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('test.preprocessed.txt')

corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('de-fasttext'),
                   CharLMEmbeddings('german-forward'),
                   CharLMEmbeddings('german-backward')]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_states=32)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)
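
The excerpt ends before training starts. With the flair 0.3-era TextClassifierTrainer used here, the final step would look roughly like this (output path and hyperparameters are illustrative assumptions):

# 7. start the training
trainer.train('resources/classifiers/german-text',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)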