Example #1
 def post_init(self):
     import flair
     flair.device = self.device
     from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
         DocumentPoolEmbeddings
     embeddings_list = []
     for e in self.embeddings:
         model_name, model_id = e.split(':', maxsplit=1)
         emb = None
         try:
             if model_name == 'flair':
                 emb = FlairEmbeddings(model_id)
             elif model_name == 'pooledflair':
                 emb = PooledFlairEmbeddings(model_id)
             elif model_name == 'word':
                 emb = WordEmbeddings(model_id)
             elif model_name == 'byte-pair':
                 emb = BytePairEmbeddings(model_id)
         except ValueError:
             self.logger.error(f'embedding not found: {e}')
             continue
         if emb is not None:
             embeddings_list.append(emb)
     if embeddings_list:
         self.model = DocumentPoolEmbeddings(embeddings_list,
                                             pooling=self.pooling_strategy)
         self.logger.info(
             f'flair encoder initialized with embeddings: {self.embeddings}'
         )
     else:
         self.logger.error('flair encoder initialization failed.')
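
The loop above parses each entry of self.embeddings as a 'type:model_id' pair, e.g. 'word:glove' or 'flair:news-forward'. A minimal standalone sketch of the resulting encoder, assuming only that flair is installed (the model names are illustrative, not taken from the original):

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings

# pool two token-level embeddings into one fixed-length document vector
model = DocumentPoolEmbeddings(
    [WordEmbeddings('glove'), FlairEmbeddings('news-forward')],
    pooling='mean')

sentence = Sentence('hello world')
model.embed(sentence)               # embeds the sentence in place
vector = sentence.get_embedding()   # fixed-length torch.Tensor
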
Example #2
    def post_init(self):
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings

        if self.model is not None:
            return
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error('embedding not found: {}'.format(e))
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list,
                                                pooling=self.pooling_strategy)
            self.logger.info(
                'initialize flair encoder with embeddings: {}'.format(
                    self.embeddings))
Example #3
 def __init__(self, pipeline):
     self.mode = pipeline.mode
     self.type = pipeline.embedding_type
     embedders = []
     for component in pipeline.embedders:
         if "forward" in component or "backward" in component:
             embedders.append(FlairEmbeddings(component))
         elif "glove" in component:
             embedders.append(WordEmbeddings(component))
         elif "bert" in component:
             embedders.append(BertEmbeddings(component))
         elif len(component) == 2:
             # see https://github.com/zalandoresearch/flair/blob/master/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md#fasttext-embeddings
             embedders.append(WordEmbeddings(component))
             embedders.append(BytePairEmbeddings(component))
         else:
             raise ValueError(f"unknown embedder: {component}")
     if self.type == "document":
         self.embedder = self._make_doc_embedder(pipeline, embedders)
     elif self.type == "word":
         self.embedder = StackedEmbeddings(embedders)
     elif self.type == "both":
         self.embedders = [
             self._make_doc_embedder(pipeline, embedders),
             StackedEmbeddings(embedders),
         ]
     else:
         raise ValueError(
             f"Innapropriate embedding type {pipeline.embedding_type}, "
             "should be 'word', 'document', or 'both'.")
Example #4
    def build_embedding(self, lang, embedding_codes: List[str]) -> None:

        self.tic = time.time()
        self.embedding_name: str = "-".join(embedding_codes)
        self.lang = lang

        embedding_types: List[TokenEmbeddings] = []

        for code in embedding_codes:

            code = code.lower()
            assert code in [
                "bpe",
                "bert",
                "flair",
                "ft",
                "char",
                "ohe",
                "elmo",
            ], f"{code} - Invalid embedding code"

            if code == "ohe":
                embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
            elif code == "ft":
                embedding_types.append(WordEmbeddings(self.lang))
            elif code == "bpe":
                embedding_types.append(BytePairEmbeddings(self.lang))
            elif code == "bert":
                embedding_types.append(
                    TransformerWordEmbeddings(
                        model=self.huggingface_ref[self.lang],
                        pooling_operation="first",
                        layers="-1",
                        fine_tune=False,
                    )
                )
            elif code == "char":
                embedding_types.append(CharacterEmbeddings())
            elif code == "flair":
                embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
                embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
            elif code == "elmo":
                embedding_types.append(
                    ELMoEmbeddings(model="large", embedding_mode="all")
                )

        self.embedding: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types
        )

        self.tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=self.embedding,
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True,
        )

        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
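
build_embedding leaves a ready ModelTrainer in self.trainer; training is then typically kicked off as sketched below. The output directory and hyperparameters are illustrative, not from the original:

# minimal sketch of launching the trainer built above
self.trainer.train(
    'resources/taggers/example-ner',  # where checkpoints and logs are written
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
)
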
Example #5
def load_model():
    """Load word embeddings model."""
    fasttext_embedding = WordEmbeddings('en-crawl')
    byte_embedding = BytePairEmbeddings('en')
    flair_embedding_forward = FlairEmbeddings('en-forward')
    flair_embedding_backward = FlairEmbeddings('en-backward')

    document_embeddings = DocumentPoolEmbeddings([fasttext_embedding,
                                                  byte_embedding,
                                                  flair_embedding_backward,
                                                  flair_embedding_forward,
                                                  ],
                                                 fine_tune_mode='nonlinear')
    return document_embeddings
Example #6
def load_doc_embeddings():
    """Load word embeddings model."""
    fasttext_embedding = WordEmbeddings('en-crawl')
    byte_embedding = BytePairEmbeddings('en')
    flair_embedding_forward = FlairEmbeddings('en-forward')
    flair_embedding_backward = FlairEmbeddings('en-backward')

    document_embeddings = DocumentPoolEmbeddings([fasttext_embedding,
                                                  byte_embedding,
                                                  flair_embedding_backward,
                                                  flair_embedding_forward  # trailing comma removed here; note that a trailing comma in a list literal is valid Python, so it never caused an error
                                                  ],
                                                 fine_tune_mode='nonlinear')
    return document_embeddings
Example #7
def get_embeddings(embeddings: List[str], character: bool, lang: str,
                   bpe_size: int) -> StackedEmbeddings:
    """To Construct and return a embedding model"""
    stack = []
    for e in embeddings:
        if e != '':
            if 'forward' in e or 'backward' in e:
                stack.append(FlairEmbeddings(e))
            else:
                stack.append(WordEmbeddings(e))
    if character:
        stack.append(CharacterEmbeddings())
    if bpe_size > 0:
        stack.append(BytePairEmbeddings(language=lang, dim=bpe_size))

    return StackedEmbeddings(embeddings=stack)
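
A hedged example of calling get_embeddings: entries containing 'forward' or 'backward' load FlairEmbeddings, anything else loads WordEmbeddings, with optional character and byte-pair embeddings on top (argument values are illustrative):

stack = get_embeddings(
    embeddings=['glove', 'news-forward', 'news-backward'],
    character=True,
    lang='en',
    bpe_size=50,   # pretrained BPEmb dims are typically 25/50/100/200/300
)
print(stack.embedding_length)  # total dimensionality of the stack
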
Example #8
 def __init__(self, config):
     """
     Load pretrained language model
     """
     super(LanguageModel, self).__init__()
     embeddings_stack = []
     transformers = config.get("language_model", "transformers")
     if transformers != "":
         transformers = transformers.split(";")
         for model in transformers:
             embeddings_stack.append(
                 TransformerWordEmbeddings(
                     model,
                     layers="-1",
                     pooling_operation='mean',
                     # use_scalar_mix=True,
                     fine_tune=True))
     word_embeddings = config.get("language_model", "word_embeddings")
     if word_embeddings != "":
         word_embeddings = word_embeddings.split(";")
         for model in word_embeddings:
             embeddings_stack.append(WordEmbeddings(model))
     flair_embeddings = config.get("language_model", "flair_embeddings")
     if flair_embeddings != "":
         flair_embeddings = flair_embeddings.split(";")
         for model in flair_embeddings:
             embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))
     character_embeddings = config.get("language_model",
                                       "character_embeddigs")  # (sic) key name kept as in the config
     if character_embeddings.lower() == "yes":
         embeddings_stack.append(CharacterEmbeddings())
     bytepair_embeddings = config.get("language_model",
                                      "bytepair_embeddings")
     if bytepair_embeddings.lower() == "yes":
         # note: BytePairEmbeddings usually needs a language,
         # e.g. BytePairEmbeddings("en")
         embeddings_stack.append(BytePairEmbeddings())
     custom_embeddings = config.get("language_model", "custom_embeddings")
     if custom_embeddings != "":
         custom_embeddings = custom_embeddings.split(";")
         for path in custom_embeddings:
             embeddings_stack.append(WordEmbeddings(path))
     self.lm = StackedEmbeddings(embeddings_stack)
     self.embedding_dim = self.lm.embedding_length
     self.dropout = torch.nn.Dropout(
         float(config.get("language_model", "dropout")))
     self.classify = torch.nn.Linear(self.embedding_dim, 2)
     if config.get("language_model", "relu") == "yes":
         self.relu = torch.nn.ReLU()
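
config here behaves like a configparser.ConfigParser. A hypothetical INI section consistent with the keys read above (values are illustrative only, and the 'character_embeddigs' spelling mirrors the code):

import configparser

config = configparser.ConfigParser()
config.read_string("""
[language_model]
transformers = bert-base-cased
word_embeddings = glove
flair_embeddings = news-forward;news-backward
character_embeddigs = no
bytepair_embeddings = no
custom_embeddings =
dropout = 0.1
relu = yes
""")
print(config.get("language_model", "flair_embeddings").split(";"))
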
Example #9
def get_embeddings(pooling_op='min'):
    return StackedEmbeddings(embeddings=[
        # pre-trained embeddings
        PooledFlairEmbeddings(
            'es-forward',
            pooling=pooling_op,
        ),
        PooledFlairEmbeddings(
            'es-backward',
            pooling=pooling_op,
        ),
        BytePairEmbeddings(
            language='es',
            dim=300,
        ),

        # self-trained embeddings
        SpanishHealthCorpusEmbeddings('wang2vec'),
        # SpanishHealthCorpusEmbeddings('fastText'),
    ])
Example #10
    def __init__(self, device="cpu"):
        super(RankNetWithEmbeddings, self).__init__()

        self._device = device

        fasttext_embedding = WordEmbeddings('en-news')
        # flair_embedding_forward = FlairEmbeddings('news-forward')
        # flair_embedding_backward = FlairEmbeddings('news-backward')
        byte_pair_embedding = BytePairEmbeddings('en')
        glove_embeddings = WordEmbeddings('glove')
        character_embedding = CharacterEmbeddings()

        self._mention_embedding = DocumentPoolEmbeddings([fasttext_embedding])
        self._label_embedding = DocumentPoolEmbeddings([
            fasttext_embedding,
        ])
        self._context_embedding = DocumentPoolEmbeddings([fasttext_embedding])
        self._description_embedding = DocumentPoolEmbeddings([
            fasttext_embedding,
        ])

        input_length =   self._mention_embedding.embedding_length \
                       + self._context_embedding.embedding_length \
                       + self._label_embedding.embedding_length   \
                       + self._description_embedding.embedding_length

        self.model = nn.Sequential(
            nn.Linear(input_length, 256),
            nn.ReLU(),
            # nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            # nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Tanh(),
        )

        self.output_sig = nn.Sigmoid()
        self.to(device)
Example #11
 def __init__(self, make_unit_length=True):
     super().__init__(n_dims=200, make_unit_length=make_unit_length)
     embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
     self.embeddings = DocumentPoolEmbeddings(embeddings)
     self.log = getLogger(type(self).__name__)
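
The n_dims=200 above matches the combined length of the pooled stack: with flair's bundled defaults, 'glove' is 100-dimensional and BytePairEmbeddings('en') contributes 2x50 = 100 (begin and end subword vectors are concatenated). A quick check, assuming those defaults:

from flair.embeddings import WordEmbeddings, BytePairEmbeddings, DocumentPoolEmbeddings

pooled = DocumentPoolEmbeddings([WordEmbeddings('glove'), BytePairEmbeddings('en')])
print(pooled.embedding_length)  # expected: 200 with the defaults described above
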
Example #12
 def __init__(self,
              lang,
              embeddings_dim,
              embedding_weights,
              hidden_dim,
              hidden_layers,
              dropout,
              output_layers=["embed_wsd"],
              lemma2synsets=None,
              synsets2id={},
              pos_tags={},
              entity_tags={},
              use_flair=False,
              combine_WN_FN=False):
     super(WSDModel, self).__init__()
     self.use_flair = use_flair
     self.combine_WN_FN = combine_WN_FN
     self.output_layers = output_layers
     self.hidden_layers = hidden_layers
     self.hidden_dim = hidden_dim
     self.num_wsd_classes = 0
     self.synsets2id = synsets2id
     output_emb_dim = embeddings_dim
     if use_flair is True:
         if lang == "Bulgarian":
             # BG EMBEDDINGS:
             self.word_embeddings = StackedEmbeddings([
                 WordEmbeddings(
                     '/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV.gensim'
                 ),
                 # WordEmbeddings('bg'),
                 # FastTextEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV'),
                 # Byte pair embeddings for Bulgarian
                 BytePairEmbeddings('bg'),
                 FlairEmbeddings('bg-forward-fast'),
                 FlairEmbeddings('bg-backward-fast'),
                 CharacterEmbeddings()
             ])
         elif lang == "English":
             # EN EMBEDDINGS:
             self.word_embeddings = StackedEmbeddings([
                 WordEmbeddings(
                     '/home/lenovo/dev/word-embeddings/glove.6B/glove.6B.300d_MOD.gensim'
                 ),
                 WordEmbeddings(
                     '/home/lenovo/dev/word-embeddings/lemma_sense_embeddings/'
                     'WN30WN30glConOne-C15I7S7N5_200M_syn_and_lemma_WikipediaLemmatized_FILTERED.gensim'
                 ),
                 # WordEmbeddings('bg'),
                 # FastTextEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV'),
                 # Byte pair embeddings for English
                 BytePairEmbeddings('en'),
                 FlairEmbeddings('en-forward-fast'),
                 FlairEmbeddings('en-backward-fast'),
                 CharacterEmbeddings()
             ])
         else:
             print("Unknown language!")
             exit(1)
         embeddings_dim = self.word_embeddings.embedding_length
     else:
         self.word_embeddings = nn.Embedding.from_pretrained(
             embedding_weights, freeze=True)
     self.lstm = nn.LSTM(embeddings_dim,
                         hidden_dim,
                         hidden_layers,
                         bidirectional=True,
                         batch_first=True,
                         dropout=dropout)
     if "embed_wsd" in self.output_layers:
         # We want output with the size of the lemma&synset embeddings
         self.emb_relu = nn.ReLU()
         self.output_emb = nn.Linear(2 * hidden_dim, output_emb_dim)
     if "embed_frameID" in self.output_layers:
         self.emb_relu_frames = nn.ReLU()
         self.output_emb_frames = nn.Linear(2 * hidden_dim, output_emb_dim)
     if "classify_wsd" in self.output_layers:
         if len(self.synsets2id) > 0:
             self.output_classify = nn.Linear(2 * hidden_dim,
                                              len(self.synsets2id))
             self.num_wsd_classes = len(self.synsets2id)
         else:
             lemma2layers = collections.OrderedDict()
             for lemma, synsets in lemma2synsets.items():
                 lemma2layers[lemma] = nn.Linear(2 * hidden_dim,
                                                 len(synsets))
                 if len(synsets) > self.num_wsd_classes:
                     self.num_wsd_classes = len(synsets)
             self.classifiers = nn.Sequential(lemma2layers)
     if "pos_tagger" in self.output_layers:
         self.pos_tags = nn.Linear(2 * hidden_dim, len(pos_tags))
     if "ner" in self.output_layers:
         self.ner = nn.Linear(2 * hidden_dim, len(entity_tags))
     self.dropout = nn.Dropout(dropout)
Example #13
    def train_all(self):
        with open(self.config, "r") as config_file:
            if self.config.split('.')[-1] == "yml":
                datastore = yaml.safe_load(config_file)
            elif self.config.split('.')[-1] == "json":
                datastore = json.loads(config_file.read())
            else:
                print("Need a json or yaml file as config")
                sys.exit(0)

        columns = {
            int(datastore["dataset_reader"]["position_text"]): "text",
            int(datastore["dataset_reader"]["position_ner"]): "ner",
        }

        # focus_on = datastore["dataset_reader"]["focus_on"]

        if bool(datastore["dataset_reader"]["only_train"]):

            all_corpus = []
            log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))

            all_corpus = ColumnCorpusTrain(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
            )

            tag_type = "ner"
            tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)

        else:

            iobes_corpus = ColumnCorpus(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
                dev_file=datastore["dataset_reader"]["dev_name"],
                test_file=datastore["dataset_reader"]["test_name"],
            )

            tag_type = "ner"
            tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)

            try:
                train_ratio = float(datastore["dataset_reader"]["train_ratio"])
                iobes_corpus = Corpus(iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                                      iobes_corpus.dev, iobes_corpus.test)
                log.info("Using only %s%% of the train dataset", train_ratio * 100)
            except (KeyError, ValueError):
                pass  # no (valid) train_ratio configured

        embed_list = []
        word_char = []
        char_word = []
        for embed in datastore["embeddings"]["embeddings_list"]:

            if embed == "bpe":
                embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "fasttext":
                embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
                embed_list.append(FlairEmbeddings("news-forward"))
                embed_list.append(FlairEmbeddings("news-backward"))
            elif embed == "bert-base-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-uncased"))
            elif embed == "bert-base-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-cased"))
            elif embed == "bert-large-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-uncased"))
            elif embed == "bert-large-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-cased"))
            elif embed == "elmo-small":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("small"))
            elif embed == "elmo-medium":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("medium"))
            elif embed == "elmo-original":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("original"))
            elif embed == "bert-base-chinese":
                if datastore["embeddings"]["lang"] == "zh":
                    embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
            else:
                split_name = embed.split(".")
                ext = split_name[-1]
                kind = split_name[-2]

                if ext == "pt":  # Flair type

                    extra_index = 0
                    try:
                        extra_index = int(datastore["embeddings"]["extra_index"])
                    except (KeyError, ValueError):
                        pass  # optional config key

                    if kind == "char":
                        embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                    elif kind == "char-seg":
                        embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))

                if ext == "vec":  # Char type
                    if kind == "char-seg":
                        embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                    elif kind == "char":
                        embed_list.append(emb.WordEmbeddingsVecFirst(embed))
                    elif kind == "word":
                        embed_list.append(emb.WordEmbeddingsVecWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsVecBichar(embed))
                if ext == "bin":
                    if kind == "word":
                        embed_list.append(emb.WordEmbeddingsBinWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsBinBichar(embed))

        try:
            if bool(datastore["embeddings"]["ner_embed"]):
                print("Generate NER embeddings..")
                embed_list.append(
                    emb.nerEmbedding(
                        generateNerEmbFromTrain(
                            iobes_corpus.train, tag_dictionary.get_items()
                        )
                    )
                )
        except Exception:
            pass  # optional feature; also skipped in only_train mode
        try:
            if bool(datastore["embeddings"]["one_hot"]):
                print("Generate one hot embeddings..")
                embed_list.append(emb.OneHotEmbeddings(iobes_corpus))
        except Exception:
            pass  # optional config key
        try:
            if datastore["embeddings"]["embeddings_ngram_list"] is not None:
                embed_list.append(
                    emb.WordEmbeddingsVecNGramList(
                        datastore["embeddings"]["embeddings_ngram_list"]
                    )
                )
        except Exception:
            pass  # optional config key

        if len(word_char) == 1 and len(char_word) == 1:
            embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0]))

        embedding_types: List[TokenEmbeddings] = embed_list

        embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew(
            embeddings=embedding_types
        )

        if bool(datastore["dataset_reader"]["only_train"]):
            score = []
            for i in range(len(all_corpus)):

                tagger: SequenceTagger = SequenceTagger(
                    hidden_size=int(datastore["model"]["hidden_size"]),
                    embeddings=embeddings,
                    tag_dictionary=tag_dictionary,
                    tag_type=tag_type,
                    use_crf=bool(datastore["model"]["use_crf"]),
                    dropout=float(datastore["model"]["dropout"]),
                    word_dropout=float(datastore["model"]["word_dropout"]),
                    locked_dropout=float(datastore["model"]["locked_dropout"]),
                    rnn_layers=int(datastore["model"]["rnn_layers"]),
                )

                folder = datastore["train_config"]["folder"] + "/" + str(i)
                best = Path(folder + "/checkpoint.pt")
                iobes_corpus = all_corpus[i]
                if not best.exists():
                    best = Path(folder + "/best-model.pt")

                if best.exists():
                    trainer = ModelTrainer.load_checkpoint(
                        tagger.load_checkpoint(best), iobes_corpus
                    )
                else:
                    trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

                # 7. start training

                result = trainer.train(
                    folder,
                    learning_rate=float(datastore["train_config"]["learning_rate"]),
                    anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                    min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                    mini_batch_size=int(datastore["train_config"]["batch_size"]),
                    max_epochs=int(datastore["train_config"]["epoch"]),
                    save_final_model=bool(datastore["train_config"]["save_final_model"]),
                    checkpoint=bool(datastore["train_config"]["checkpoint"]),
                    param_selection_mode=bool(
                        datastore["train_config"]["param_selection_mode"]
                    ),
                    patience=int(datastore["train_config"]["patience"]),
                    monitor_test=bool(datastore["train_config"]["monitor_test"]),
                    embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                    shuffle=bool(datastore["train_config"]["shuffle"]),
                )

                plotter = Plotter()
                if bool(datastore["train_config"]["save_plot_training_curve"]):
                    curve = folder + "/loss.tsv"
                    plotter.plot_training_curves(curve)
                if bool(datastore["train_config"]["save_plot_weights"]):
                    weight = folder + "/weights.txt"
                    plotter.plot_weights(weight)

                score.append(result["test_score"])

            print(score, "\nMean:", round(sum(score) / len(score), 2))


        else:

            tagger: SequenceTagger = SequenceTagger(
                hidden_size=int(datastore["model"]["hidden_size"]),
                embeddings=embeddings,
                tag_dictionary=tag_dictionary,
                tag_type=tag_type,
                use_crf=bool(datastore["model"]["use_crf"]),
                dropout=float(datastore["model"]["dropout"]),
                word_dropout=float(datastore["model"]["word_dropout"]),
                locked_dropout=float(datastore["model"]["locked_dropout"]),
                rnn_layers=int(datastore["model"]["rnn_layers"]),
            )

            folder = datastore["train_config"]["folder"]
            best = Path(folder + "/checkpoint.pt")
            if not best.exists():
                best = Path(folder + "/best-model.pt")

            if best.exists():
                trainer = ModelTrainer.load_checkpoint(
                    tagger.load_checkpoint(best), iobes_corpus
                )
            else:
                trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

            # 7. start training

            trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(
                    datastore["train_config"]["param_selection_mode"]
                ),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )

            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                curve = folder + "/loss.tsv"
                plotter.plot_training_curves(curve)
            if bool(datastore["train_config"]["save_plot_weights"]):
                weight = folder + "/weights.txt"
                plotter.plot_weights(weight)
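
train_all expects its JSON or YAML config to expose dataset_reader, embeddings, model and train_config sections. A hypothetical, partial JSON sketch of the keys exercised by the bpe/flair path (values are illustrative; further keys are read elsewhere in the method):

import json

config = json.loads("""
{
  "dataset_reader": {"data_folder": "data", "train_name": "train.txt",
                     "dev_name": "dev.txt", "test_name": "test.txt",
                     "position_text": 0, "position_ner": 1,
                     "only_train": false},
  "embeddings": {"lang": "en", "embeddings_list": ["bpe", "flair"]},
  "model": {"hidden_size": 256, "use_crf": true, "dropout": 0.0,
            "word_dropout": 0.05, "locked_dropout": 0.5, "rnn_layers": 1},
  "train_config": {"folder": "out", "learning_rate": 0.1, "batch_size": 32,
                   "epoch": 100, "patience": 3, "shuffle": true}
}
""")
print(config["embeddings"]["embeddings_list"])
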
Example #14
                              test_file='flair_test_small.txt',
                              dev_file='flair_dev_small.txt')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
# bert-base-chinese

embedding_types = [
    WordEmbeddings('zh'),
    BytePairEmbeddings('multi'),
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
Example #15
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
Example #16
        1: "ner"
    },
    tag_to_bioes="ner",
    skip_first_line=True)
print(corpus)

# 2. what tag do we want to predict?
tag_type = "ner"

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    BytePairEmbeddings(language="multi", dim=300, syllables=1000000)
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
Example #17
 def __init__(self, n_dims=200, make_unit_length=True):
     from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, BytePairEmbeddings
     super().__init__(n_dims=n_dims, make_unit_length=make_unit_length)
     embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
     self.embeddings = DocumentPoolEmbeddings(embeddings, fine_tune_mode='none')
     self.log = getLogger(type(self).__name__)
Example #18
 def __create_models(self):
     models = []
     models_fit = []
     #for _params in self.model_params:
     _params = {}
     for k, v in self.params.items():
         if k.startswith('_'):
             continue
         _params[k] = v
     self.textModels = dict(
         mtc=TextModel(_params).fit(self.train),
         #charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
         #charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings(self.lang)]),
         ##charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings('multi')]),
         langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
         charLangMultiEmb=DocumentPoolEmbeddings([
             CharacterEmbeddings(),
             BytePairEmbeddings(self.lang),
             BytePairEmbeddings('multi')
         ]),
         langMultiEmb=DocumentPoolEmbeddings(
             [BytePairEmbeddings(self.lang),
              BytePairEmbeddings('multi')]),
         bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
         #flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
         #flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
         #bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
     )
     for km, tmodel in self.textModels.items():
         models.append({'name': km})
         models_fit.append({'name': km})
         if km == 'mtc':
             xt = tmodel.transform(self.train)
             xv = tmodel.transform(self.validation)
             X = tmodel.transform(self.data)
         else:
             sentences_train = [Sentence(txt) for txt in self.train]
             tmodel.embed(sentences_train)
             xt = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_train
             ])
             sentences_val = [Sentence(txt) for txt in self.validation]
             tmodel.embed(sentences_val)
             xv = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_val
             ])
             sentences = [Sentence(txt) for txt in self.data]
             tmodel.embed(sentences)
             X = np.array([
                 e.get_embedding().cpu().detach().numpy() for e in sentences
             ])
         models[-1]['xv'] = xv
         models[-1]['xt'] = xt
         models_fit[-1]['xt'] = X
         #max_iter=5000
         #if km=='mtc': max_iter=1000
         #if km=='langMulti': max_iter=5000
         #self.models[-1]['clf']=LinearSVC(max_iter=max_iter).fit(xt,self.yt)
         #yp=self.models[-1]['clf'].decision_function(xv)
         #scaler=Normalizer().fit(yp)
         #self.models[-1]['macroF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['weightedF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['score']=f1_score(self.yv,np.argmax(yp,axis=1),average='weighted')
         #self.models[-1]['probas']=scaler.transform(yp)
         ### Fit model with all available data
         #self.models_fit[-1]['clf']=LinearSVC(max_iter=max_iter).fit(X,self.y)
     print('Fitting Ensemble')
     #self.models  =  Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
     #self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
     self.models, self.models_fit = [], []
     for md, mdf in zip(models, models_fit):
         self.models.append(self._train_model(md))
         # fit the all-data variant with mdf, not md
         self.models_fit.append(self._train_model(mdf))
Example #19
def perform_text_classification(data,
                                info_need,
                                condition,
                                resample=False,
                                remove_stopwords=False,
                                classifier="SVM",
                                verbose=1,
                                data_source="cooking",
                                random_state=42,
                                warning_on_off="off"):

    # =====================================================================
    # ================= data preparation and other things =================
    # =====================================================================
    nltk.download('stopwords')

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3"  # use the third GPU
    # If there's a GPU available...
    if torch.cuda.is_available():

        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")

        print('There are %d GPU(s) available.' % torch.cuda.device_count())

        print('We will use the GPU:', torch.cuda.get_device_name(0))

    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    fast_text_embeddings = StackedEmbeddings(
        [WordEmbeddings('de'), BytePairEmbeddings('de')])
    document_embeddings = DocumentPoolEmbeddings([fast_text_embeddings])

    if remove_stopwords:
        print("Remove stopwords...")
        data['utterance'] = data['utterance'].apply(remove_stop_words)

    label_encoder = LabelEncoder()
    y = data['level_1']

    # Control warnings
    if warning_on_off == "off":
        warnings.filterwarnings("ignore")

    elif warning_on_off == "on":
        warnings.simplefilter('always')

    # =================================================
    # ================= featurization =================
    # =================================================

    if verbose == 1:
        print("=== Starting embedding process...")
    word_embedding_corpus = convert_to_embeddings(data['utterance'],
                                                  document_embeddings)

    # =======================================================
    # ================= building classfiers =================
    # =======================================================
    scoring = {
        'f1_infoneed': make_scorer(f1_score, pos_label=info_need),
        'f1_other': make_scorer(f1_score, pos_label="Other"),
        'recall_infoneed': make_scorer(recall_score, pos_label=info_need),
        'precision_infoneed': make_scorer(precision_score,
                                          pos_label=info_need),
        'recall_other': make_scorer(recall_score, pos_label="Other"),
        'precision_other': make_scorer(precision_score, pos_label="Other"),
        'f1_micro': 'f1_micro',
        'f1_macro': 'f1_macro',
        'f1_weighted': 'f1_weighted',
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'precision_weighted': 'precision_weighted',
        'recall_micro': 'recall_micro',
        'recall_macro': 'recall_macro',
        'recall_weighted': 'recall_weighted'
    }

    if resample:
        print("Resample data...")
        smote_enn = SMOTEENN(random_state=42)
        word_embedding_corpus, y = smote_enn.fit_resample(
            word_embedding_corpus, y)
        print(sorted(Counter(y).items()))

    # ================================ NAIVE BAYES =====================================
    if classifier == "NB":

        if verbose == 1:
            print("=== Building Classifier:", classifier)
        clf = GaussianNB()
        clf_scores = cross_validate(clf,
                                    word_embedding_corpus,
                                    y,
                                    cv=10,
                                    scoring=scoring)
        print(
            f"Classifier: NB -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )

    # =========================== SUPPORT VECTOR MACHINE ================================
    elif classifier == "SVM":

        if verbose == 1:
            print("=== Building Classifier:", classifier)

        clf = SVC(gamma='scale')
        clf_scores = cross_validate(clf,
                                    word_embedding_corpus,
                                    y,
                                    cv=10,
                                    scoring=scoring)
        print(
            f"Classifier: SVM -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )

    # ================================ RANDOM FOREST ======================================
    elif classifier == "RF":

        if verbose == 1:
            print("=== Building Classifier:", classifier)

        clf = RandomForestClassifier(class_weight="balanced")
        clf_scores = cross_validate(clf,
                                    word_embedding_corpus,
                                    y,
                                    cv=10,
                                    scoring=scoring)
        print(
            f"Classifier: RF -- Condition: {condition} -- Info Need: {info_need} -- F1 Average: {mean(clf_scores['test_f1_infoneed'])}"
        )

    df_classification_report = pd.DataFrame(columns=[
        'loss', 'task_name', 'info_need', 'model', 'num_epochs', 'condition',
        'acc', 'f1_other', 'f1_infoneed', 'precision_infoneed',
        'recall_infoneed', 'recall_other', 'precision_other', 'recall_macro',
        'precision_macro', 'recall_micro', 'precision_micro',
        'recall_weighted', 'precision_weighted', 'f1_weighted', 'f1_macro',
        'f1_micro', 'mcc', 'report', 'preds', 'labels', 'stopwords_removed',
        'resampled'
    ])

    for fold in range(10):
        df_classification_report.loc[fold] = [
            'None', 'text_classification', info_need, classifier, 'None',
            condition, 'None', clf_scores['test_f1_other'][fold],
            clf_scores['test_f1_infoneed'][fold],
            clf_scores['test_precision_infoneed'][fold],
            clf_scores['test_recall_infoneed'][fold],
            clf_scores['test_recall_other'][fold],
            clf_scores['test_precision_other'][fold],
            clf_scores['test_recall_macro'][fold],
            clf_scores['test_precision_macro'][fold],
            clf_scores['test_recall_micro'][fold],
            clf_scores['test_precision_micro'][fold],
            clf_scores['test_recall_weighted'][fold],
            clf_scores['test_precision_weighted'][fold],
            clf_scores['test_f1_weighted'][fold],
            clf_scores['test_f1_macro'][fold],
            clf_scores['test_f1_micro'][fold], "None", "None", "None", "None",
            remove_stopwords, resample
        ]
    return df_classification_report
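
convert_to_embeddings is not defined in this snippet; below is a plausible minimal implementation consistent with how it is called above (a hypothetical helper, named to match the call):

import numpy as np
from flair.data import Sentence

def convert_to_embeddings(texts, document_embeddings):
    """Embed each utterance and stack the document vectors into a 2-D array."""
    vectors = []
    for text in texts:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        vectors.append(sentence.get_embedding().cpu().detach().numpy())
    return np.stack(vectors)
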
Example #20
    if cmd_args.emb:
        embeddings = list()
        for type_emb in cmd_args.emb.split(":"):
            if type_emb == 'flair':
                embeddings.append(FlairEmbeddings('spanish-forward-fast'))
                embeddings.append(FlairEmbeddings('spanish-backward-fast'))
            elif type_emb == 'bert':
                embeddings.append(BertEmbeddings('bert-base-multilingual-cased'))
            elif type_emb == 'glove':
                embeddings.append(WordEmbeddings('../../../../Data/Models/Glove/glove-sbwc_spanish.i25.gensim.vec'))
            elif type_emb == 'word2vec':
                embeddings.append(WordEmbeddings('../../../../Data/Models/Word2Vec/Spanish_CoNLL17/w2v_es_conll17.gensim.vec'))
            elif type_emb == 'elmo':
                embeddings.append(ELMoEmbeddings('../../../../Data/Models/Elmo/Spanish_CoNLL17/'))
            elif type_emb == 'bpe':
                embeddings.append(BytePairEmbeddings(language='es'))
            elif type_emb == 'wiki':
                embeddings.append(WordEmbeddings('../../../../Data/Models/FastText/wiki.es.gensim.vec'))
            elif type_emb == 'chars':
                embeddings.append(WordEmbeddings('../../../../Data/Models/Chars/lemma_lowercased_estenten11_freeling_v4_virt.gensim.vec'))
            else:
                print('ERROR: embedding type not accepted: ' + cmd_args.emb + '. Options: flair, bert, glove, word2vec, elmo, bpe, wiki, chars')
                exit()

        prefix_model_output_dir = '_'.join(cmd_args.emb.split(":"))
        if cmd_args.pooling != 'mean':
            prefix_model_output_dir += '_' + cmd_args.pooling
        document_embeddings = DocumentPoolEmbeddings(embeddings, pooling=cmd_args.pooling, fine_tune_mode='linear')

    if cmd_args.btest:
        bTestPhase = True
Example #21
    import logging
    from collections import defaultdict
    from torch.utils.data.sampler import Sampler
    import random, torch
    from flair.data import FlairDataset

    corpus: flair.data.Corpus = flair.datasets.ClassificationCorpus(
        Path(os.path.join(path2[i])),
        test_file='test_.tsv',
        dev_file='dev.tsv',
        train_file='train.tsv')
    # way to select language model
    model_selector = {
        "Glove": [WordEmbeddings('glove')],
        "FastText": [WordEmbeddings('en-news')],
        "BPE": [BytePairEmbeddings('en')],
        "FlairFast": [
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ],
        "FlairNews":
        [FlairEmbeddings('news-forward'),
         FlairEmbeddings('news-backward')],
        "ElmoOriginal": [ELMoEmbeddings('original')],
        'Bert': [BertEmbeddings('bert-large-uncased')],
        'BertLS': [
            BertEmbeddings(bert_model_or_path='bert-large-uncased',
                           layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                           "15,16,17,18,19,20,21,22,23,24",
                           use_scalar_mix=True)
        ],
Example #22
    buffer.readinto(content)
    return content.decode("utf-8")


embeddingList = []
for i in range(1, len(sys.argv)):
    arg = sys.argv[i]
    typeAndPath = arg.split(":")
    embType = typeAndPath[0]  # avoid shadowing the built-in 'type'
    path = typeAndPath[1]
    if embType == "word":
        embeddingList.append(WordEmbeddings(path))
    if embType == "char":
        embeddingList.append(CharacterEmbeddings(path))
    if embType == "bytepair":
        embeddingList.append(BytePairEmbeddings(path))
    if embType == "flair":
        embeddingList.append(FlairEmbeddings(path))
    if embType == "bert":
        embeddingList.append(BertEmbeddings(path))
    if embType == "elmo":
        embeddingList.append(ELMoEmbeddings(path))
if len(embeddingList) > 1:
    embeddings = StackedEmbeddings(embeddings=embeddingList)
else:
    embeddings = embeddingList[0]

stdbuffer = sys.stdin.buffer
print("Script is ready")
while True:
    line = decodeString(stdbuffer)
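
The loop above implements a 'type:path' convention for the script's command-line arguments. A compact sketch of the same convention in isolation, assuming flair is installed (the spec strings are illustrative):

from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

specs = ['word:glove', 'flair:news-forward']  # e.g. taken from sys.argv[1:]
loaders = {'word': WordEmbeddings, 'flair': FlairEmbeddings}
stack = [loaders[kind](path) for kind, path in (s.split(':', 1) for s in specs)]
embeddings = StackedEmbeddings(embeddings=stack) if len(stack) > 1 else stack[0]
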
Example #23
 def __init__(self):
     # initialize the word embeddings (note: 'pt' loads Portuguese fastText vectors, not GloVe)
     self.glove_embedding = WordEmbeddings('pt')
     self.bpe_embedding = BytePairEmbeddings('pt')
Example #24
import flair
import torch
from typing import List

from flair.datasets import WIKINER_GERMAN
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, \
    BytePairEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

flair.device = torch.device("cuda:0")

corpus = WIKINER_GERMAN(in_memory=False)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de-wiki'),
    BytePairEmbeddings('de', 100, 5000),
    CharacterEmbeddings(),
    FlairEmbeddings('de-forward'),
    FlairEmbeddings('de-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True
)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
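
After trainer.train(...) finishes, the saved tagger is usually reloaded for inference. A hedged sketch (the checkpoint path depends on the base directory passed to trainer.train and is illustrative here):

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load('resources/taggers/wikiner-de/best-model.pt')
sentence = Sentence('George Washington ging nach Washington.')
tagger.predict(sentence)
for entity in sentence.get_spans('ner'):
    print(entity)
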
Example #25
        1: "ner"
    },
    tag_to_bioes="ner",
    skip_first_line=True)
print(corpus)

# 2. what tag do we want to predict?
tag_type = "ner"

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    BytePairEmbeddings(language="de", dim=300, syllables=200000)
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
Example #26
                              train_file='datatrain.txt',
                              test_file='datatest.txt',
                              dev_file='datadev.txt')
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    #WordEmbeddings('glove'),
    BytePairEmbeddings('en')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer
Example #27
def train_model():
    #global corpus
    # define columns
    columns = {0: "text", 1: "ner"}
    #columns = {0: "text", 1: "pos", 2: "ner"}
    #columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
    data_folder = training_path
    print("data folder path", data_folder)
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train.txt',
                                  dev_file='dev.txt',
                                  test_file='test.txt')

    max_tokens = 250
    corpus._train = [x for x in corpus.train if len(x) < max_tokens]
    corpus._dev = [x for x in corpus.dev if len(x) < max_tokens]
    corpus._test = [x for x in corpus.test if len(x) < max_tokens]

    print("Finished data standardization.........")

    # # 1. get the corpus
    # corpus: Corpus = WIKINER_ENGLISH().downsample(0.1)
    # print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)
    #print("path is",f'{data_folder}/albert-base-v2')
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [

        # WordEmbeddings('/home/Balaram_bhukya/PycharmProjects/Flair_NER/nerData/wordembeddings/FT.50D.gensim'),
        #WordEmbeddings('glove'),

        # comment in this line to use character embeddings
        #CharacterEmbeddings(),
        BytePairEmbeddings('en'),
        #TransformerXLEmbeddings(),
        # comment in these lines to use flair embeddings
        FlairEmbeddings('news-forward'),
        #FlairEmbeddings('news-forward-fast',pooling='min')
        FlairEmbeddings('news-backward')
        #ELMoEmbeddings()
        #BertEmbeddings(bert_model_or_path=f'{data_folder}/albert-base-v2')
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    return trainer, corpus