def build_model(vocab: Vocabulary,
                args,
                **kwargs) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200

    if args.pretrained_WE_path:
        # turn the tokens into EMBED_DIMS-dim embeddings (initialized from the pretrained
        # word-embedding file), then turn the embeddings into encodings
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size,
                                 pretrained_file=args.pretrained_WE_path, vocab=vocab)})

    else:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5),
                         num_filters=5)  # note: num_filters is per ngram size, i.e. we get this many filters for EACH ngram filter size

    # encoder = BertPooler("bert-base-cased")
    # the output dim is num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg),
                   ("encoder", l2_reg),
                   ("classifier", l2_reg)
                   ]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
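
A quick way to confirm the output-dimension comment above (a minimal sketch, not part of the original example; it assumes the same AllenNLP CnnEncoder import used in these snippets):

from allennlp.modules.seq2vec_encoders import CnnEncoder

_enc = CnnEncoder(embedding_dim=200, ngram_filter_sizes=(2, 3, 5), num_filters=5)
assert _enc.get_output_dim() == 5 * 3  # num_filters * len(ngram_filter_sizes) = 15
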
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # turn the tokens into 300 dim embedding. Then, turn the embeddings into encodings
    embedder = BasicTextFieldEmbedder({
        "tokens":
        Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS,
        ngram_filter_sizes=(2, 3, 4, 5),
        num_filters=5
    )  # note: num_filters is per ngram size, i.e. we get this many filters for EACH ngram filter size
    # encoder = BertPooler("bert-base-cased")
    # the output dim is num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder,
                                    regularizer_applicator)
Example #3
 def test_l2_regularization(self):
     model = torch.nn.Sequential(
             torch.nn.Linear(5, 10),
             torch.nn.Linear(10, 5)
     )
     initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 0.5))])
     initializer(model)
     value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
     assert value.data.numpy() == 28.75
Example #4
 def test_regularizer_applicator_respects_regex_matching(self):
     model = torch.nn.Sequential(
             torch.nn.Linear(5, 10),
             torch.nn.Linear(10, 5)
     )
     initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 1.))])
     initializer(model)
     value = RegularizerApplicator([("weight", L2Regularizer(0.5)),
                                    ("bias", L1Regularizer(1.0))])(model)
     assert value.data.numpy() == 65.0
Example #5
 def test_l1_regularization(self):
     model = torch.nn.Sequential(
             torch.nn.Linear(5, 10),
             torch.nn.Linear(10, 5)
     )
     initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, -1))])
     initializer(model)
     value = RegularizerApplicator([("", L1Regularizer(1.0))])(model)
     # 115 because of biases.
     assert value.data.numpy() == 115.0
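
The asserted values follow directly from the parameter counts: Linear(5, 10) has 50 weights and 10 biases, and Linear(10, 5) has 50 weights and 5 biases, i.e. 100 weight entries and 15 bias entries (115 parameters in total). With every entry set to 0.5, the L2 penalty is 1.0 * 0.5^2 * 115 = 28.75; with every entry 1.0, the regex case gives 0.5 * 1.0^2 * 100 + 1.0 * |1.0| * 15 = 65.0; and with every entry -1, the L1 penalty is 1.0 * |-1| * 115 = 115.0.
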
 def test_l2_regularization(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": 0.5
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
     assert value.data.numpy() == 28.75
 def test_regularizer_applicator_respects_regex_matching(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": 1.
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     value = RegularizerApplicator([("weight", L2Regularizer(0.5)),
                                    ("bias", L1Regularizer(1.0))])(model)
     assert value.data.numpy() == 65.0
 def test_l1_regularization(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": -1
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     value = RegularizerApplicator([("", L1Regularizer(1.0))])(model)
     # 115 because of biases.
     assert value.data.numpy() == 115.0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SentenceClassifier':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        question_encoder = Seq2VecEncoder.from_params(params.pop("question_encoder"))

        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   question_encoder=question_encoder,
                   initializer=initializer,
                   regularizer=regularizer)
Example #10
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ToxicModel':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        encoder = Seq2VecEncoder.from_params(params.pop("encoder"))
        classifier_feedforward = FeedForward.from_params(params.pop("classifier_feedforward"))

        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   encoder=encoder,
                   classifier_feedforward=classifier_feedforward,
                   initializer=initializer,
                   regularizer=regularizer)
Example #11
    def test_from_params(self):
        params = Params({"regularizers": [("conv", "l1"), ("linear", {"type": "l2", "alpha": 10})]})
        regularizer_applicator = RegularizerApplicator.from_params(params.pop("regularizers"))
        regularizers = regularizer_applicator._regularizers  # pylint: disable=protected-access

        conv = linear = None
        for regex, regularizer in regularizers:
            if regex == "conv":
                conv = regularizer
            elif regex == "linear":
                linear = regularizer

        assert isinstance(conv, L1Regularizer)
        assert isinstance(linear, L2Regularizer)
        assert linear.alpha == 10
Example #12
 def test_frozen_params(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": -1
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     # freeze the parameters of the first linear
     for name, param in model.named_parameters():
         if re.search(r"0.*$", name):
             param.requires_grad = False
     value = RegularizerApplicator([("", L1Regularizer(1.0))])(model)
     # 55 = 10*5 weights + 5 biases of the second (unfrozen) Linear
     assert value.data.numpy() == 55
Example #13
def build_model_Transformer(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # embed the tokens with a pretrained transformer, then pool them into a single encoding;
    # the output dim is the transformer's hidden size (e.g. 768 for a base-sized BERT)
    embedder = PretrainedTransformerEmbedder(BERT_MODEL_NAME)
    encoder = BertPooler(BERT_MODEL_NAME)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder,
                               regularizer_applicator)
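
For the transformer variant, the embedding and pooling dimensions come from the pretrained model's configuration rather than from EMBED_DIMS. A minimal check (a sketch, assuming BERT_MODEL_NAME is a base-sized model such as "bert-base-cased"; it downloads the pretrained weights on first use):

from allennlp.modules.seq2vec_encoders import BertPooler
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

assert PretrainedTransformerEmbedder("bert-base-cased").get_output_dim() == 768
assert BertPooler("bert-base-cased").get_output_dim() == 768
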
Example #14
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2VecEncoder,
        classifier_feedforward: FeedForward,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = RegularizerApplicator()
    ) -> None:

        super().__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size('labels')
        self.encoder = encoder
        self.classifier_feedforward = classifier_feedforward
        self.loss = torch.nn.BCEWithLogitsLoss()
        #self.loss = torch.nn.MultiLabelMarginLoss(reduction='sum')
        self.f1 = MultiLabelF1Measure()
        self.labels = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]

        initializer(self)
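
The regularizer passed to super().__init__ is not applied inside forward(); AllenNLP's Model exposes it via get_regularization_penalty(), and the trainer adds that penalty to the loss. A rough sketch of the interaction (model and output_dict are hypothetical stand-ins; the exact return type of get_regularization_penalty differs between AllenNLP versions):

penalty = model.get_regularization_penalty()  # sum of all matching regularizer terms
loss = output_dict["loss"]
if penalty is not None:
    loss = loss + penalty  # what the trainer adds for each batch
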
Example #15
    def train(self, args_hpo, index):
        """
        trains the model, and return the metrics to the meta optimizer.
        :param args_hpo:
        :param index:
        :return:
        """
        PrintColors.prYellow('\n===== training with: {}'.format(args_hpo))
        PrintColors.prGreen('----- in {} mode -----'.format('train'))
        ''' ============ LOAD DATA ================================================================================ '''
        starting_time = time.time()
        lm_dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
        train_data, val_data = (lm_dataset_reader.read(folder) for folder in
                                [_train_data_path, _val_data_path])
        lm_vocabulary = Vocabulary.from_instances(train_data + val_data)
        iterator = BasicIterator(batch_size=args_hpo.batch_size)
        iterator.index_with(lm_vocabulary)
        ''' ============ DEFINE MODEL ============================================================================= '''
        '''
        the Params class 'pops' its parameters, i.e. they disappear after first use, so we
        instantiate a fresh Params object for each model-defining call. Params also turns
        dicts into MutableMappings and consumes the original dict, hence the deepcopy below.
        '''
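        # e.g. (illustration only, not part of the training flow):
        #   p = Params({"embedding_dim": 300}); p.pop("embedding_dim")  # -> 300
        #   p.pop("embedding_dim")  # now raises ConfigurationError: the key was consumed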
        token_embedding = Embedding.from_params(vocab=lm_vocabulary,
                                                params=Params(copy.deepcopy(GLOBAL_CONSTANTS.GLOVE_PARAMS_CONFIG)))

        token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding})
        ''' define encoder to wrap up an lstm feature extractor '''
        contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_size=args_hpo.word_embedding_size,
                          hidden_size=args_hpo.ed_ncoder_size,
                          bidirectional=False, batch_first=True))

        model = LanguageModel(vocab=lm_vocabulary,
                              text_field_embedder=token_embedder,
                              contextualizer=contextualizer,
                              dropout=args_hpo.dropout,
                              regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=args_hpo.l2))]),
                              )\
            .cuda(_device)

        ''' ============ TRAIN ================================================================================ '''
        '''  callbacks  '''
        if index == 0:
            for file in os.listdir(os.path.join(*['.', 'lm_models'])):
                path = os.path.join(*['.', 'lm_models', file])
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path)
        serialization_path = 'models_lm_{}_{}'.format(_tag, index)
        serialization_path_longer = os.path.join(*['.', 'lm_models', serialization_path])
        vocab_path = 'vocab_lm_{}_{}'.format(_tag, index)
        vocab_dir_longer = os.path.join(*['.', 'lm_models', vocab_path])
        if not os.path.exists(serialization_path_longer):
            os.mkdir(serialization_path_longer)
        callbacks = list()
        ''' for validation '''
        callbacks.append(validate.Validate(validation_data=val_data, validation_iterator=iterator))
        ''' for early stopping. it tracks 'loss' returned by model.forward() '''
        callbacks.append(track_metrics.TrackMetrics(patience=3))
        ''' for grad clipping '''
        callbacks.append(gradient_norm_and_clip.GradientNormAndClip(grad_clipping=args_hpo.clip))
        ''' 
            for checkpointing
            TODO: NOTE:serialization path CANNOT exist before training ??
        '''
        model_checkpointer = checkpointer.Checkpointer(serialization_dir=serialization_path_longer,
                                                       num_serialized_models_to_keep=1)
        callbacks.append(checkpoint.Checkpoint(checkpointer=model_checkpointer))
        ''' for sample generations '''

        callback_trainer = CallbackTrainer(
            model=model,
            training_data=train_data,
            iterator=iterator,
            optimizer=torch.optim.Adam(model.parameters(), lr=args_hpo.lr),
            num_epochs=_n_epochs,
            serialization_dir=serialization_path_longer,
            cuda_device=_device,
            callbacks=callbacks
        )

        ''' trainer saves the model, but the vocabulary needs to be saved, too '''
        lm_vocabulary.save_to_files(vocab_dir_longer)

        ''' check the metric names to synchronize with the class '''
        metrics = callback_trainer.train()
        metrics['time_consumed(hrs)'] = round((time.time() - starting_time) / 3600, 4)

        return metrics
Example #16
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 3)
        self.linear2 = torch.nn.Linear(3, 2)
        self.conv = torch.nn.Conv1d(2, 2, 2)

    def forward(self, inputs):
        pass


print("Using individual regularizers:")
model = Net()
init_const = ConstantInitializer(val=10.0)
init_const(model.linear1.weight)
init_const(model.linear2.weight)

l1_regularizer = L1Regularizer(alpha=0.01)
print(l1_regularizer(model.linear1.weight))  # 0.01 * 10 * 6 = 0.6

l2_regularizer = L2Regularizer(alpha=0.01)
print(l2_regularizer(model.linear2.weight))  # 0.01 * (10)^2 * 6

print("Using an applicator:")
applicator = RegularizerApplicator(regexes=[
    ("linear1.weight", L1Regularizer(alpha=0.01)),
    ("linear2.weight", L2Regularizer()),
])
print(applicator(model))  # 0.6 + 6
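
Hand check of the applicator output: only parameters whose names match a regex are penalized, so conv.* and the bias vectors contribute nothing. linear1.weight has 3*2 entries of 10.0, giving an L1 term of 0.01 * 10 * 6 = 0.6, and linear2.weight has 2*3 entries of 10.0 with L2Regularizer's default alpha of 0.01, giving 0.01 * 10^2 * 6 = 6.0, for a total of 6.6.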

def train_valid_base_text_decision_fix_text_features_model(
        model_name: str,
        single_round_label: bool,
        use_only_prev_round: bool,
        train_data_file_name: str,
        validation_data_file_name: str,
        no_history: bool = False,
        func_batch_size: int = 9,
        numbers_columns: list = None,
        add_numeric_data: bool = True):
    """
    This function train and validate model that use fix texts features only.
    :param: model_name: the full model name
    :param single_round_label: the label to use: single round of total payoff
    :param use_only_prev_round: if to use all the history or only the previous round
    :param train_data_file_name: the name of the train_data to use
    :param validation_data_file_name: the name of the validation_data to use
    :param no_history: if we don't want to use any history data
    :param func_batch_size: the batch size to use
    :param model_name: the name of the model we run
    :param numbers_columns: the names of the columns to use for the numeric data
    :param add_numeric_data: if we want to add numbers data
    :return:
    """

    reader = TextExpDataSetReader(add_numeric_data=add_numeric_data,
                                  use_only_prev_round=use_only_prev_round,
                                  single_round_label=single_round_label,
                                  three_losses=True,
                                  fix_text_features=True,
                                  no_history=no_history,
                                  numbers_columns_name=numbers_columns)
    train_data_file_inner_path = os.path.join(data_directory,
                                              train_data_file_name)
    validation_data_file_inner_path = os.path.join(data_directory,
                                                   validation_data_file_name)
    train_instances = reader.read(train_data_file_inner_path)
    validation_instances = reader.read(validation_data_file_inner_path)
    vocab = Vocabulary()

    # TODO: change this if necessary
    # batch_size should be 10 or 9, depending on the input,
    # and do not shuffle, so all the data of the same pair stays in the same batch
    iterator = BasicIterator(
        batch_size=func_batch_size)  # , instances_per_epoch=10)
    #  sorting_keys=[('sequence_review', 'list_num_tokens')])
    iterator.index_with(vocab)

    # the shape of the flattened data representation
    if 'bert' in train_data_file_name:  # fixed features are BERT vectors
        text_feedforward = FeedForward(input_dim=reader.max_tokens_len,
                                       num_layers=2,
                                       hidden_dims=[300, 50],
                                       activations=ReLU(),
                                       dropout=[0.0, 0.0])
        reader.max_tokens_len = 50
    else:
        text_feedforward = None
    feed_forward_input_dim = reader.max_seq_len * (reader.max_tokens_len +
                                                   reader.number_length)
    feed_forward_classification = FeedForward(input_dim=feed_forward_input_dim,
                                              num_layers=1,
                                              hidden_dims=[2],
                                              activations=LeakyReLU(),
                                              dropout=[0.3])
    criterion_classification = nn.BCEWithLogitsLoss()

    metrics_dict = {
        'Accuracy': CategoricalAccuracy()  # BooleanAccuracy(),
        # 'auc': Auc(),
        # 'F1measure': F1Measure(positive_label=1),
    }

    model = models.BasicFixTextFeaturesDecisionModel(
        vocab=vocab,
        classifier_feedforward_classification=feed_forward_classification,
        criterion_classification=criterion_classification,
        metrics_dict=metrics_dict,
        max_tokens_len=reader.max_tokens_len,
        text_feedforward=text_feedforward,
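        # the empty regex "" below matches every parameter name, so L1 is applied to all parameters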
        regularizer=RegularizerApplicator([("", L1Regularizer())]),
    )

    optimizer = optim.Adam(model.parameters(), lr=0.1)
    num_epochs = 100

    run_log_directory = utils.set_folder(
        datetime.now().strftime(
            f'{model_name}_{num_epochs}_epochs_%d_%m_%Y_%H_%M_%S'), 'logs')

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_instances,
        validation_dataset=validation_instances,
        num_epochs=num_epochs,
        shuffle=False,
        serialization_dir=run_log_directory,
        patience=10,
        histogram_interval=10,
    )

    model_dict = trainer.train()

    print(f'{model_name}: evaluation measures are:')
    for key, value in model_dict.items():
        if 'accuracy' in key:
            value = value * 100
        print(f'{key}: {value}')

    # save the model predictions
    model.predictions.to_csv(os.path.join(run_log_directory,
                                          'predictions.csv'))
Example #19
''' the language model used GloVe, but here we just build an embedder to load the trained parameters '''
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})
''' define encoder to wrap up an lstm feature extractor '''
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))
model = LanguageModel(vocab=vocabulary,
                      text_field_embedder=token_embedder,
                      contextualizer=contextualizer,
                      dropout=combination.dropout,
                      regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
                      ) \
    .cuda(device)
model.load_state_dict(torch.load(open(language_model_path, 'rb')), strict=True)
dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model,
                                     dataset_reader=dataset_reader)
val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]
def build_model(
        vocab, embed_dim: int = 100,
        hid_dim: int = 100,
        min_dec_step: int = 2,
        max_decoding_steps: int = 3,
        fix_edu_num: int = -1,
        use_elmo: bool = False,
        dropout=0.5,
        dropout_emb=0.2, span_encoder_type='self_attentive',
        attn_type='dot',
        schedule_ratio_from_ground_truth=0.7,
        pretrain_embedding=None,
        nenc_lay: int = 1,
        mult_orac_sampling: bool = True,
        compression: bool = True,
        word_token_indexers=None,
        alpha: float = 1.0,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        keep_threshold: float = 0.5,
        weight_alpha=0.0,
        bias_alpha=0.0,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        compress_leadn=-1,
        gather='mean',
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name="",
        load_save_model: str = None
):
    model = Seq2IdxSum(
        vocab=vocab,
        word_embedding_dim=embed_dim,
        hidden_dim=hid_dim, min_dec_step=min_dec_step,
        max_decoding_steps=max_decoding_steps,
        fix_edu_num=fix_edu_num,
        use_elmo=use_elmo, span_encoder_type=span_encoder_type,
        dropout=dropout, dropout_emb=dropout_emb,
        attn_type=attn_type,
        schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
        pretrain_embedding_file=pretrain_embedding,
        nenc_lay=nenc_lay,
        mult_orac_sampling=mult_orac_sampling,
        word_token_indexers=word_token_indexers,
        compression=compression, alpha=alpha,
        dbg=dbg,
        dec_avd_trigram_rep=dec_avd_trigram_rep,
        aggressive_compression=aggressive_compression,
        keep_threshold=keep_threshold,
        regularizer=RegularizerApplicator([("weight", L2Regularizer(weight_alpha)),
                                           ("bias", L1Regularizer(bias_alpha))]),
        abs_board_file=abs_board_file,
        gather=gather,
        compress_leadn=compress_leadn,
        abs_dir_root=abs_dir_root,
        serilization_name=serilization_name
    )
    if load_save_model:
        model.load_state_dict(torch.load(load_save_model, map_location=get_device()))
    #     e.g. model.load_state_dict(torch.load("/path/to/model/weights.th"))

    # model = torch.nn.DataParallel(model)
    device = get_device()
    model = model.to(device)
    return model
Example #21
def run_training_loop():
    tokenizer = BERTTokenizer(vocab_file='/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/My-pipeline/allennlp_tutorial/BertTokenizer/vocab.txt')
    # tokenizer = BERTTokenizer('bert-base-multilingual-cased') # same as above

    # Try to use ELMo
    # tokenindexer = ELMoTokenCharactersIndexer()
    # elmo_tokens = tokenindexer.tokens_to_indices([Token("happy")], None)
    # print(len(elmo_tokens["elmo_tokens"][0]), elmo_tokens)

    # Try to use BERT
    # tokenizer = PretrainedTransformerTokenizer(
    #     model_name="bert-base-multilingual-cased",
    #     add_special_tokens=True,
    #     max_length=512
    # )
    # token_indexer = PretrainedTransformerIndexer(
    #     model_name="bert-base-multilingual-cased",
    #     max_length=512,
    # )

    cached_directory = None # "cached_dir"
    dataset_reader = ClassificationTsvReader(tokenizer=tokenizer, cache_directory=cached_directory)
    print("Reading data")
    train_data = dataset_reader.read(file_path='/Users/tianhongzxy/Downloads/contradictory-my-dear-watson/train.txt')
    pretrained_files = None # {"tokens": "/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/embedding/glove.6B.300d.txt"}
    cuda_device = -1
    batch_size = 8
    vocab = build_vocab(train_data, pretrained_files=pretrained_files, include_full_pretrained_words=False)
    init_uniform = XavierUniformInitializer()
    # init_uniform(model.embedder.token_embedder_tokens.weight)
    init_const = ConstantInitializer(val=0)
    # init_const(model.classifier.bias)
    init_normal = NormalInitializer(mean=0., std=1.)
    # init_normal(model.classifier.weight)
    applicator = InitializerApplicator(
        regexes=[
            ('embedder.*', init_uniform),
            ('classifier.*weight', init_normal),
            ('classifier.*bias', init_const)
        ]
    )
    regularizer = RegularizerApplicator(
        regexes=[
            ('embedder.*', L2Regularizer(alpha=1e-3)),
            ('classifier.*weight', L2Regularizer(alpha=1e-3)),
            # ('classifier.*bias', L1Regularizer(alpha=1e-2))  # do not regularize the bias, otherwise the model tends to underfit
        ]
    )
    model = build_model(vocab,
                        embedding_dim=10,
                        pretrained_file=None, # pretrained_files["tokens"]
                        initializer=applicator,
                        regularizer=regularizer
                        )
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    # split train data into train & dev data
    from allennlp.data.dataset_readers import AllennlpDataset
    print('origin train data size: ', len(train_data))
    train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=20020206)
    assert type(train_data[0]) == type(dev_data[0]) == Instance
    train_data, dev_data = AllennlpDataset(train_data), AllennlpDataset(dev_data)
    print('train data size: ', len(train_data), 'dev data size', len(dev_data))
    assert type(train_data) == type(dev_data) == AllennlpDataset
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    train_loader, dev_loader = build_data_loaders(train_data=train_data,
                                                  dev_data=dev_data,
                                                  batch_size=batch_size)

    with tempfile.TemporaryDirectory() as serialization_dir:
        # serialization_dir = 'temp_dir/'
        trainer = build_trainer(
            model=model,
            serialization_dir=serialization_dir,
            train_loader=train_loader,
            dev_loader=dev_loader,
            num_epochs=5,
            cuda_device=cuda_device,
            patience=5
        )
        print("Starting training")
        trainer.train()
        print("Finished training")
        # Evaluate model on test data
        # print("Starting testing")
        # test_data = dataset_reader.read('test.txt')
        # test_data.index_with(vocab)
        # data_loader = DataLoader(test_data, batch_size=batch_size)
        # results = evaluate(model, data_loader, cuda_device=cuda_device)
        # print('Test results: ', results)

    # outputs = model.forward_on_instances(instances)
    # print(outputs)
    return model, dataset_reader