def build_model(vocab: Vocabulary,
                args,
                **kwargs) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200

    if args.pretrained_WE_path:
        # turn the tokens into EMBED_DIMS-dim embeddings initialized from the pretrained file,
        # then turn the embeddings into encodings
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size,
                                 pretrained_file=args.pretrained_WE_path, vocab=vocab)})
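        # note: the vectors in the pretrained file are expected to have EMBED_DIMS (200 here) dimensions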

    else:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5),
                         num_filters=5)  # num_filters is a tad dangerous: we get this many filters for EACH ngram filter size

    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)
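    # e.g. with num_filters=5 and ngram_filter_sizes=(2, 3, 5), encoder.get_output_dim() == 5 * 3 == 15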

    # construct the regularizer applicator
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg),
                   ("encoder", l2_reg),
                   ("classifier", l2_reg)
                   ]
        regularizer_applicator = RegularizerApplicator(regexes)
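        # each (regex, regularizer) pair penalizes every parameter whose name matches the regex;
        # in AllenNLP the trainer adds model.get_regularization_penalty() (this applicator applied to the model) to the loss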

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)

def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # turn the tokens into 300-dim embeddings, then turn the embeddings into encodings
    embedder = BasicTextFieldEmbedder({
        "tokens":
        Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS,
        ngram_filter_sizes=(2, 3, 4, 5),
        num_filters=5
    )  # num_filters is a tad dangerous: we get this many filters for EACH ngram filter size
    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)
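    # e.g. with num_filters=5 and ngram_filter_sizes=(2, 3, 4, 5), encoder.get_output_dim() == 5 * 4 == 20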

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder,
                                    regularizer_applicator)
Example #3
 def test_l2_regularization(self):
     model = torch.nn.Sequential(
             torch.nn.Linear(5, 10),
             torch.nn.Linear(10, 5)
     )
     initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 0.5))])
     initializer(model)
     value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
     assert value.data.numpy() == 28.75  # 1.0 * (115 params * 0.5**2) = 28.75
Example #4
 def test_regularizer_applicator_respects_regex_matching(self):
     model = torch.nn.Sequential(
             torch.nn.Linear(5, 10),
             torch.nn.Linear(10, 5)
     )
     initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 1.))])
     initializer(model)
     value = RegularizerApplicator([("weight", L2Regularizer(0.5)),
                                    ("bias", L1Regularizer(1.0))])(model)
     assert value.data.numpy() == 65.0  # weights: 0.5 * 100 * 1**2 = 50; biases: 1.0 * 15 * |1| = 15
 def test_l2_regularization(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": 0.5
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
     assert value.data.numpy() == 28.75  # 1.0 * (115 params * 0.5**2) = 28.75
 def test_regularizer_applicator_respects_regex_matching(self):
     model = torch.nn.Sequential(torch.nn.Linear(5, 10),
                                 torch.nn.Linear(10, 5))
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": 1.
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(model)
     value = RegularizerApplicator([("weight", L2Regularizer(0.5)),
                                    ("bias", L1Regularizer(1.0))])(model)
     assert value.data.numpy() == 65.0  # weights: 0.5 * 100 * 1**2 = 50; biases: 1.0 * 15 * |1| = 15
Example #7
def build_model_Transformer(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # embed the tokens with a pretrained transformer, then pool the embeddings into a single encoding
    embedder = PretrainedTransformerEmbedder(BERT_MODEL_NAME)
    encoder = BertPooler(
        BERT_MODEL_NAME
    )  # pools the [CLS] token; the output dim is the transformer's hidden size (e.g. 768 for bert-base models)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder,
                               regularizer_applicator)
Example #8
def run_training_loop():
    tokenizer = BERTTokenizer(vocab_file='/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/My-pipeline/allennlp_tutorial/BertTokenizer/vocab.txt')
    # tokenizer = BERTTokenizer('bert-base-multilingual-cased') # same as above

    # Try to use ELMo
    # tokenindexer = ELMoTokenCharactersIndexer()
    # elmo_tokens = tokenindexer.tokens_to_indices([Token("happy")], None)
    # print(len(elmo_tokens["elmo_tokens"][0]), elmo_tokens)

    # Try to use BERT
    # tokenizer = PretrainedTransformerTokenizer(
    #     model_name="bert-base-multilingual-cased",
    #     add_special_tokens=True,
    #     max_length=512
    # )
    # token_indexer = PretrainedTransformerIndexer(
    #     model_name="bert-base-multilingual-cased",
    #     max_length=512,
    # )

    cached_directory = None # "cached_dir"
    dataset_reader = ClassificationTsvReader(tokenizer=tokenizer, cache_directory=cached_directory)
    print("Reading data")
    train_data = dataset_reader.read(file_path='/Users/tianhongzxy/Downloads/contradictory-my-dear-watson/train.txt')
    pretrained_files = None # {"tokens": "/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/embedding/glove.6B.300d.txt"}
    cuda_device = -1
    batch_size = 8
    vocab = build_vocab(train_data, pretrained_files=pretrained_files, include_full_pretrained_words=False)
    init_uniform = XavierUniformInitializer()
    # init_uniform(model.embedder.token_embedder_tokens.weight)
    init_const = ConstantInitializer(val=0)
    # init_const(model.classifier.bias)
    init_normal = NormalInitializer(mean=0., std=1.)
    # init_normal(model.classifier.weight)
    applicator = InitializerApplicator(
        regexes=[
            ('embedder.*', init_uniform),
            ('classifier.*weight', init_normal),
            ('classifier.*bias', init_const)
        ]
    )
    regularizer = RegularizerApplicator(
        regexes=[
            ('embedder.*', L2Regularizer(alpha=1e-3)),
            ('classifier.*weight', L2Regularizer(alpha=1e-3)),
            # ('classifier.*bias', L1Regularizer(alpha=1e-2))  # don't regularize the bias, otherwise the model tends to underfit
        ]
    )
    model = build_model(vocab,
                        embedding_dim=10,
                        pretrained_file=None, # pretrained_files["tokens"]
                        initializer=applicator,
                        regularizer=regularizer
                        )
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    # split train data into train & dev data
    from allennlp.data.dataset_readers import AllennlpDataset
    print('origin train data size: ', len(train_data))
    train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=20020206)
    assert type(train_data[0]) == type(dev_data[0]) == Instance
    train_data, dev_data = AllennlpDataset(train_data), AllennlpDataset(dev_data)
    print('train data size: ', len(train_data), 'dev data size', len(dev_data))
    assert type(train_data) == type(dev_data) == AllennlpDataset
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    train_loader, dev_loader = build_data_loaders(train_data=train_data,
                                                  dev_data=dev_data,
                                                  batch_size=batch_size)

    with tempfile.TemporaryDirectory() as serialization_dir:
        # serialization_dir = 'temp_dir/'
        trainer = build_trainer(
            model=model,
            serialization_dir=serialization_dir,
            train_loader=train_loader,
            dev_loader=dev_loader,
            num_epochs=5,
            cuda_device=cuda_device,
            patience=5
        )
        print("Starting training")
        trainer.train()
        print("Finished training")
        # Evaluate model on test data
        # print("Starting testing")
        # test_data = dataset_reader.read('test.txt')
        # test_data.index_with(vocab)
        # data_loader = DataLoader(test_data, batch_size=batch_size)
        # results = evaluate(model, data_loader, cuda_device=cuda_device)
        # print('Test results: ', results)

    # outputs = model.forward_on_instances(instances)
    # print(outputs)
    return model, dataset_reader
Example #9
    def train(self, args_hpo, index):
        """
        trains the model, and return the metrics to the meta optimizer.
        :param args_hpo:
        :param index:
        :return:
        """
        PrintColors.prYellow('\n===== training with: {}'.format(args_hpo))
        PrintColors.prGreen('----- in {} mode -----'.format('train'))
        ''' ============ LOAD DATA ================================================================================ '''
        starting_time = time.time()
        lm_dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
        train_data, val_data = (lm_dataset_reader.read(folder) for folder in
                                [_train_data_path, _val_data_path])
        lm_vocabulary = Vocabulary.from_instances(train_data + val_data)
        iterator = BasicIterator(batch_size=args_hpo.batch_size)
        iterator.index_with(lm_vocabulary)
        ''' ============ DEFINE MODEL ============================================================================= '''
        '''
        the Params class 'pops' its parameters, i.e. they disappear after first use, so we instantiate a fresh
        Params instance for each model-defining execution. More than that, Params turns dicts into MutableMappings
        and destroys the original dict. So here's your copy, allennlp. Thanks. (I still love you)
        '''
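        # e.g. Embedding.from_params(...) below pops entries such as 'embedding_dim' out of the Params object
        # while building the layer, so reusing the same (un-copied) Params for a second model would fail;
        # hence the copy.deepcopy.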
        token_embedding = Embedding.from_params(vocab=lm_vocabulary,
                                                params=Params(copy.deepcopy(GLOBAL_CONSTANTS.GLOVE_PARAMS_CONFIG)))

        token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding})
        ''' define encoder to wrap up an lstm feature extractor '''
        contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_size=args_hpo.word_embedding_size,
                          hidden_size=args_hpo.ed_ncoder_size,
                          bidirectional=False, batch_first=True))

        model = LanguageModel(vocab=lm_vocabulary,
                              text_field_embedder=token_embedder,
                              contextualizer=contextualizer,
                              dropout=args_hpo.dropout,
                              regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=args_hpo.l2))]),
                              )\
            .cuda(_device)

        ''' ============ TRAIN ================================================================================ '''
        '''  callbacks  '''
        if index == 0:
            for file in os.listdir(os.path.join(*['.', 'lm_models'])):
                path = os.path.join(*['.', 'lm_models', file])
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path)
        serialization_path = 'models_lm_{}_{}'.format(_tag, index)
        serialization_path_longer = os.path.join(*['.', 'lm_models', serialization_path])
        vocab_path = 'vocab_lm_{}_{}'.format(_tag, index)
        vocab_dir_longer = os.path.join(*['.', 'lm_models', vocab_path])
        if not os.path.exists(serialization_path_longer):
            os.mkdir(serialization_path_longer)
        callbacks = list()
        ''' for validation '''
        callbacks.append(validate.Validate(validation_data=val_data, validation_iterator=iterator))
        ''' for early stopping. it tracks 'loss' returned by model.forward() '''
        callbacks.append(track_metrics.TrackMetrics(patience=3))
        ''' for grad clipping '''
        callbacks.append(gradient_norm_and_clip.GradientNormAndClip(grad_clipping=args_hpo.clip))
        ''' 
            for checkpointing
            TODO / NOTE: serialization path CANNOT exist before training??
        '''
        model_checkpointer = checkpointer.Checkpointer(serialization_dir=serialization_path_longer,
                                                       num_serialized_models_to_keep=1)
        callbacks.append(checkpoint.Checkpoint(checkpointer=model_checkpointer))
        ''' for sample generations '''

        callback_trainer = CallbackTrainer(
            model=model,
            training_data=train_data,
            iterator=iterator,
            optimizer=torch.optim.Adam(model.parameters(), lr=args_hpo.lr),
            num_epochs=_n_epochs,
            serialization_dir=serialization_path_longer,
            cuda_device=_device,
            callbacks=callbacks
        )

        ''' trainer saves the model, but the vocabulary needs to be saved, too '''
        lm_vocabulary.save_to_files(vocab_dir_longer)

        ''' check the metric names to synchronize with the class '''
        metrics = callback_trainer.train()
        metrics['time_consumed(hrs)'] = round((time.time() - starting_time) / 3600, 4)

        return metrics
def build_model(
        vocab, embed_dim: int = 100,
        hid_dim: int = 100,
        min_dec_step: int = 2,
        max_decoding_steps: int = 3,
        fix_edu_num: int = -1,
        use_elmo: bool = False,
        dropout=0.5,
        dropout_emb=0.2, span_encoder_type='self_attentive',
        attn_type='dot',
        schedule_ratio_from_ground_truth=0.7,
        pretrain_embedding=None,
        nenc_lay: int = 1,
        mult_orac_sampling: bool = True,
        compression: bool = True,
        word_token_indexers=None,
        alpha: float = 1.0,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        keep_threshold: float = 0.5,
        weight_alpha=0.0,
        bias_alpha=0.0,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        compress_leadn=-1,
        gather='mean',
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name="",
        load_save_model: str = None
):
    model = Seq2IdxSum(
        vocab=vocab,
        word_embedding_dim=embed_dim,
        hidden_dim=hid_dim, min_dec_step=min_dec_step,
        max_decoding_steps=max_decoding_steps,
        fix_edu_num=fix_edu_num,
        use_elmo=use_elmo, span_encoder_type=span_encoder_type,
        dropout=dropout, dropout_emb=dropout_emb,
        attn_type=attn_type,
        schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
        pretrain_embedding_file=pretrain_embedding,
        nenc_lay=nenc_lay,
        mult_orac_sampling=mult_orac_sampling,
        word_token_indexers=word_token_indexers,
        compression=compression, alpha=alpha,
        dbg=dbg,
        dec_avd_trigram_rep=dec_avd_trigram_rep,
        aggressive_compression=aggressive_compression,
        keep_threshold=keep_threshold,
        regularizer=RegularizerApplicator([("weight", L2Regularizer(weight_alpha)),
                                           ("bias", L1Regularizer(bias_alpha))]),
        abs_board_file=abs_board_file,
        gather=gather,
        compress_leadn=compress_leadn,
        abs_dir_root=abs_dir_root,
        serilization_name=serilization_name
    )
    if load_save_model:
        model.load_state_dict(torch.load(load_save_model, map_location=get_device()))
    # e.g.: model.load_state_dict(torch.load("/path/to/model/weights.th"))

    # model = torch.nn.DataParallel(model)
    device = get_device()
    model = model.to(device)
    return model
示例#11
0
''' the language model used GloVe, but here we just build an embedder to load the trained parameters '''
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})
''' define encoder to wrap up an lstm feature extractor '''
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))
model = LanguageModel(vocab=vocabulary,
                      text_field_embedder=token_embedder,
                      contextualizer=contextualizer,
                      dropout=combination.dropout,
                      regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
                      ) \
    .cuda(device)
model.load_state_dict(torch.load(open(language_model_path, 'rb')), strict=True)
dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model,
                                     dataset_reader=dataset_reader)
val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 3)
        self.linear2 = torch.nn.Linear(3, 2)
        self.conv = torch.nn.Conv1d(2, 2, 2)

    def forward(self, inputs):
        pass


print('Using individual regularizers:')
model = Net()
init_const = ConstantInitializer(val=10.)
init_const(model.linear1.weight)
init_const(model.linear2.weight)

l1_regularizer = L1Regularizer(alpha=0.01)
print(l1_regularizer(model.linear1.weight))  # 0.01 * 10 * 6 = 0.6

l2_regularizer = L2Regularizer(alpha=0.01)
print(l2_regularizer(model.linear2.weight))  # 0.01 * (10)^2 * 6 = 6.0

print('Using an applicator:')
applicator = RegularizerApplicator(regexes=[
    ('linear1.weight', L1Regularizer(alpha=0.01)),
    ('linear2.weight', L2Regularizer()),
])
print(applicator(model))  # 0.6 + 6.0 = 6.6