def __init__(self, opts):
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    self.opts = opts
    self.log = Log(opts)
    self.device = returnDevice(opts.cuda, opts.gpu_number)
    self.lr = opts.lr
    self.lr_decay_rate = opts.lr_decay_rate
    self.batch_size = opts.batch_size
    self.max_len = opts.max_len
    self.epochs = opts.epochs
    self.print_every_step = opts.print_every_step
    self.early_stop = opts.early_stop  # 0 disables early stopping
    self.lr_decay_every = opts.lr_decay_every
    self.weight_decay = opts.weight_decay
    self.shuffle = opts.shuffle
    self.best_model_name = ''
    self.vocab = load_pkl_data(opts.vocab_path)
    self.label_id = load_pkl_data(opts.label_id_path)
    self.best_score = 0
    self.best_score_epoch = 0
    self.model = self.get_model()
    self.optimizer = self.get_optim(opts.optims)
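# `self.get_optim(opts.optims)` above is not shown in this snippet. The following is only
# a minimal, hypothetical sketch of such a dispatcher, assuming the option strings 'adam'
# and 'sgd'; the actual project method may support different names or optimizers.
def get_optim(self, optim_name):
    if optim_name == 'adam':
        return torch.optim.Adam(self.model.parameters(), lr=self.lr,
                                weight_decay=self.weight_decay)
    if optim_name == 'sgd':
        return torch.optim.SGD(self.model.parameters(), lr=self.lr,
                               weight_decay=self.weight_decay)
    raise ValueError(f"Unsupported optimizer: {optim_name}")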
def convert_to_features(df_data, save_path, is_train=False):
    if is_train:
        if os.path.exists(save_path):
            # Reload previously cached training feature chunks
            dataset = []
            for root, dirs, files in os.walk(save_path):
                for file in files:
                    dataset.extend(load_pkl_data(os.path.join(root, file)))
        else:
            os.makedirs(save_path)
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            # Materialize the features and cache them in chunks of 50,000 examples
            datas = []
            data = []
            batch_id = 1
            tk = tqdm(dataset, total=len(dataset))
            for bi, item in enumerate(tk):
                data.append(item)
                if len(data) == 50000 or bi == len(dataset) - 1:
                    path = save_path + f"/train_features_{batch_id}.pkl"
                    save_pkl_data(data, path)
                    batch_id += 1
                    datas.extend(data)
                    data = []
            dataset = datas
    else:
        if os.path.exists(save_path):
            dataset = load_pkl_data(save_path)
        else:
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            tk = tqdm(dataset, total=len(dataset))
            dataset = [item for item in tk]
            save_pkl_data(dataset, save_path)
    return dataset
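# Hypothetical usage sketch for convert_to_features. The CSV path and cache directory
# below are placeholders, not paths from this project; the DataFrame is expected to
# provide the data_id, tag, text, candidate and groundTruth columns used above.
if __name__ == "__main__":
    import pandas as pd
    df_train = pd.read_csv("train.csv")
    train_features = convert_to_features(df_train,
                                         save_path="cache/train_features",
                                         is_train=True)
    print(f"cached {len(train_features)} tokenized training examples")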
def just_test(model, filename, postfix=None, amount=None, use_softmax=False,
              indicator=False):
    """
    Given the model object and a previously stored weights file, restore the
    weights, load the testing data, and predict the labels.

    Args:
        model: Keras model object.
        filename(str): Filename of the trained weights file.
        postfix(str): Filename postfix used when loading the pickled testing data.
        amount(int): Use only the first `amount` examples of the data.
        use_softmax(bool): Forwarded to `predict`.
        indicator(bool): Forwarded to `load_pkl_data`.
    """
    model_dir = "model/"
    print("Restoring best weights from: {:s}".format(filename))
    model.load_weights(filename)
    X, Z, y, d = load_pkl_data(model_dir, "testing", postfix, indicator=indicator)
    predict(model, X, Z, y, model_file=filename, output="results-test.txt",
            amount=amount, use_softmax=use_softmax)
def get_loaders(batch_size, data_dir='hw2_data', test=False):
    train_ind = utils.load_pkl_data('snli_train_ind.p', data_dir=data_dir)
    val_ind = utils.load_pkl_data('snli_val_ind.p', data_dir=data_dir)
    train_target = utils.load_pkl_data('snli_train_target.p', data_dir=data_dir)
    val_target = utils.load_pkl_data('snli_val_target.p', data_dir=data_dir)
    if test:
        # Test mode: use only the first 5 batches worth of training data
        train_dataset = SNLI_Dataset(train_ind[:5 * batch_size], train_target)
    else:
        train_dataset = SNLI_Dataset(train_ind, train_target)
    val_dataset = SNLI_Dataset(val_ind, val_target)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            collate_fn=collate_fn)
    return train_loader, val_loader
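# Minimal usage sketch for get_loaders (a hypothetical helper, not part of the original
# file); the tuple unpacking mirrors the collate_fn shown later, which returns padded
# premises, hypotheses, their lengths, and the targets.
def _smoke_test_loaders(batch_size=32):
    train_loader, val_loader = get_loaders(batch_size, test=True)
    premises, hypos, premise_lens, hypo_lens, targets = next(iter(train_loader))
    print(premises.shape, hypos.shape, targets.shape)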
import sys

import torch

import utils
import models
import data
import train_helpers

model_dir = sys.argv[1]
hidden_size = int(sys.argv[2])
interaction_type = sys.argv[3]
kind = sys.argv[4]
epoch = sys.argv[5]
batch_ix = sys.argv[6]
batch_size = 32

ind2vec = utils.load_pkl_data('ind2vec.p', data_dir='vocab')
_, val_loader = data.get_loaders(batch_size, data_dir='hw2_data')
loss_fn = torch.nn.CrossEntropyLoss()

fmodel = f'epoch_{epoch}_batch_{batch_ix}.pt'
print('model: ' + fmodel)
model = models.SNLI_Model(ind2vec, 300, hidden_size, hidden_size, 80,
                          interaction_type, 'cpu', kind, num_layers=1,
                          bidirectional=True, kernel_size=3)
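# The snippet above stops after building the model. A plausible (hedged) continuation
# would restore the checkpoint named in `fmodel` and switch to evaluation mode before
# running the validation loader; whether the file stores a state_dict or a full model
# object is an assumption here.
state = torch.load(f'{model_dir}/{fmodel}', map_location='cpu')
model.load_state_dict(state)
model.eval()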
def run():
    """
    Run prediction with the trained model on the train and dev sets and pick
    the best decision threshold on the dev set.
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)
    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=False, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(
        query=df_valid.sentence1.values,
        question=df_valid.sentence2.values,
        label=df_valid.label.values,
    )
    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda")

    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True

    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Predict on the training set
    pred_labels, wmd, acc, f1, auc = predict(train_data_loader, model, device)
    logger.info(f"train set : acc = {acc}, f1 score = {f1}, auc = {auc}")
    df_train["pred_label"] = pred_labels
    df_train["wmd"] = wmd
    df_train.to_csv("../output/train_predict.csv")

    # Sweep decision thresholds on the dev set and keep the best F1
    thresholds = [0.25, 0.23]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        pred_labels, wmd, acc, f1, auc = predict(valid_data_loader, model,
                                                 device, threshold)
        logger.info(
            f"dev set : threshold={threshold} acc = {acc}, f1 score = {f1}, auc = {auc}"
        )
        if f1 > best_f1:
            best_f1 = f1
            best_th = threshold
    print(f"best threshold: {best_th} with best f1 {best_f1}")
    df_valid["pred_label"] = pred_labels
    df_valid["wmd"] = wmd
    df_valid.to_csv("../output/dev_predict.csv")
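# `predict` above is not shown. The helper below is only a minimal sketch of the
# thresholding step it presumably performs (turning Word Mover's Distance scores into
# binary labels and metrics); the function name and metric choices are assumptions,
# not the project's actual implementation.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def wmd_to_labels(wmd_scores, true_labels, threshold=0.25):
    # Smaller distance means more similar, so distances below the threshold map to label 1
    pred_labels = [1 if d < threshold else 0 for d in wmd_scores]
    acc = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    auc = roc_auc_score(true_labels, [-d for d in wmd_scores])
    return pred_labels, acc, f1, auc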
def train():
    """
    Train the model for a specified fold.
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)
    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)
    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda:2")

    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True

    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())

    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]

    # Instantiate AdamW optimizer with our two sets of parameters and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing
    # during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5 and then decreases linearly
    # at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Apply early stopping with a patience of 2: stop training new epochs when
    # 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")

    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        # Note: early stopping ended training after 3 epochs even though 5 were specified
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader,
                     model,
                     optimizer,
                     device,
                     scheduler=scheduler,
                     threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)
            # logger.info(f"acc = {acc}, f1 score = {f1}")
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                if f1 > best_f1:
                    best_f1 = f1
                    best_th = threshold
                print("Early stopping ********")
                break
    logger.info(f"best threshold:{best_th}, best f1 :{best_f1}")
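# `utils.EarlyStopping(patience=2, mode="max")` above is not defined in this file. The
# class below is only a minimal sketch with the same call signature (score, model,
# model_path) and the max-mode/patience semantics described in the comments; the
# project's own helper may differ.
class EarlyStopping:
    def __init__(self, patience=2, mode="max"):
        self.patience = patience
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # In "max" mode a higher score is better; flip the sign for "min" mode
        metric = score if self.mode == "max" else -score
        if self.best_score is None or metric > self.best_score:
            self.best_score = metric
            self.counter = 0
            torch.save(model.state_dict(), model_path)  # checkpoint on improvement
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True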
def run(args):
    # Add underscore to the tag
    args.tag = ("_" + args.tag) if args.tag is not None else ""
    # Parse prefix and postfix
    prefix = "{}{}".format("-Subword" if args.subword else "",
                           "-Attention" if args.attention else "")
    postfix = "{}{}{}".format("_subword" if args.subword else "",
                              ("_" + args.data_tag) if args.data_tag is not None else "",
                              ("_d" if args.description else ""))
    # Parse directory name
    if not args.model_dir.endswith("/"):
        args.model_dir += "/"

    if args.matching:
        print("Matching problem.")

    #########################################
    # Load tokenizers (TO-BE-REVISED)
    tokenizers = pkl.load(open(args.tokenizers, "rb"))
    n_classes = len(tokenizers["mlb"].classes_)
    try:
        desc_tokenizer = tokenizers["description"]
    except KeyError:
        desc_tokenizer = None

    #########################################
    # Building the model
    print("Building computational graph...")
    model = EntityTypingNet(
        architecture=args.arch,
        n_classes=n_classes,
        context_tokenizer=tokenizers["context"],
        mention_tokenizer=tokenizers["mention"],
        desc_tokenizer=desc_tokenizer,
        context_emb=args.context_emb,
        context_embedding_dim=args.context_embedding_dim,
        mention_emb=args.mention_emb,
        mention_embedding_dim=args.mention_embedding_dim,
        desc_emb=args.desc_emb,
        desc_embedding_dim=args.desc_embedding_dim,
        same_emb=args.same_emb,
        n_words=MAX_NUM_WORDS,
        n_mention=MAX_NUM_MENTION_WORDS,
        n_description=MAX_NUM_DESCRIPTION_WORDS,
        len_context=MAX_SEQUENCE_LENGTH,
        len_mention=MAX_MENTION_LENGTH,
        len_description=MAX_DESCRIPTION_LENGTH,
        attention=args.attention,
        subword=args.subword,
        indicator=args.indicator,
        description=False,  # args.description,
        matching=args.matching,
        merge_mode=args.merge_mode,
        dropout=args.dropout,
        use_softmax=args.use_softmax,
        optimizer=args.optimizer,
        learning_rate=args.learning_rate)
    print(model.summary())

    # Save weights at the end of each epoch
    save_prefix = "{:s}{:s}-weights{:s}".format(args.arch, prefix, args.tag)
    filename = save_prefix + "-{epoch:02d}.hdf5"
    checkpoint = ModelCheckpoint(filename,
                                 monitor="val_loss",
                                 verbose=1,
                                 save_best_only=False,
                                 mode="min")
    early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    callbacks_list = [checkpoint, early]

    X_train, Z_train, y_train, D_train = load_pkl_data(args.model_dir,
                                                       "training",
                                                       postfix,
                                                       indicator=args.indicator,
                                                       matching=args.matching)
    ######################################################
    """
    print(X_train.shape, y_train.shape)
    print("Stacking positive samples")
    n_instance = X_train.shape[0] // 6
    idxs = [i * 6 for i in range(n_instance)]
    tmp = np.vstack([X_train[idxs] for _ in range(4)])
    X_train = np.vstack([X_train, tmp])
    del tmp
    tmp = np.vstack([Z_train[idxs] for _ in range(4)])
    Z_train = np.vstack([Z_train, tmp])
    del tmp
    tmp = np.hstack([y_train[idxs] for _ in range(4)])
    y_train = np.hstack([y_train, tmp])
    del tmp
    if args.description:
        tmp = np.vstack([D_train[idxs] for _ in range(4)])
        D_train = np.vstack([D_train, tmp])
    """
    ######################################################
    # input = [X_train, Z_train]
    print(X_train.shape, Z_train.shape, y_train.shape)
    # if args.use_softmax:
    #     y_train = np.array(mlb.inverse_transform(y_train)).flatten()
    input = [X_train, Z_train, D_train] if args.description else [X_train, Z_train]

    print("Begin training...")
    model.fit(input,
              y_train,
              batch_size=args.batch_size,
              epochs=args.epochs,
              validation_split=0.01,
              callbacks=callbacks_list)

    # Evaluation
    record = 0
    index = 0
    X_val, Z_val, y_val, D_val = load_pkl_data(args.model_dir,
                                               "validation",
                                               postfix,
                                               indicator=args.indicator,
                                               description=args.description)

    print("Loading trained weights for validation...")
    for i in range(1, args.epochs + 1, 1):
        # Build the model name for each epoch
        model_name = "{:s}-{:02d}.hdf5".format(save_prefix, i)
        model.load_weights(model_name)
        f = predict(model,
                    X_val,
                    Z_val,
                    y_val,
                    model_name,
                    "results.txt",
                    return_mf1=True,
                    use_softmax=args.use_softmax)
        # When F1 scores tie, prefer the model trained for more epochs
        if record <= f:
            record = f
            index = i

    print("\n * Best micro-F1 at Validation: epoch #{:02d}".format(index))

    # Test the model with the best micro-F1 score
    model_name = "{:s}-{:02d}.hdf5".format(save_prefix, index)
    just_test(model=model,
              filename=model_name,
              postfix=postfix,
              use_softmax=args.use_softmax,
              indicator=args.indicator)

    K.clear_session()
def get_table_lookup(data_dir='vocab'):
    return utils.load_pkl_data('ind2vec.p', data_dir=data_dir)
                      mode='constant', constant_values=0)
    return (torch.from_numpy(premise_data), torch.from_numpy(hypo_data),
            torch.LongTensor(premise_lens), torch.LongTensor(hypo_lens),
            torch.LongTensor(targets))


class SNLI_Dataset(Dataset):
    max_len = MAX_LEN

    def __init__(self, data, target):
        # Truncate both premise and hypothesis to max_len tokens
        self.data = [[premise[:self.max_len], hypo[:self.max_len]]
                     for premise, hypo in data]
        self.target = target

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ix):
        x = self.data[ix]
        lens = [len(x[0]), len(x[1])]
        target = self.target[ix]
        return x, lens, target


if __name__ == '__main__':
    train_data = utils.load_pkl_data('snli_train_ind.p')
    val_data = utils.load_pkl_data('snli_val_ind.p')
    print('Max length sentence:', get_max_len(train_data, val_data))