import torch

# `load_dataset` (a dict mapping task names to reader functions), `cv_split`,
# and `cv_split2` are assumed to be defined in the repo's data-loading module.


def load_data(data_dir, task, cv, training_data_size=None):
    # Load the raw data from file. TREC ships with a fixed test set, SST
    # comes fully pre-split, and the remaining tasks return (data, label).
    if task == "trec":
        train_x, train_y, test_x, test_y = load_dataset[task](data_dir)
        data = train_x + test_x
        label = None
    elif task == "sst":
        train_x, train_y, valid_x, valid_y, test_x, test_y = load_dataset[
            task](data_dir)
        data = train_x + valid_x + test_x
        label = None
    else:
        data, label = load_dataset[task](data_dir)

    # Split: TREC only needs a validation fold carved out of its training
    # set; SST is already split; everything else uses 10-fold CV.
    if task == "trec":
        train_x, train_y, valid_x, valid_y = cv_split2(
            train_x, train_y, nfold=10, valid_id=cv)
    elif task != "sst":
        train_x, train_y, valid_x, valid_y, test_x, test_y = cv_split(
            data, label, nfold=10, test_id=cv)
    nclasses = max(train_y) + 1

    # Optionally truncate the training set to the requested size.
    if training_data_size:
        train_x = train_x[:training_data_size]
        train_y = train_y[:training_data_size]

    train_y = torch.LongTensor(train_y)
    valid_y = torch.LongTensor(valid_y)
    test_y = torch.LongTensor(test_y)
    dataset = {
        "train": (train_x, train_y),
        "valid": (valid_x, valid_y),
        "test": (test_x, test_y),
        "nclasses": nclasses,
    }
    return dataset, data
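
if __name__ == "__main__":
    # Example usage (a minimal sketch, not part of the original code):
    # load SST with fold id 0 and cap the training set at 5000 examples.
    # "data/sst" is an illustrative path, not one the repo guarantees.
    dataset, corpus = load_data("data/sst", task="sst", cv=0,
                                training_data_size=5000)
    train_x, train_y = dataset["train"]
    print("classes:", dataset["nclasses"], "train examples:", len(train_x))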
import sys

import torch.optim as optim

import dataloader
import modules

# `Model` and `train_model` are assumed to be defined elsewhere in this file
# or repo; they are used below but not shown in this snippet.


def main(args):
    # Read the raw corpus; MR/SUBJ/CR/MPQA come back as (data, label) pairs,
    # while TREC and SST ship with their own splits.
    if args.dataset == 'mr':
        data, label = dataloader.read_MR(args.path)
    elif args.dataset == 'subj':
        data, label = dataloader.read_SUBJ(args.path)
    elif args.dataset == 'cr':
        data, label = dataloader.read_CR(args.path)
    elif args.dataset == 'mpqa':
        data, label = dataloader.read_MPQA(args.path)
    elif args.dataset == 'trec':
        train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
        data = train_x + test_x
        label = None
    elif args.dataset == 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = \
            dataloader.read_SST(args.path)
        data = train_x + valid_x + test_x
        label = None
    else:
        raise Exception("unknown dataset: {}".format(args.dataset))

    # Build the embedding layer from the corpus vocabulary and the
    # pre-trained word vectors.
    emb_layer = modules.EmbeddingLayer(
        args.d, data,
        embs=dataloader.load_embedding(args.embedding)
    )

    if args.dataset == 'trec':
        train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
            train_x, train_y,
            nfold=10,
            valid_id=args.cv
        )
    elif args.dataset != 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
            data, label,
            nfold=10,
            test_id=args.cv
        )
    nclasses = max(train_y) + 1

    # Batch each split; SST batches are sorted by length.
    train_x, train_y = dataloader.create_batches(
        train_x, train_y, args.batch_size, emb_layer.word2id,
        sort=args.dataset == 'sst'
    )
    valid_x, valid_y = dataloader.create_batches(
        valid_x, valid_y, args.batch_size, emb_layer.word2id,
        sort=args.dataset == 'sst'
    )
    test_x, test_y = dataloader.create_batches(
        test_x, test_y, args.batch_size, emb_layer.word2id,
        sort=args.dataset == 'sst'
    )

    model = Model(args, emb_layer, nclasses).cuda()
    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(
        filter(need_grad, model.parameters()),
        lr=args.lr
    )

    best_valid = 1e+8
    test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_valid, test_err = train_model(
            epoch, model, optimizer,
            train_x, train_y,
            valid_x, valid_y,
            test_x, test_y,
            best_valid, test_err
        )
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    sys.stdout.write("best_valid: {:.6f}\n".format(best_valid))
    sys.stdout.write("test_err: {:.6f}\n".format(test_err))
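
if __name__ == "__main__":
    import argparse

    # A plausible command-line driver (a sketch, not the repo's actual
    # argument parser): the flag names follow the attributes `main` reads
    # from `args`, and every default here is illustrative.
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dataset", type=str, default="mr",
                           help="mr | subj | cr | mpqa | trec | sst")
    argparser.add_argument("--path", type=str, required=True,
                           help="path to the dataset files")
    argparser.add_argument("--embedding", type=str, required=True,
                           help="path to pre-trained word vectors")
    argparser.add_argument("--d", type=int, default=128,
                           help="hidden dimension")
    argparser.add_argument("--batch_size", type=int, default=32)
    argparser.add_argument("--max_epoch", type=int, default=100)
    argparser.add_argument("--lr", type=float, default=0.001)
    argparser.add_argument("--lr_decay", type=float, default=0.0)
    argparser.add_argument("--cv", type=int, default=0,
                           help="cross-validation fold id")
    main(argparser.parse_args())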
import csv
import sys

import torch.optim as optim

import dataloader
import modules


def main(args):
    # Benchmark every dataset with three architectures (SRU, CNN, LSTM),
    # repeating each configuration `numberOfTest` times and dumping one
    # CSV of per-epoch results per run.
    datasetList = ['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst']
    numberOfTest = 5
    args.max_epoch = 100
    for dset in datasetList:
        if dset == 'mr':
            data, label = dataloader.read_MR(args.path)
        elif dset == 'subj':
            data, label = dataloader.read_SUBJ(args.path)
        elif dset == 'cr':
            data, label = dataloader.read_CR(args.path)
        elif dset == 'mpqa':
            data, label = dataloader.read_MPQA(args.path)
        elif dset == 'trec':
            train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
            data = train_x + test_x
            label = None
        elif dset == 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = \
                dataloader.read_SST(args.path)
            data = train_x + valid_x + test_x
            label = None
        else:
            raise Exception("unknown dataset: {}".format(dset))

        emb_layer = modules.EmbeddingLayer(
            args.d, data,
            embs=dataloader.load_embedding(args.embedding)
        )

        if dset == 'trec':
            train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
                train_x, train_y,
                nfold=10,
                valid_id=args.cv
            )
        elif dset != 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
                data, label,
                nfold=10,
                test_id=args.cv
            )
        nclasses = max(train_y) + 1

        train_x, train_y = dataloader.create_batches(
            train_x, train_y, args.batch_size, emb_layer.word2id,
            sort=dset == 'sst')
        valid_x, valid_y = dataloader.create_batches(
            valid_x, valid_y, args.batch_size, emb_layer.word2id,
            sort=dset == 'sst')
        test_x, test_y = dataloader.create_batches(
            test_x, test_y, args.batch_size, emb_layer.word2id,
            sort=dset == 'sst')

        for models in range(3):
            if models == 1:
                args.cnn = True
                modelName = 'CNN'
            elif models == 2:
                args.cnn = False
                args.lstm = True
                modelName = 'LSTM'
            else:
                # Reset both flags explicitly so a stale value from a prior
                # iteration (or a CLI default) cannot mislabel this run as SRU.
                args.cnn = False
                args.lstm = False
                modelName = 'SRU'
            sys.stdout.write(
                "Training {} with {} architecture: \n".format(dset, modelName))
            args.dropout = 0.5

            for testNo in range(numberOfTest):
                model = Model(args, emb_layer, nclasses).cuda()
                need_grad = lambda x: x.requires_grad
                optimizer = optim.Adam(
                    filter(need_grad, model.parameters()), lr=args.lr)

                best_valid = 1e+8
                test_err = 1e+8
                results = []
                for epoch in range(args.max_epoch):
                    results.append(train_model(
                        epoch, model, optimizer,
                        train_x, train_y,
                        valid_x, valid_y,
                        test_x, test_y,
                        best_valid, test_err))

                # csv.writer needs a text-mode file in Python 3; the
                # original 'wb' mode is a Python 2 leftover.
                with open('results_{d}_{m}_{i}.csv'.format(
                        d=dset, m=modelName, i=(testNo + 1)),
                        'w', newline='') as dump:
                    wr = csv.writer(dump, delimiter=',')
                    wr.writerow(['Epoch', 'Training Loss', 'Validation Error',
                                 'Test Error', 'Duration'])
                    for value in results:
                        wr.writerow(value)
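
def summarize(dset, model_name, n_runs=5):
    # A minimal sketch (not part of the original code) of aggregating the
    # CSVs the benchmark loop writes: average the last-epoch test error
    # across the repeated runs of one dataset/architecture pair. It assumes
    # each row matches the header written above, with 'Test Error' in
    # column index 3 as a numeric value.
    finals = []
    for i in range(1, n_runs + 1):
        with open('results_{d}_{m}_{i}.csv'.format(
                d=dset, m=model_name, i=i), newline='') as f:
            rows = list(csv.reader(f))
        # rows[0] is the header row; take the final epoch's test error.
        finals.append(float(rows[-1][3]))
    return sum(finals) / len(finals)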