def do(config):
    """Read a corpus, preprocess it, train a Word2Vec model, and save it.

    Args:
        config: namespace with data_path, sg ('CBOW' or skip-gram otherwise),
            size, window, min_count, workers, save_directory, ckpt_name.

    Side effects:
        Writes the trained model to config.save_directory/config.ckpt_name.
    """
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')
    # The original had two identical Word2Vec calls differing only in `sg`;
    # collapsed into one: sg=0 -> CBOW, sg=1 -> skip-gram.
    model = Word2Vec(
        sentences=X,
        size=config.size,  # NOTE(review): `size` is the gensim<4.0 name (4.0 renamed it `vector_size`) — confirm pinned version
        window=config.window,
        min_count=config.min_count,
        workers=config.workers,
        sg=0 if config.sg == 'CBOW' else 1,
    )
    print(model.wv.vectors.shape)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
def get_transform(nparray):
    """Preprocess a raw Framingham feature array with the training pipeline.

    Args:
        nparray: raw feature array in the framingham.csv column layout.

    Returns:
        The array transformed by the same preprocessing used at train time.
    """
    # Column groupings for the Framingham CHD dataset.
    numeric_cols = [
        "age", "cigsPerDay", "totChol", "sysBP",
        "diaBP", "BMI", "heartRate", "glucose",
    ]
    ordinal_cols = ["education"]
    binary_cols = [
        "male", "currentSmoker", "BPMeds",
        "prevalentStroke", "prevalentHyp", "diabetes",
    ]
    label_cols = ["TenYearCHD"]

    # Create Data object configured exactly as during training.
    data = Datasets(
        data_file="framingham.csv",
        cat_cols=binary_cols,
        num_cols=numeric_cols,
        level_cols=ordinal_cols,
        label_col=label_cols,
        train=True,
    )
    return data.preprocess_newdata(nparray)
def __init__(self):
    """Load the Qt main window UI, wire button signals, and init state."""
    super(Ui, self).__init__()
    uic.loadUi('mainwindow.ui', self)
    self.show()

    # File/directory pickers populate their paired line edits.
    self.load_dir_but.clicked.connect(
        lambda: load_dir_dialog(self, self.load_dir_lineEdit))
    self.load_drift_file_but.clicked.connect(
        lambda: load_file_dialog(self, self.load_drift_file_lineEdit))
    self.load_calibration_file_but.clicked.connect(
        lambda: load_file_dialog(self, self.load_calibration_file_lineEdit
                                 ))

    # Local helper: choosing a new model directory invalidates the loaded
    # model before opening the picker.
    # NOTE(review): defined inside __init__ and called with `self` explicitly
    # from the lambda below — presumably intentional; confirm it was not meant
    # to be a class-level method.
    def select_model_path(self, line_edit):
        self.model = None
        return load_dir_dialog(self, line_edit)

    self.select_model_path_but.clicked.connect(
        lambda: select_model_path(self, self.model_path_lineEdit))
    self.load_data.clicked.connect(self.load_data_func)
    self.label_data_but.clicked.connect(self.label_data_func)

    self.load_prefix_tables()

    # Mutable window state; None until the corresponding action runs.
    self.datasets = Datasets()
    self.model = None
    self.labeled_dataset = None
    self.original_dataset = None
def fit(self, **kwargs):
    """Train a Doc2Vec model on the tagged corpus and persist it to disk."""
    # Verbose gensim training logs.
    logging.basicConfig(format='%(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.root.level = logging.INFO

    corpus = Datasets().tagged_docs
    self.model = Doc2Vec(corpus, **DOC2VEC_PARAMS)
    self.model.save(self.__get_model_fpath())
def example():
    """End-to-end demo: download data, train the regressor, save predictions."""
    from datasets import Datasets

    data = Datasets()
    data.download()

    train_split = data.load()
    test_split = data.load(test=True)

    regressor = MyXGBRegressor(data)
    regressor.train(train_split)
    regressor.predict(test_split, save=True)
def test(model, args): model.eval() # Load Datasets dataset = Datasets(file_path=args.test_data_path, label_list=label_list, pretrained_type=args.pretrained_type) # Use custom batch function collate_fn = ClassificationBatchFunction(args.max_len, dataset.pad_idx, dataset.cls_idx, dataset.sep_idx) loader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, num_workers=8, pin_memory=True, collate_fn=collate_fn) loss, acc, f1, (total_y_hat, cm) = evaluate(args, loader, model, device) return loss, acc, f1, total_y_hat, cm
def do(config):
    """Read data, preprocess it, fit the model, and save a checkpoint."""
    print("Read data")
    data = Datasets(config.data_path).read_data()

    print("Data preprocessing..")
    prep = Preprocessing(config)
    x_train, y_train = prep.do(data)

    print("Model build..")
    model, callback = build(config, prep.vocab_size)
    # 20% of the training data is held out for validation each epoch.
    model.fit(x_train,
              y_train,
              epochs=config.epoch,
              callbacks=callback,
              batch_size=config.batch_size,
              validation_split=0.2)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
def train_model(self):
    """Compile the model and run generator-based training on the configured splits."""
    start = time.time()
    print(self.model.summary())

    dataset = Datasets(self.options)
    training_split = dataset.make_split_generator('train')
    validation_split = dataset.make_split_generator('validation')

    # No LR decay here; scheduling is left to the callbacks.
    self.model.compile(
        Adam(lr=self.options.init_lr, decay=0),
        loss=self.loss_function,
        metrics=self.metrics)

    self.model.fit_generator(
        training_split,
        initial_epoch=self.options.init_epoch,
        steps_per_epoch=self.options.steps_per_epoch,
        epochs=self.options.epochs,
        validation_data=validation_split,
        validation_steps=self.options.validation_steps,
        callbacks=self.make_callbacks())

    print('Training time cost: %0.2f(min).'%((time.time()-start)/60))
def main(args):
    """Execute a task based on the given command-line arguments.

    This function is the main entry-point of the program. It allows the user
    to extract features, train a model, compute predictions, and evaluate
    predictions using the command-line interface.
    """
    from datasets import Datasets

    datasets = Datasets(args.dataset_path)
    cmd = args.command

    if cmd == 'extract':
        extract(datasets.get(args.dataset), args)
    elif cmd == 'train':
        train(datasets.get('training'), args)
    elif cmd == 'predict':
        predict(datasets.get(args.dataset), args)
    elif cmd == 'evaluate':
        test_set = datasets.get('test')
        # A list of training ids means "evaluate every run".
        if isinstance(args.training_id, list):
            evaluate_all(test_set, args)
        else:
            evaluate(test_set, args)
def train(args):
    """Fine-tune BERT for Korean emotion classification.

    Loads a pretrained checkpoint, trains with gradient accumulation and
    optional fp16 (apex), logs to TensorBoard, keeps the checkpoint with the
    lowest validation loss, and appends a summary row to results.csv.

    Returns:
        (global_step, last train loss, last train acc, best val loss,
         best val acc, list of per-epoch validation summaries).
    """
    set_seed(args)

    # Set device
    if args.device == 'cuda':
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Set label list for classification.
    # NOTE(review): if args.num_label is neither 'multi' nor 'binary',
    # label_list is unbound and the next line raises NameError.
    if args.num_label == 'multi':
        label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오']
    elif args.num_label == 'binary':
        label_list = ['긍정', '부정']
    logger.info('use {} labels for training'.format(len(label_list)))

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained bert model(etri/skt)
        pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)

    # SKT checkpoints may lack the 'bert.' prefix on parameter names;
    # rename for consistency with this model's state dict.
    if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Change parameter name for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}

    bert_config = BertConfig(os.path.join(pretrained_path + '/bert_config.json'))
    bert_config.num_labels = len(label_list)
    model = BertForEmotionClassification(bert_config).to(device)
    # strict=False: classification head weights are not in the checkpoint.
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      label_list=label_list,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx,
                                             tr_set.cls_idx, tr_set.sep_idx)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           collate_fn=collate_fn)

    dev_set = Datasets(file_path=args.dev_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type,
                       max_len=args.max_len)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=args.eval_batch_size,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=False,
                            collate_fn=collate_fn)

    # optimizer
    optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate,
                                          layerwise_decay=args.layerwise_decay)

    # lr scheduler: optimizer steps happen once per accumulation window.
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps))
    # Use gradual warmup and linear decay
    scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0)

    # tensorboard setting: run directory encodes the hyperparameters.
    save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps,
        t_total, args.warmup_percent, args.max_len, args.pretrained_type)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    global_step = 0
    train_loss, train_acc, train_f1 = 0, 0, 0
    logging_loss, logging_acc, logging_f1 = 0, 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)):
            model.train()
            x_train, mask_train, y_train = map(lambda x: x.to(device), batch)
            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'classification_label': y_train,
            }
            output, loss = model(**inputs)
            y_max = output.max(dim=1)[1]
            cr = classification_report(y_train.tolist(),
                                       y_max.tolist(),
                                       labels=list(range(len(label_list))),
                                       target_names=label_list,
                                       output_dict=True)
            # Get accuracy(micro f1)
            if 'micro avg' not in cr.keys():
                # sklearn reports 'accuracy' (positionally after the per-label
                # rows) when every label is present in the batch.
                batch_acc = list(cr.items())[len(label_list)][1]
            else:
                # If at least one of labels does not exists in mini-batch, use micro average instead
                batch_acc = cr['micro avg']['f1-score']
            # macro f1
            batch_macro_f1 = cr['macro avg']['f1-score']

            # accumulate measures: scale so sums over the window average out.
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
                batch_macro_f1 /= grad_accu

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss += loss.item()
            train_acc += batch_acc
            train_f1 += batch_macro_f1

            # NOTE(review): the sibling pretraining loop gates on
            # (step + 1) % grad_accu; this one uses global_step — confirm
            # which is intended when grad_accu > 1.
            if (global_step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_step == 0:
                    # Windowed averages since the last logging step.
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    f1_ = (train_f1 - logging_f1) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('macro_f1', {'train': f1_}, global_step)
                    logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format(
                        global_step, t_total, loss_, acc_, f1_
                    ))
                    logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss

                    # Get f1 score for each label
                    f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items())
                                  if i < len(label_list)]
                    f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results])
                    logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format(
                        confusion_matrix(y_train.tolist(), y_max.tolist())))

        # Validation
        val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device)
        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}. val macro f1 : {:.3f}'.format(
            global_step, t_total, val_loss, val_acc, val_macro_f1
        )
        writer.add_scalars('loss', {'val': val_loss}, global_step)
        writer.add_scalars('acc', {'val': val_acc}, global_step)
        writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step)
        logger.info(val_result)
        total_result.append(val_result)

        if val_loss < best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_macro_f1 = val_macro_f1

    # Save results in 'model_saved_finetuning/results.csv'
    results = {
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'val_macro_f1': best_val_macro_f1,
        'save_dir': save_path,
        'pretrained_path': pretrained_path,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
# NOTE(review): fragment of a training entry point — the `global` statements
# imply this runs inside a function whose `def` is above this chunk; names
# like args, parameters, base_lr, model, save_path come from that scope.
global test_loader
test_dataset = TestKodakDataset(data_dir=args.val)
test_loader = DataLoader(dataset=test_dataset,
                         shuffle=False,
                         batch_size=1,
                         pin_memory=True,
                         num_workers=0)
# Test-only mode: evaluate on Kodak at the current step, then exit.
if args.test:
    testKodak(global_step)
    exit(-1)
optimizer = optim.Adam(parameters, lr=base_lr)
# save_model(model, 0)
global train_loader
tb_logger = SummaryWriter(os.path.join(save_path, 'events'))
train_data_dir = args.train
train_dataset = Datasets(train_data_dir, image_size)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          pin_memory=True,
                          num_workers=2)  # default 2
# Resume epoch inferred from the restored global_step.
steps_epoch = global_step // (
    len(train_dataset) // (batch_size))  # i think have some problem, remove batch_size
save_model(model, global_step, save_path)
for epoch in range(steps_epoch, tot_epoch):
    # LR schedule is keyed on global_step, not epoch.
    adjust_learning_rate(optimizer, global_step)
    if global_step > tot_step:
        save_model(model, global_step, save_path)
        break
    global_step = train(epoch, global_step)
def prepare_datasets(args):
    """Construct the datasets object from CLI args.

    Side effect: records len(vocab) on args.vocab_size so downstream model
    construction can read it.
    """
    ds = Datasets(**vars(args))
    args.vocab_size = len(ds.vocab)
    return ds
    # Tail of an argument-parser builder whose `def` lies above this chunk.
    parser.add_argument('--weights',
                        dest='weights',
                        default=None,
                        help='initialize with pretrained model weights',
                        type=str)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    # embedding_path / root_dir / max_sentence_len are module-level names
    # defined outside this chunk.
    embed_dict = get_embed_dict(embedding_path)
    # print(dataset1.columns)
    datasets = Datasets(root_dir)
    X, Y, tokenizer = datasets.get_tokenized_data(
        max_sentence_len=max_sentence_len)
    embedding_matrix, word_num = make_embedding(tokenizer, embed_dict)
    model = Network(word_num=word_num,
                    embedding_matrix=embedding_matrix,
                    maxlen=max_sentence_len)
    # if args.weights is not None:
    #     model.load_weights(args.weights)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', ])
def train(args):
    """Further-pretrain BERT with masked-language-modeling.

    Trains with gradient accumulation, optional fp16 (apex), warmup+cosine LR,
    optional validation (args.do_eval), periodic epoch checkpoints, and a
    summary row in results.csv.

    Returns:
        (global_step, last train loss, last train acc, best val loss,
         best val acc, list of validation summaries).
    """
    set_seed(args)
    # Set device
    if args.device == 'cuda':
        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained bert model(etri/skt)
        pretrained_model_path = os.path.join(pretrained_path,
                                             'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)

    # SKT checkpoints may lack the 'bert.' prefix; rename for consistency.
    if args.pretrained_type == 'skt' and 'bert.' not in list(
            pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Change parameter name for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}

    bert_config = BertConfig(
        os.path.join(pretrained_path + '/bert_config.json'))
    model = BertForMLM(bert_config).to(device)
    # strict=False: MLM head weights may be absent from the checkpoint.
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = MLMBatchFunction(args.max_len, tr_set.vocab)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           drop_last=True,
                           collate_fn=collate_fn)

    if args.do_eval:
        dev_set = Datasets(file_path=args.dev_data_path,
                           pretrained_type=args.pretrained_type,
                           max_len=args.max_len)
        dev_loader = DataLoader(dataset=dev_set,
                                batch_size=args.eval_batch_size,
                                num_workers=8,
                                pin_memory=True,
                                drop_last=False,
                                collate_fn=collate_fn)

    # optimizer
    optimizer = layerwise_decay_optimizer(
        model=model, lr=args.learning_rate,
        layerwise_decay=args.layerwise_decay)

    # lr scheduler: optimizer steps happen once per accumulation window.
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(
        t_total, warmup_steps))
    # Use gradual warmup and cosine decay
    scheduler = optimization.WarmupCosineWithHardRestartsSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level,
                                          verbosity=0)

    # tensorboard setting: run directory encodes the hyperparameters.
    save_path = "./model_saved_pretrain/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate,
        args.train_batch_size * args.gradient_accumulation_steps, t_total,
        args.warmup_percent, args.max_len, args.pretrained_type)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_pretrain/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    best_val_acc = 0
    global_step = 0
    train_loss, train_acc = 0, 0
    val_loss, val_acc = 0, 0
    logging_loss, logging_acc = 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader),
                                desc='steps',
                                total=len(tr_loader)):
            model.train()
            x_train, y_train, mask_train = map(lambda x: x.to(device), batch)
            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'masked_lm_labels': y_train,
            }
            output, loss = model(**inputs)
            y_max = output.max(dim=2)[1]

            # Get accuracy for maked tokens: -1 labels mark unmasked positions
            # and are excluded from the denominator.
            total_length = torch.ones_like(y_train).masked_fill(
                y_train == -1, 0).sum().item()
            total_sum = torch.zeros_like(y_max).masked_fill(
                y_max == y_train, 1).sum().item()
            batch_acc = total_sum / total_length

            # accumulate measures: scale so sums over the window average out.
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss += loss.item()
            train_acc += batch_acc

            if (step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_step == 0:
                    # Windowed averages since the last logging step.
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('lr', {'lr': scheduler.get_lr()[0]},
                                       global_step)
                    logger.info(
                        '[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}'.format(
                            global_step, t_total, loss_, acc_))
                    logging_acc, logging_loss = train_acc, train_loss

        if args.do_eval:
            # Validation
            val_loss, val_acc = evaluate(args, dev_loader, model, device)
            val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}'.format(
                global_step, t_total, val_loss, val_acc)
            logger.info(val_result)
            total_result.append(val_result)

            if val_loss <= best_val_loss:
                # Save model checkpoints
                torch.save(model.state_dict(),
                           os.path.join(save_path, 'best_model.bin'))
                torch.save(args, os.path.join(save_path, 'training_args.bin'))
                logger.info('Saving model checkpoint to %s', save_path)
                best_val_loss = val_loss
                best_val_acc = val_acc

        # Periodic epoch snapshot independent of validation quality.
        if (epoch + 1) % args.saving_step == 0:
            torch.save(
                model.state_dict(),
                os.path.join(save_path,
                             'epoch{}_model.bin'.format(epoch + 1)))

    # Save results in 'model_saved_pretrain/results.csv'
    results = {
        'train_loss': loss_,
        'train_acc': acc_,
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'save_dir': save_path,
        'global_step': global_step,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
def main(args):
    """Cross-validated AEP deep-GP regression benchmark.

    For each of args.splits train/test splits: train an SDGPR model, then
    append per-split RMSE, test log-likelihood and wall-clock training time
    to three output files, finishing with averages and std deviations.
    """
    num_layers = len(args.hidden_dims)
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        dgp_model = aep.SDGPR(X, Y, args.num_inducing, args.hidden_dims)
        print('Training DGP model...')
        t0 = time.time()
        dgp_model.optimise(method='Adam',
                           mb_size=args.batch_size,
                           adam_lr=args.learning_rate,
                           maxiter=args.iterations)
        t1 = time.time()
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        # flush+fsync so partial results survive a crash mid-benchmark.
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # Minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            # -(-a // b) is ceiling division.
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(
                    Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :])
                means.append(m)
                vars.append(v)
        else:
            m, v = dgp_model.predict_y(Xs)
            means.append(m)
            vars.append(v)

        mean_ND = np.concatenate(means, 0)
        var_ND = np.concatenate(vars, 0)
        # RMSE rescaled back to original units via Y_std.
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        test_nll = np.mean(
            norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND**0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
from gpflow.likelihoods import Gaussian
from gpflow.kernels import RBF, White
from gpflow.mean_functions import Constant
from gpflow.models.sgpr import SGPR, GPRFITC
from gpflow.models.svgp import SVGP
from gpflow.models.gpr import GPR
from gpflow.training import AdamOptimizer, ScipyOptimizer
from scipy.cluster.vq import kmeans2
from scipy.stats import norm
from scipy.special import logsumexp
from doubly_stochastic_dgp.dgp import DGP
from datasets import Datasets

# Load the kin8nm regression benchmark (default split).
datasets = Datasets(data_path='data/')
data = datasets.all_datasets['kin8nm'].get_data()
X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

# Inducing-point initialisations of three sizes via k-means on the inputs.
Z_100 = kmeans2(X, 100, minit='points')[0]
Z_5 = kmeans2(X, 5, minit='points')[0]
Z_500 = kmeans2(X, 500, minit='points')[0]


# NOTE(review): this definition continues past the end of this chunk.
def make_dgp(X, Y, Z, L):
    D = X.shape[1]

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
from datasets import Datasets
from taboo_search import TabooSearch

# Glob pattern locating the TSP instance files.
path = '/Users/michal/PycharmProjects/MRP/datasets/*.tsp'
data = Datasets(path)

# Alternative instances; uncomment one to run it instead.
# name = 'ali535'
# name = 'berlin11_modified'
# name = 'berlin52'
# name = 'fl417'
name = 'gr666'
# name = 'kroA100'
# name = 'kroA150'
# name = 'nrw1379'
# name = 'pr2392'

TS = TabooSearch(data, name)
stats = TS(taboo_list_size=30,
           count_of_neighbours=30,
           mutation_ratio=0.5,
           epochs=200)

# NOTE(review): stats.items() yields (key, value) pairs, printed here as
# log[0]/log[1] — presumably (best distance, best route) per epoch; confirm
# against TabooSearch's return structure.
for iterator, log in enumerate(stats.items()):
    print('EPOCH {}'.format(iterator))
    print('\t\tbest distance - {}'.format(log[0]))
    print('\t\tbest route - {}'.format(log[1]))
    print('\n')
from gpflow.likelihoods import Gaussian
from gpflow.kernels import RBF, White
from gpflow.training import AdamOptimizer

# A new (robust) class of model-based likelihoods.
from robustified_likelihoods import betaDivGaussian, gammaDivGaussian
from scipy.cluster.vq import kmeans2
from scipy.stats import norm
from scipy.special import logsumexp
from doubly_stochastic_dgp.dgp import DGP
from datasets import Datasets

# Path this file is in + /data
datasets = Datasets()

"""Function serves three purposes:
    (1) Reads relevant data/prepares the accurate split for index i
    (2) Calls 'DGP' for split i, which does the inference
    (3) Extracts and returns test performancee metrics to 'main'.
"""


# NOTE(review): this definition continues past the end of this chunk.
def get_test_error(i,
                   dataset,
                   alpha,
                   learning_rate=0.001,
                   iterations=20000,
                   white=True,
                   normalized=True,
                   num_inducing=100,
def main(args):
    """Cross-validated SVGP regression benchmark.

    For each of args.splits train/test splits: fit an SVGP (optionally with
    ARD lengthscales) on args.dataset, then append per-split RMSE, test
    log-likelihood and wall-clock training time to three output files,
    finishing with averages and standard deviations.
    """
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    # BUG FIX: this filename was previously built from 'svgp_ard)' (stray
    # parenthesis, missing underscore); now consistent with .rmse/.nll.
    outname3 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(
        args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        # Initialise inducing inputs with k-means centres.
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.batch_size if args.batch_size < X.shape[0]\
            else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up SVGP model...')
        if args.ard:
            # SE kernel with lengthscale per dimension
            kernel = SquaredExponential(
                lengthscale=[1.] * X.shape[1]) + White(variance=1e-5)
        else:
            # SE kernel with single lengthscale
            kernel = SquaredExponential(lengthscale=1.) + White(variance=1e-5)
        likelihood = Gaussian(variance=0.05)
        model = gpflow.models.SVGP(kernel=kernel,
                                   likelihood=likelihood,
                                   inducing_variable=Z)

        # =====================================================================
        # TRAINING
        # =====================================================================
        print('Training SVGP model...')
        optimiser = tf.optimizers.Adam(args.learning_rate)
        t0 = time.time()
        monitored_training_loop(model,
                                train_dataset,
                                optimiser=optimiser,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        # flush+fsync so partial results survive a crash mid-benchmark.
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            # -(-a // b) is ceiling division.
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = model.predict_y(Xs[mb * test_batch_size:(mb + 1) *
                                          test_batch_size, :])
                means.append(m)
                vars.append(v)
        else:
            m, v = model.predict_y(Xs)
            means.append(m)
            vars.append(v)

        mean_ND = np.concatenate(means, 0)  # [N, D]
        var_ND = np.concatenate(vars, 0)  # [N, D]

        # rmse, rescaled to original units via Y_std
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND**0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objects as go
from dash.dependencies import Input, Output
from dash.exceptions import PreventUpdate
from plotly.subplots import make_subplots

from datasets import Datasets
from ui_components import slider_choose_dates, covid_slider_mark_dates, \
    covid_selectors, caveat_markdown_text_covid

# Data sets
print("--------------------")
dat = Datasets()
# COVID case tables at state and county granularity.
covid_state, covid_county = dat.covid_data()
state_pop, county_pop = dat.population_data()
# geo_data() populates the attributes read below (side effect on `dat`).
dat.geo_data()
c_zip_fips = dat.c_zip_fips
counties_geojson = dat.counties_geojson
states_geojson = dat.states_geojson
county_latlong = dat.counties_latlon
state_latlong = dat.state_latlong
print("Completed loading datasets and computing rolled statistics")
print("--------------------")

# merge population
covid_state = pd.merge(covid_state, state_pop, on='state',
                       suffixes=('', '_'), how='inner')
# NOTE(review): county merge is 'left' (keeps counties lacking population
# data); the commented 'inner' suggests this was a deliberate change.
covid_county = pd.merge(covid_county, county_pop, on='fips',
                        suffixes=('', '_'), how='left')  # , how='inner')

# merge geography
    # Tail of CLI definition: the parser itself is created above this chunk.
    help='Path of the model to load when testing', required=False)

# Parse the arguments
arguments = parser.parse_args()
# A model path is mandatory for any non-training mode.
if (arguments.modelPath == None) and (arguments.mode != "train"):
    print(
        "Missing arg: You have to specify which model you want to load, to test the network."
    )
    sys.exit()

config = HandConfig()
config.display()

# 80/20 train/validation split over the image directory.
myDatasets = Datasets(arguments.imageDir,
                      arguments.testDataset,
                      trainSplit=0.8)
imagePaths, masksPath, testImagePaths, testMasksPath, trainIdxs, valIdxs, testIdxs = myDatasets.split_indexes(
)

if arguments.testDataset:
    dataset_test = HandDataset(testImagePaths, testMasksPath)
    dataset_test = myDatasets.prepare_dataset(dataset_test, testIdxs,
                                              config.IMAGE_SHAPE)
else:
    # Same image/mask sources; train/val differ only by the index lists.
    dataset_train = HandDataset(imagePaths, masksPath)
    dataset_val = HandDataset(imagePaths, masksPath)
    dataset_train = myDatasets.prepare_dataset(dataset_train, trainIdxs,
                                               config.IMAGE_SHAPE)
    dataset_val = myDatasets.prepare_dataset(dataset_val, valIdxs,
                                             config.IMAGE_SHAPE)
if __name__ == "__main__":
    # Log everything at INFO level to a file.
    # formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    logging.basicConfig(filename='logfile.log', level=logging.INFO)
    formatter = logging.Formatter(
        '[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    logger = get_logger('ML_struct')

    parser = argparse.ArgumentParser(
        description='Character-level convolutional neural network for text classification')
    parser.add_argument('--config', '--c', type=str, metavar='yaml FILE',
                        help='where to load YAML configuration')
    args = parser.parse_args()
    print(args)
    print(args.config)

    yaml_file = args.config
    with open(yaml_file) as f:
        # BUG FIX: yaml.load without an explicit Loader is deprecated and can
        # execute arbitrary tags; safe_load parses plain config safely.
        cfg = yaml.safe_load(f)
    print(cfg)

    # TODO: handle the case where the datasets live at a different path.
    train_dataset, val_dataset = Datasets()
    train_loader, val_loader = dataLoader(train_dataset, val_dataset)

    cnn = CNNClassifier()
    device = torch.device("cuda")
    print(device)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # Wrap for multi-GPU data parallelism before moving to the device.
        cnn = nn.DataParallel(cnn)
        cnn.to(device)
    else:
        cnn.to(device)

    # loss
    criterion = nn.CrossEntropyLoss()
import os

from datasets import Datasets
from label import Classify

data_dir = 'data/'
datasets_dir = data_dir + 'datasets/'
datasets = Datasets(datasets_dir)


def configure_app(app):
    """Apply static configuration to the web app and reset the global bundle."""
    app.config.debug = True
    app.config.port = 8080
    app.config.host = "0.0.0.0"
    app.config.LOGO = None
    global datasets_bundle
    datasets_bundle = {}


def load_dataset_graphs():
    """Load a retrained classification graph for every known dataset.

    A dataset whose graph fails to load keeps an empty list as its bundle
    entry, so later lookups never raise KeyError (best-effort loading).
    """
    for dataset in datasets.get():
        dataset_name = dataset['name']
        dataset_path = dataset['path']
        # FIX: os.path.join is robust to a missing trailing separator in
        # dataset_path (plain concatenation silently produced a bad path).
        graph_path = os.path.join(dataset_path, "retrained_graph.pb")
        datasets_bundle[dataset_name] = []
        try:
            datasets_bundle[dataset_name] = Classify(graph=graph_path)
        except Exception as e:
            # Best-effort: log and continue with the empty placeholder.
            print(e)
# NOTE(review): this chunk opens with the `else:` arm of a device-selection
# check whose `if` branch lies before this excerpt (presumably CUDA
# availability — confirm against the preceding lines).
else:
    device = torch.device("cpu")
    print("cpu mode")

# the name of results files
codename = 'ad_ac_example'
fnnname = codename + "_fnn_model"
total_loss_name = codename + "_total_loss"
acc_name = codename + "_accuracy"
soft_loss_name = codename + "_softmax_loss"
ad_disc_loss_name = codename + "_adaptivediscriminant_loss"
ad_cen_loss_name = codename + "_adaptivecenter_loss"
result_name = codename + "_result"

# load the data set
instance_datasets = Datasets(DATASET, BATCH_SIZE, NUM_WORKERS)
data_sets = instance_datasets.create()
# create() returns a positional bundle; unpacked by fixed index below.
trainloader = data_sets[0]
testloader = data_sets[1]
classes = data_sets[2]
based_labels = data_sets[3]
trainset = data_sets[4]
testset = data_sets[5]

# network and criterions
model = Net(FEATURE, OUTPUTS).to(device)
# NOTE(review): the optim.SGD(...) call continues past this excerpt.
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE,
                      momentum=MOMENTUM,
def main(args):
    """Train a DGP on each split of a dataset; log test NLL and train time.

    Writes per-split results plus averages to ``../tmp/<dataset>_<layers>_
    <inducing>.nll`` and ``.time``.
    """
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        # Inducing points initialised by k-means on the training inputs.
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1],
                        kernels,
                        Gaussian(variance=0.05),
                        Z,
                        num_outputs=Y.shape[1],
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            # One Adam step on the negative ELBO.
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
                grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)
            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)
                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(
                        f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        # Test NLL via Monte-Carlo samples from the predictive distribution,
        # rescaled back to the original target units by Y_std.
        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        # BUG FIX: accumulate the test NLL here. The original added `t1 - t0`
        # (the training time) a second time, so the 'Average' line written to
        # the .nll file reported mean training time instead of mean NLL.
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
"glucose", ] level_var = ["education"] category_var = [ "male", "currentSmoker", "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", ] target = ["TenYearCHD"] # Create Data object data = Datasets( data_file=data_file, cat_cols=category_var, num_cols=numeric_var, level_cols=level_var, label_col=target, train=True, ) X_train = data.feature_train y_train = data.target_train X_test = data.feature_test y_test = data.feature_test # training data models = Model(data)
def main(args):
    """Cross-validate a DGP regressor; log per-split RMSE, NLL, and time.

    Results (per split plus average and standard deviation) are written to
    ``<results_dir><dataset>_<layers>_<inducing>.{rmse,nll,time}``.
    """
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)

    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')

        # get dataset
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        dims = []
        # Hidden layers are capped at max_dim; the first layer keeps the
        # input dimensionality.
        hidden_dim = X.shape[1] if X.shape[1] < args.max_dim else args.max_dim
        for l in range(args.num_layers):
            if l == 0:
                dim = X.shape[1]
                dims.append(dim)
            else:
                dim = hidden_dim
                dims.append(dim)
            if args.ard:
                # SE kernel with lengthscale per dimension
                kernels.append(
                    SquaredExponential(lengthscale=[1.] * dim) +
                    White(variance=1e-5))
            else:
                # SE kernel with single lengthscale
                kernels.append(
                    SquaredExponential(lengthscale=1.) + White(variance=1e-5))
        # output dim
        dims.append(Y.shape[1])

        dgp_model = DGP(X,
                        Y,
                        Z,
                        dims,
                        kernels,
                        Gaussian(variance=0.05),
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        # =====================================================================
        # TRAINING
        # =====================================================================
        optimiser = tf.optimizers.Adam(args.learning_rate)

        print('Training DGP model...')
        t0 = time.time()

        # training loop
        monitored_training_loop(dgp_model,
                                train_dataset,
                                optimiser=optimiser,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # Minibatch test predictions.
        # FIX: renamed `vars` -> `variances` to stop shadowing builtin vars().
        means, variances = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            # Ceiling division over the test set.
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(Xs[mb * test_batch_size:(mb + 1) *
                                              test_batch_size, :],
                                           num_samples=args.test_samples)
                means.append(m)
                variances.append(v)
        else:
            m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
            means.append(m)
            variances.append(v)

        mean_SND = np.concatenate(means, 1)  # [S, N, D]
        var_SND = np.concatenate(variances, 1)  # [S, N, D]
        mean_ND = np.mean(mean_SND, 0)  # [N, D]

        # rmse (rescaled back to original units by Y_std)
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, mean_SND * Y_std,
                                  var_SND**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
logger = get_logger(logging_level) # Remove existing inference results if os.path.exists('result_' + test_file): os.remove('result_' + test_file) if os.path.exists('gt_' + test_file): os.remove('gt_' + test_file) path = os.path.join(args.model_dir, model_file_name) datasets = Datasets(train_file, validation_file, test_file, embedding_file, max_train_size, image_encoding_algo, use_keyword=use_keyword) logger.info("Max question len: %s" % datasets.max_question_len) logger.info("Max training samples: %s" % datasets.max_samples) logger.info("Vocabulary: %s" % str(len(datasets.vocabulary))) question_generator = QuestionGenerationModel(datasets, logger, hidden_units, dropout) question_generator.input_shape = image_embedding_dim # Calculate image features and store them if save is True obj_dir = os.path.join('data', dataset, 'obj') if not os.path.exists(obj_dir): os.makedirs(obj_dir)