import numpy as np
import pandas as pd
import chainer.functions as F
from chainer import cuda

# dataset, model, args, Progress and compute_accuracy are defined elsewhere
# in this script.


def main():
    # load MNIST images
    images, labels = dataset.load_train_images()

    # config
    config = model.config

    # settings
    max_epoch = 10000
    num_trains_per_epoch = 5000
    num_validation_data = 10000
    batchsize = 128

    # seed
    np.random.seed(args.seed)
    if args.gpu_device != -1:
        cuda.cupy.random.seed(args.seed)

    # save validation accuracy per epoch
    csv_results = []

    # create semi-supervised split
    training_images, training_labels, validation_images, validation_labels = \
        dataset.split_data(images, labels, num_validation_data, seed=args.seed)

    # replace the true labels with uniformly random ones
    training_labels = np.random.randint(
        0, config.num_classes, training_labels.size).astype(np.int32)
    validation_labels = np.random.randint(
        0, config.num_classes, validation_labels.size).astype(np.int32)

    # training
    progress = Progress()
    for epoch in range(1, max_epoch):
        progress.start_epoch(epoch, max_epoch)
        sum_loss = 0

        for t in range(num_trains_per_epoch):
            # sample from data distribution
            image_batch, label_batch = dataset.sample_data(
                training_images, training_labels, batchsize, binarize=False)
            image_batch = np.reshape(image_batch, (-1, 1, 28, 28))

            distribution = model.discriminate(image_batch, apply_softmax=False)
            loss = F.softmax_cross_entropy(distribution, model.to_variable(label_batch))
            sum_loss += float(loss.data)
            model.backprop(loss)

            if t % 10 == 0:
                progress.show(t, num_trains_per_epoch, {})

        model.save(args.model_dir)

        train_accuracy = compute_accuracy(training_images, training_labels)
        validation_accuracy = compute_accuracy(validation_images, validation_labels)
        progress.show(num_trains_per_epoch, num_trains_per_epoch, {
            "loss": sum_loss / num_trains_per_epoch,
            "accuracy (validation)": validation_accuracy,
            "accuracy (train)": train_accuracy,
        })

        # write accuracy to csv
        csv_results.append([epoch, train_accuracy, validation_accuracy,
                            progress.get_total_time()])
        data = pd.DataFrame(csv_results)
        data.columns = ["epoch", "train_accuracy", "validation_accuracy", "min"]
        data.to_csv("{}/result.csv".format(args.model_dir))
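# compute_accuracy is not shown above; the following is only a sketch of what
# it might look like, assuming it classifies images in batches with
# model.discriminate and returns the fraction of matched labels (the default
# batchsize of 128 is a placeholder):
def compute_accuracy(images, labels, batchsize=128):
    num_correct = 0
    for start in range(0, labels.size, batchsize):
        image_batch = np.reshape(images[start:start + batchsize], (-1, 1, 28, 28))
        distribution = model.discriminate(image_batch, apply_softmax=True)
        predictions = np.argmax(cuda.to_cpu(distribution.data), axis=1)
        num_correct += int(np.sum(predictions == labels[start:start + batchsize]))
    return float(num_correct) / labels.size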
import numpy as np
import matplotlib.pyplot as plt


def run(params):
    dataset = GgTraceDataSet('datasets/5.csv',
                             params['sliding_encoder'],
                             params['sliding_decoder'])
    params['n_dim'] = dataset.n_dim
    data = dataset.get_data()

    train, test = split_data(data, test_size=0.2)
    x_train = (train[0], train[1])
    y_train = train[2]
    x_test = (test[0], test[1])
    y_test = test[2]

    model_name = "sle({})_sld({})_ls({})_ac({})_opt({})_drop({})_rdrop({})_bs({})_lr({})_ct({})".format(
        params['sliding_encoder'], params['sliding_decoder'],
        params['layer_sizes_ed'], params['activation'], params['optimizer'],
        params['dropout'], params['recurrent_dropout'], params['batch_size'],
        params['learning_rate'], params['cell_type'])

    model = EDModel('logs/' + model_name)
    model.build_model(params)
    history = model.train(x_train, y_train, params['batch_size'],
                          params['epochs'], verbose=0)

    # plot training history
    plt.plot(history['loss'], label='loss')
    plt.plot(history['val_loss'], label='val_loss')
    plt.legend()
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.savefig('logs/' + model_name + '/history.png')
    plt.clf()

    # evaluate on the test split and plot predictions
    preds = model.predict(x_test)
    mae = np.mean(np.abs(y_test - preds)) * 56.863121  # scale the error back to original units
    with open('logs/mae.txt', 'a') as f:
        f.write("{},{:.5f}\n".format(model_name, mae))

    y_test = y_test[:, -1, 0]
    preds = preds[:, -1, 0]
    plt.plot(y_test, label='actual')
    plt.plot(preds, label='predict')
    plt.xlabel('time')
    plt.ylabel('value')
    plt.legend()
    plt.savefig('logs/' + model_name + '/predict.png')
    plt.clf()

    model.save()
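# run() reads the keys below from params; the original configuration is not
# shown, so every value here is illustrative (only the key names are taken
# from the function body):
params = {
    'sliding_encoder': 8,
    'sliding_decoder': 1,
    'layer_sizes_ed': [64],
    'activation': 'tanh',
    'optimizer': 'adam',
    'dropout': 0.05,
    'recurrent_dropout': 0.05,
    'batch_size': 8,
    'learning_rate': 0.001,
    'cell_type': 'lstm',
    'epochs': 200,
}
run(params)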
import torch
import hydra
from pathlib import Path
from torch.optim.lr_scheduler import StepLR

# RAdam, Checkpoint, build_model, get_loss_fn, split_wsi, split_data,
# get_loaders and train are provided by the surrounding project.


def main(cfg):
    torch.cuda.empty_cache()
    torch.manual_seed(cfg.param.seed)

    # Training settings
    cwd = Path(hydra.utils.get_original_cwd())
    wsi_dir = cwd / cfg.dir.wsi
    patch_dir = cwd / cfg.dir.patch
    ckpt = Checkpoint(cwd, cfg.gpus, cfg.dir.resume, cfg.dir.save_to,
                      cfg.log.save_model)
    device = torch.device(f"cuda:{cfg.gpus[0]}" if cfg.gpus[0] != -1 else "cpu")

    model = build_model(gpus=cfg.gpus)
    optimizer = RAdam(model.parameters(), lr=cfg.param.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=cfg.param.gamma)
    if cfg.dir.resume:
        model, optimizer, scheduler = ckpt.load_state(model, optimizer, scheduler)
    criterion = get_loss_fn()

    train_wsi, test_wsi = split_wsi(
        wsi_dir, ckpt.save_to, cwd,
        ratio=cfg.data.ratio, projects=cfg.data.projects,
        strategies=cfg.data.strategies, limit=cfg.data.limit)

    for epoch in range(ckpt.start_epoch, cfg.param.epochs + 1):
        # re-split the patches into chunks for this epoch
        split_data(patch_dir, ckpt.save_to, train_wsi, test_wsi,
                   cfg.data.chunks, epoch, cfg.dir.resume)
        for chunk in range(ckpt.start_chunk, cfg.data.chunks):
            data_loader = get_loaders(cfg.param.batch_size, ckpt.save_to,
                                      chunk, cfg.gpus)
            train(model, device, data_loader, optimizer, scheduler, criterion,
                  epoch, cfg.param.epochs, chunk, cfg.data.chunks, ckpt)
        ckpt.start_chunk = 0
        scheduler.step()
        ckpt.save(model, optimizer, scheduler, epoch, chunk, loss=False)

    ckpt.close_writer()
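# For reference, the config structure main() consumes, built here with
# OmegaConf (the backing store of a hydra config). Only the key paths are
# taken from the code above; every value is a placeholder.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "gpus": [0],
    "param": {"seed": 0, "lr": 1e-4, "gamma": 0.9, "epochs": 10, "batch_size": 32},
    "dir": {"wsi": "wsi", "patch": "patch", "resume": "", "save_to": "runs"},
    "log": {"save_model": True},
    "data": {"ratio": 0.8, "projects": [], "strategies": [], "chunks": 4, "limit": 0},
})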
def run_ed(params):
    dataset = GgTraceDataSet2('datasets/5.csv',
                              params['sliding_encoder'],
                              params['sliding_decoder'])
    params['n_dim'] = dataset.n_dim
    data = dataset.get_data_ed()

    train, test = split_data(data, test_size=0.2)
    x_train = train[:2]
    y_train = train[-1]
    x_test = test[:2]
    y_test = test[-1]

    model_name = "sle({})_sld({})_lsed({})_lsf({})_ac({})_opt({})_kp({})_drop({})_bs({})_lr({})_ct({})_pat({})".format(
        params['sliding_encoder'], params['sliding_decoder'],
        params['layer_sizes_ed'], params['layer_sizes_f'],
        params['activation'], params['optimizer'], params['keep_probs'],
        params['dropout'], params['batch_size'], params['learning_rate'],
        params['cell_type'], params['patience'])
    print('Running config: ' + model_name)

    model = Model('logs/' + model_name, params=params)
    model.train_ed(x_train, y_train, batch_size=params['batch_size'],
                   epochs=params['epochs'], verbose=1)

    # evaluate on the test split in the original scale
    preds = model.predict_ed(x_test)
    preds_inv = dataset.invert_transform(preds)
    y_test_inv = dataset.invert_transform(y_test)
    mae = np.mean(np.abs(np.subtract(preds_inv, y_test_inv)))
    with open('logs/mae.csv', 'a') as f:
        f.write("{};{:.5f}\n".format(model_name, mae))

    preds_inv = preds_inv[:, 0, 0]
    y_test_inv = y_test_inv[:, -1, 0]
    plt.plot(y_test_inv, label='actual', color='#fc6b00', linestyle='solid')
    plt.plot(preds_inv, label='predict', color='blue', linestyle='solid')
    plt.xlabel('time')
    plt.ylabel('value')
    plt.legend()
    plt.title('mae={:.2f}'.format(mae))
    plt.savefig('logs/' + str(mae) + "_" + model_name + '_predict_ed.png')
    plt.clf()
params = {
    'layer_sizes_ann': [64],
    'activation': 'sigmoid',
    'optimizer': 'rmsprop',
    'dropout': 0.05,
    'batch_size': 8,
    'learning_rate': 0.001,
    'epochs': 200,
    'patience': 5,
}

dataset = GgTraceDataSet2('datasets/5.csv', 8, 1)
params['n_dim'] = dataset.n_dim
data = dataset.get_data_forecast()

train, test = split_data(data, test_size=0.2)
x_train = train[0]
y_train = train[1].reshape((-1, 1))
x_test = test[0]
y_test = test[1].reshape((-1, 1))
print(x_train.shape, y_train.shape)

model = AnnModel('logs/test_ann', 'logs/test')
model.build_model(params)
model.train(x_train, y_train, batch_size=params['batch_size'],
            epochs=params['epochs'], verbose=1)
import time
import tensorflow as tf  # this snippet uses the TF 1.x API (tf.train.AdamOptimizer)

# Y, Y_, lr, cross_entropy, train_file and k_fold are defined earlier in the
# script; load_data, split_data, cv_split and Dataset are project helpers.
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
print('Model built!')

# per-fold bookkeeping
test_it = []
train_it = []
test_acc = []
train_acc = []
train_loss = []

# load the data, hold out 10% for testing, then build k CV folds
dataX, dataY = load_data(train_file)
(trainX, trainY), (testX, testY) = split_data(dataX, dataY, [9, 1])
sets = cv_split(trainX, trainY, k_fold)
testset = Dataset(testX, testY)
test_batch_X, test_batch_Y = testset.minibatch(len(testY))
test_pred = []
print("Data loaded!")

t_start = time.time()
for k in range(k_fold):
    print("%d-th in %d-fold:" % (k + 1, k_fold))
    test_it.append([])
    train_it.append([])
import tornado.ioloop
import tornado.web
from transformers import BertForTokenClassification

from dataset import (import_data, split_data, convert_dataframe_to_data,
                     tags_and_tag_to_idx, tokenizer)
from model import one_sentence_prediction_bert

path_to_dataset = '/home/andrei/Documents/ML/ner.csv'
path_to_model = '/home/andrei/Documents/ML/bert_uncased/'

# rebuild the tag vocabulary from the dataset splits
data = import_data(path_to_dataset)
training, testing = split_data(data)
train_data = convert_dataframe_to_data(training)
test_data = convert_dataframe_to_data(testing)
_, tag_to_idx = tags_and_tag_to_idx(train_data, test_data)
tag_values = list(tag_to_idx.keys())

# load the fine-tuned model
bert_model_loaded = BertForTokenClassification.from_pretrained(path_to_model)


class Ner(tornado.web.RequestHandler):
    def get(self):
        form = """<form method="post">
                  <input type="text" name="sentence"/>
                  <input type="submit"/>
                  </form>"""
        self.write(form)

    def post(self):
        sentence = self.get_argument('sentence')
        prediction = one_sentence_prediction_bert(sentence, bert_model_loaded,
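# The snippet above breaks off inside Ner.post; the application wiring below
# is a minimal sketch of how such a handler is typically served with tornado.
# The "/" route and port 8888 are assumptions, not taken from the source.
def make_app():
    return tornado.web.Application([(r"/", Ner)])


if __name__ == "__main__":
    app = make_app()
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()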
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyRegressor

# dataset, models, plot_prediction_perSerie and Visualize_prediction are
# project modules/helpers defined elsewhere.


def main():
    # ---------- Directories & User inputs --------------
    # Location of data folder
    data_dir = './data/'
    FLAG_train = (len(sys.argv) > 1 and sys.argv[1] == '--train')

    ##########################################
    ######## Load and preprocess data ########
    ##########################################
    # Read and preprocess data from CSV
    data = dataset.read_and_preprocess_data(data_dir=data_dir, file_name='training.csv')
    print(data.head(), '\n', data.tail(), '\n', data.info())

    plt.figure()
    data.groupby(['serieNames'])['sales'].plot()
    plt.legend(loc="best")

    # Split data/labels into train/test set
    X_train, y_train, X_test, y_test = dataset.split_data(df=data, test_ratio=0.1)
    y_train_serieNames, y_test_serieNames = X_train['serieNames'], X_test['serieNames']

    # Data normalization
    sc = MinMaxScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    ##########################################
    ######## Train regressor #################
    ##########################################
    if FLAG_train:
        models.train_regressor_models(X_train, y_train, n_splits=3)
    else:
        # Load the pre-trained regressors with tuned parameters
        regressor_ridge = models.load_regressor_Ridge(X_train, y_train)  # linear ridge regression
        regressor_rf = models.load_regressor_RF(X_train, y_train)        # random forest regression
        regressor_svr = models.load_regressor_SVR(X_train, y_train)      # support vector regression

        # Dummy regressor, plus a "previous observation" baseline built by
        # shifting the actuals within each series
        dummy = DummyRegressor(strategy='mean')
        dummy.fit(X_train, y_train)
        y_hat_dummy = pd.DataFrame({
            'y_hat_dummy': y_test,
            'serieNames': y_test_serieNames
        })
        y_hat_dummy = y_hat_dummy.groupby(['serieNames'])['y_hat_dummy'].shift()
        y_hat_dummy = y_hat_dummy.fillna(method='bfill')
        print('RMSE dummy last observation %.5f' % models.rmse(y_test, y_hat_dummy))

        ##########################################
        ######## Compare model performance #######
        ##########################################
        regressor_models = {
            'Baseline Previous Mean': dummy,
            'Ridge Regression': regressor_ridge,
            'Support Vector Regression': regressor_svr,
            'Random Forest Regression': regressor_rf
        }

        # Test errors: test the models with tuned parameters
        for name, regressor_model in sorted(regressor_models.items()):
            y_hat_regressor = regressor_model.predict(X_test)
            RMSE_regressor = models.rmse(y_test, y_hat_regressor)
            print('RMSE %s : %.5f' % (name, RMSE_regressor))
            plt.figure()
            plt.ylabel("RMSE")
            plt.title('RMSE %s : %.5f' % (name, RMSE_regressor))
            plot_prediction_perSerie(y_true=y_test, y_pred=y_hat_regressor,
                                     y_serieNames=y_test_serieNames)

        plt.figure()
        plt.ylabel("RMSE")
        plt.title('RMSE dummy last observation %.5f' % models.rmse(y_test, y_hat_dummy))
        plot_prediction_perSerie(y_true=y_test, y_pred=y_hat_dummy,
                                 y_serieNames=y_test_serieNames)

        # Generalization errors: cross-validation score
        plt.figure()
        plt.title('Generalization errors (RMSE)')
        n_splits = 10
        scoring = 'neg_mean_squared_error'
        for name, regressor_model in sorted(regressor_models.items()):
            test_error = models.get_regressor_cross_validate_score(
                regressor_model, X_test, y_test, scoring=scoring, n_splits=n_splits)
            test_rmse = np.array([np.sqrt(-e) for e in test_error])
            plt.plot(test_rmse, 'o-',
                     label=name + ' : %0.2f (+/- %0.2f)' % (test_rmse.mean(),
                                                            test_rmse.std() / 2))
        plt.xlabel("Fold number")
        plt.ylabel("RMSE")
        plt.legend(loc="best")

        ##########################################
        ######## Make predictions ################
        ##########################################
        file_name = 'test.csv'
        new_samples = dataset.read_and_preprocess_data(data_dir=data_dir,
                                                       file_name=file_name)
        X_new = new_samples.values
        X_new = sc.transform(X_new)

        # Predict with the model trained on 90% of the data
        y_new_hat = regressor_rf.predict(X_new)

        # Refit on all available data and predict again
        X_all = np.concatenate((X_train, X_test), axis=0)
        y_all = np.concatenate((y_train, y_test), axis=0)
        regressor_rf.fit(X_all, y_all)
        y_new_hat_all = regressor_rf.predict(X_new)

        # Plot the prediction results
        plt.figure()
        df_new = pd.DataFrame({
            'sales_pred_90': y_new_hat,
            'sales_pred_100': y_new_hat_all,
            'serieNames': new_samples['serieNames']
        })
        df_new.groupby(['serieNames'])['sales_pred_90'].plot(label='sales_pred_90%')
        df_new.groupby(['serieNames'])['sales_pred_100'].plot(style='o--',
                                                              label='sales_pred_100%')
        plt.ylabel("sales")
        plt.legend(loc="best")

        ##########################################
        ######## Save prediction results #########
        ##########################################
        # Save the prediction results
        df_new = df_new.reset_index()
        df_new.to_csv('./results/prediction.csv', index=False)

        # Write to the test.csv format
        df_test = pd.read_csv(data_dir + 'test.csv')
        df_test['sales'] = y_new_hat_all
        df_test.to_csv('./results/test_prediction.csv', index=False)

        plt.figure()
        df_test = df_test.set_index(['TSDate'])
        df_test.groupby(['serieNames'])['sales'].plot(style='*-')
        plt.ylabel("sales")
        plt.legend(loc="best")

        # Visualize the prediction
        Visualize_prediction(data_dir)
        plt.legend(loc="best")
        plt.show()
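# models.rmse is project code not shown here; a plausible implementation,
# assuming it is the usual root-mean-square error:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))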
""" Pipeline for training and testing CNN model for prediction TSS with dna_seq data from Oriza Sativa """ import dataset import models import metrics import numpy as np import time t0 = time.time() # !!! 1. Dataset from dna-seq data and TSS list file (Chromosome, Strand, Locus_ID, TSS_position) # !!! 1.1 Read DNA-seq data and TSS positions file TSS = dataset.tss_read('./data/TSS_MSU.txt', sep=',') # read TSS_MSU file split = dataset.split_data( TSS) # split given chromosomes for train and test sets dataset.sequences(tss_array=TSS, dna_seq='./data/all.con', split=split, new_path='./data/', nucleotides=1000) # read all.con file (DNA-seq data) and save 4 new files: train_seq.fa, train_tss_pos.fa, test_seq.fa, test_tss_pos.fa dataset.beyond_genes(tss_array=TSS, dna_seq='./data/all.con', split=split, examples_train=15000, examples_test=1000, new_path='./data/', nucleotides=512,
import os
import pandas as pd
from torch.utils.data import DataLoader

params['dataset'] = os.path.basename(os.path.normpath(args.data))
run_name = 'q_{0[dataset]}_lr{0[lr]}_e{0[epochs]}_b{0[batch]}'.format(params)
run_dir = os.path.join(args.runs, run_name)
ckpt_dir = os.path.join(run_dir, 'ckpt')
if not os.path.exists(run_dir):
    os.makedirs(run_dir)
    os.makedirs(ckpt_dir)

log_file = os.path.join(run_dir, 'log.csv')
param_file = os.path.join(run_dir, 'params.csv')
pd.DataFrame(params, index=[0]).to_csv(param_file, index=False)

### Load Data
train_data, val_data, test_data = split_data(args.data, group=args.group,
                                             bPrecompGroup=args.groupprecomp)
train_data.to_csv(os.path.join(run_dir, "train.csv"), ',')
val_data.to_csv(os.path.join(run_dir, "val.csv"), ',')
test_data.to_csv(os.path.join(run_dir, "test.csv"), ',')

# create the loader for the training set
train_data = HdrVdpDataset(train_data, args.data, args.group,
                           bPrecompGroup=args.groupprecomp)
train_loader = DataLoader(train_data, shuffle=True, batch_size=args.batch,
                          num_workers=8, pin_memory=True)

# create the loader for the validation set
val_data = HdrVdpDataset(val_data, args.data, args.group,
                         bPrecompGroup=args.groupprecomp)
val_loader = DataLoader(val_data, shuffle=False, batch_size=args.batch,
                        num_workers=8, pin_memory=True)

# create the loader for the testing set
test_data = HdrVdpDataset(test_data, args.data, args.group,
                          bPrecompGroup=args.groupprecomp)
test_loader = DataLoader(test_data, shuffle=False, batch_size=args.batch,
                         num_workers=8, pin_memory=True)
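# The argparse setup is not shown above; a sketch of the flags this block
# relies on, with names inferred from the attribute accesses (defaults are
# placeholders). params is likewise assumed to already hold 'lr', 'epochs'
# and 'batch' for the run_name template.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data', required=True, help='path to the dataset root')
parser.add_argument('--runs', default='runs', help='output directory for runs')
parser.add_argument('--group', default=None, help='grouping passed to split_data / HdrVdpDataset')
parser.add_argument('--groupprecomp', action='store_true', help='use precomputed groups')
parser.add_argument('--batch', type=int, default=32, help='batch size')
args = parser.parse_args()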