def train(epochs, layer, lr, lambd):
    model = FNN(feature.shape[2], 1, layer, 128)
    model.apply(weight_init)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=lambd)
    if args.cuda:
        model = model.cuda()
    print("Training FNN for %d layers, %f learning rate, %f lambda" % (layer, lr, lambd))
    for epoch in range(epochs):
        train_epoch(epoch, model, optimizer, lambd)
    # Run the model on each field slice and stack the per-field outputs.
    output = model(feature[:, 0, :]).unsqueeze(1)
    for i in range(1, feature.shape[1]):
        output = torch.cat((output, model(feature[:, i, :]).unsqueeze(1)), 1)
    # t_weight = torch.stack((weight, weight), 2)
    t_weight = weight
    output = output.squeeze()
    # Weighted sum of the per-field predictions over the field dimension.
    output = torch.mul(t_weight, output)
    output = torch.sum(output, 1)
    loss_train = Loss(output[idx_train], out[idx_train])
    loss_val = Loss(output[idx_val], out[idx_val])
    print("Result for %d layers, %f learning rate, %f lambda" % (layer, lr, lambd))
    print('loss_val: {:.4f}'.format(loss_val.item()))
    return model, output, loss_val
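# The train() functions in this section rely on a weight_init callback passed
# to model.apply() that is not shown here. A minimal sketch of such an
# initializer, assuming Xavier initialization for nn.Linear layers (the
# function body and scheme are assumptions, not taken from this code):
import torch.nn as nn

def weight_init(m):
    # Initialize only Linear layers; other module types are left untouched.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)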
def main(argv=None):  # pylint: disable=unused-argument
    algo = FLAGS.model
    eprint(algo)
    field_sizes = None
    if algo == 'fmuv':
        params = {
            'data_dir': FLAGS.data_dir,
            'num_epochs': FLAGS.num_epochs,
            'batch_size': FLAGS.batch_size,
            'input_dim': FLAGS.input_dim,
            'factor_order': 12,
            'l2_w': 0.001,
        }
        eprint(params)
        model = FMUV(**params)
    elif algo == 'fnn':
        field_sizes = [FLAGS.input_dim] * FLAGS.num_field
        params = {
            'data_dir': FLAGS.data_dir,
            'batch_size': FLAGS.batch_size,
            'num_epochs': FLAGS.num_epochs,
            'input_dim': FLAGS.input_dim,
            'layer_sizes': [field_sizes, 12, 200, 1],
            'layer_acts': ['none', 'tanh', 'none'],
            'layer_l2': [0, 0, 0],
            'l2_w': 0.001,
        }
        eprint(params)
        model = FNN(**params)
    elif algo == 'pnn1':
        field_sizes = [FLAGS.input_dim] * FLAGS.num_field
        params = {
            'data_dir': FLAGS.data_dir,
            'batch_size': FLAGS.batch_size,
            'num_epochs': FLAGS.num_epochs,
            'input_dim': FLAGS.input_dim,
            'layer_sizes': [field_sizes, 12, 1],
            'layer_acts': ['tanh', 'none'],
            'layer_l2': [0, 0],
            'kernel_l2': 0,
            'l2_w': 0.001,
        }
        eprint(params)
        model = PNN1(**params)
    X, y, B = worker_input(field_sizes=field_sizes)
    eval_once(model, X, y, B)
def train(epochs, layer, lr, lambd, idx_train, idx_val):
    model = FNN(feature.shape[1], out.shape[1], layer, 128)
    model.apply(weight_init)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=lambd)
    if args.cuda:
        model = model.cuda()
    print("Training FNN for %d layers, %f learning rate, %f lambda" % (layer, lr, lambd))
    for epoch in range(epochs):
        train_epoch(epoch, model, optimizer, lambd, idx_train, idx_val)
    output = model(feature)
    loss_val = F.mse_loss(output[idx_val], out[idx_val])
    print("Result for %d layers, %f learning rate, %f lambda" % (layer, lr, lambd))
    print('loss_val: {:.4f}'.format(loss_val.item()))
    return output, loss_val
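# A minimal sketch of how this train() might be swept over hyperparameters to
# pick the best validation loss; the grids and epoch count below are
# assumptions for illustration only.
best_val, best_output = float('inf'), None
for layer in (2, 3, 4):
    for lr in (1e-3, 1e-2):
        for lambd in (0.0, 1e-4, 1e-3):
            output, loss_val = train(200, layer, lr, lambd, idx_train, idx_val)
            if loss_val.item() < best_val:
                best_val, best_output = loss_val.item(), output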
        'l2_v': 0,
    }
    model = FM(**fm_params)
elif algo == 'fnn':
    fnn_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'layer_l2': [0, 0],
        'random_seed': 0
    }
    model = FNN(**fnn_params)
elif algo == 'ccpm':
    ccpm_params = {
        'layer_sizes': [field_sizes, 10, 5, 3],
        'layer_acts': ['tanh', 'tanh', 'none'],
        'drop_out': [0, 0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'random_seed': 0
    }
    model = CCPM(**ccpm_params)
elif algo == 'pnn1':
    pnn1_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
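# A minimal sketch of how these param dicts are typically consumed once the
# selected model is built; utils.slice and the model.run(fetches, X, y) helper
# are assumptions about the surrounding codebase, not shown in this fragment.
import numpy as np

def run_training(model, train_data, train_size, num_round=10, batch_size=256):
    for _ in range(num_round):
        losses = []
        for j in range(int(train_size / batch_size) + 1):
            X_j, y_j = utils.slice(train_data, j * batch_size, batch_size)
            _, loss = model.run([model.optimizer, model.loss], X_j, y_j)
            losses.append(loss)
        print('avg loss: %f' % np.mean(losses))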
}
params_op = {
    'lr': float(args.learning_rate),
    'momentum': float(args.momentum),
    'weight_decay': float(args.weight_decay)
}
path = args.path

training_set = SignalDataset(path, train=True)
train_loader = torch.utils.data.DataLoader(training_set, **params_dataloader)
num_classes = training_set.num_classes
test_set = SignalDataset(path, train=False)
test_loader = torch.utils.data.DataLoader(test_set, **params_dataloader)

model = FNN(**params_model, output_size=num_classes).to(device=device)
nll_loss = nn.NLLLoss()
op = torch.optim.SGD(model.parameters(), **params_op)

if args.resume:
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']
        model.load_state_dict(checkpoint['state_dict'])
        # The optimizer here is named `op`, not `optimizer`.
        op.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(args.resume))
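# The resume branch above expects checkpoints with 'epoch', 'best_acc1',
# 'state_dict' and 'optimizer' keys. A matching save helper could look like
# this sketch (the function name and file names are assumptions):
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

# e.g. once per epoch:
# save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(),
#                  'best_acc1': best_acc1, 'optimizer': op.state_dict()}, is_best)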
def main_func(activation, data_path, save_path, batch_size, epochs, layer_sizes,
              mi_methods, test_size, num_bins=[30], num_runs=1, try_gpu=False):
    check_for_data(save_path)
    if try_gpu:
        cuda = torch.cuda.is_available()
        device = torch.device("cuda" if cuda else "cpu")
    else:
        device = torch.device("cpu")
    print("Using " + str(device))

    loss_function = nn.CrossEntropyLoss()  # Only one supported as of now
    max_values = []
    for i in tqdm.tqdm(range(args.start_from, num_runs)):
        # Seed all RNGs so each run is reproducible.
        torch.manual_seed(i)
        torch.cuda.manual_seed(i)
        np.random.seed(i)
        train_loader, test_loader, act_full_loader = prepare_data(data_path, test_size, i, batch_size)

        model = FNN(layer_sizes, activation=activation, seed=i).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.0004)
        tr = Trainer(loss_function, epochs, model, optimizer, device)
        print("Start Training...")
        tr.train(train_loader, test_loader, act_full_loader)

        if args.save_train_error:
            print("Saving train and test error...")
            with open(save_path + '/training_history_run_{}_{}.pickle'.format(i, batch_size), 'wb') as f:
                pickle.dump([tr.error_train, tr.error_test], f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(save_path + '/loss_run_{}_{}.pickle'.format(i, batch_size), 'wb') as f:
                pickle.dump([tr.train_loss, tr.val_loss], f, protocol=pickle.HIGHEST_PROTOCOL)

        if args.save_max_vals:
            print("Saving max activation values...")
            with open(save_path + '/max_values{}_{}.pickle'.format(i, batch_size), 'wb') as f:
                print(np.array(tr.max_value_layers_mi).max())
                pickle.dump(tr.max_value_layers_mi, f, protocol=pickle.HIGHEST_PROTOCOL)

        if args.save_mutual_information:
            for j in num_bins:
                print("Saving mutual information with {} bins...".format(j))
                if "variable" in mi_methods:
                    # Derive the bin count from the largest activation instead of
                    # rebinding num_bins, which would break the loop on later runs.
                    max_value = info_utils.get_max_value(tr.hidden_activations)
                    variable_bins = int(max_value * 15)
                    mutual_inf = MI(tr.hidden_activations, act_full_loader,
                                    act=activation, num_of_bins=variable_bins)
                    MI_XH, MI_YH = mutual_inf.get_mi(method="fixed")
                    with open(save_path + '/MI_XH_MI_YH_run_{}_{}_{}variable.pickle'.format(i, batch_size, j), 'wb') as f:
                        pickle.dump([MI_XH, MI_YH], f, protocol=pickle.HIGHEST_PROTOCOL)
                if "fixed" in mi_methods:
                    mutual_inf = MI(tr.hidden_activations, act_full_loader,
                                    act=activation, num_of_bins=j)
                    MI_XH, MI_YH = mutual_inf.get_mi(method="fixed")
                    with open(save_path + '/MI_XH_MI_YH_run_{}_{}_{}bins.pickle'.format(i, batch_size, j), 'wb') as f:
                        pickle.dump([MI_XH, MI_YH], f, protocol=pickle.HIGHEST_PROTOCOL)
                if "adaptive" in mi_methods:
                    mutual_inf = MI(tr.hidden_activations, act_full_loader,
                                    act=activation, num_of_bins=j)
                    MI_XH, MI_YH = mutual_inf.get_mi(method="adaptive")
                    with open(save_path + '/MI_XH_MI_YH_run_{}_{}_{}adaptive.pickle'.format(i, batch_size, j), 'wb') as f:
                        pickle.dump([MI_XH, MI_YH], f, protocol=pickle.HIGHEST_PROTOCOL)

        minv, maxv = info_utils.get_min_max_vals(activation, tr.hidden_activations)
        max_values.append(maxv)
        print(max_values)

        # Delete everything from memory explicitly: Python keeps these objects
        # alive until they are overwritten on the next iteration, which fills up RAM.
        del model
        del tr
        if args.save_mutual_information:
            del mutual_inf
            del MI_XH
            del MI_YH
        del train_loader
        del test_loader
        del act_full_loader
    print("Done running...")
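# Since every result above is dumped as a two-element list, reading one back
# is symmetric; the run index and bin count below are placeholders:
import pickle

with open(save_path + '/MI_XH_MI_YH_run_{}_{}_{}bins.pickle'.format(0, batch_size, 30), 'rb') as f:
    MI_XH, MI_YH = pickle.load(f)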
from models import FactorizationMachines, FNN

dao = learning_dao()
dao.build()
X_train, X_test, y_train, y_test = dao.fetch_dataset()
print(X_train)
print(y_train)
features_info = dao.features_info

auc = tf.keras.metrics.AUC(num_thresholds=1000)
optimizer = tf.keras.optimizers.Adam(lr=0.01, decay=0.1)

# Pre-train the FM, then pass it to the FNN so the network starts from the
# learned factorization-machine embeddings.
fm_model = FactorizationMachines(features_info)
fm_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[auc])
fm_model.fit(x=X_train, y=y_train, epochs=100, batch_size=100000, validation_split=0.2)
fm_model.evaluate(x=X_test, y=y_test)

model = FNN(fm_model)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[auc])
model.fit(x=X_train, y=y_train, epochs=100, batch_size=100000, validation_split=0.2)
model.evaluate(x=X_test, y=y_test)
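# FactorizationMachines and FNN are used through the tf.keras Model API
# (compile/fit/evaluate), so inference should work with the standard predict()
# call; the 0.5 threshold is an assumption for a binary-crossentropy model:
y_prob = model.predict(X_test, batch_size=100000)
y_pred = (y_prob > 0.5).astype('int32')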
def worker_process(cluster, server):
    # Assign ops to the local worker by default.
    with tf.device(
        tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
        ps_num = cluster.num_tasks('ps')
        worker_num = cluster.num_tasks('worker')
        algo = FLAGS.model
        eprint(algo)
        field_sizes = None
        if algo == 'fmuv':
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'factor_order': 12,
                'l2_w': 0.001,
            }
            eprint(params)
            model = FMUV(**params)
        elif algo == 'fnn':
            field_sizes = [FLAGS.input_dim] * FLAGS.num_field
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'layer_sizes': [field_sizes, 12, 200, 1],
                'layer_acts': ['none', 'tanh', 'none'],
                'drop_out': [0, 0, 0],
                'layer_l2': [0, 0, 0],
                'l2_w': 0.001,
            }
            eprint(params)
            model = FNN(**params)
        elif algo == 'pnn1':
            field_sizes = [FLAGS.input_dim] * FLAGS.num_field
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'layer_sizes': [field_sizes, 12, 1],
                'layer_acts': ['tanh', 'none'],
                'layer_l2': [0, 0],
                'kernel_l2': 0,
                'l2_w': 0.001,
            }
            eprint(params)
            model = PNN1(**params)

    worker_device = "/job:worker/task:%d" % FLAGS.task_index
    with tf.device(worker_device):
        X, y, B = worker_input(field_sizes=field_sizes)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    # summary_writer = tf.summary.FileWriter(FLAGS.log_dir, model.graph)
    saver = tf.train.Saver(var_list=model.vars, max_to_keep=FLAGS.max_models_to_keep)
    save_interval = 100 if FLAGS.model == "fmuv" else 600

    def load_pretrained_model(sess):
        restore_file = tf.train.latest_checkpoint(FLAGS.resume_dir)
        eprint('restore:', restore_file)
        saver.restore(sess, restore_file)

    load_model_function = load_pretrained_model if FLAGS.resume_dir != '' else None
    is_chief = (FLAGS.task_index == 0)
    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             saver=saver,
                             init_fn=load_model_function,
                             global_step=model.global_step,
                             save_model_secs=save_interval)

    # Retry the managed session on transient cluster errors; a run that lasted
    # longer than 300s is treated as finished rather than retried.
    retry_times = 0
    N_failed = 10
    while retry_times < N_failed:
        try:
            eprint('retry_times = %d' % retry_times)
            startt = time.time()
            with sv.managed_session(master=server.target) as sess:
                eprint('------ start ------', datetime.now())
                if is_chief:
                    time.sleep(10)
                run_while_batch(sv, sess, model, X, y, B)
            sv.stop()
            eprint("------ end sv stop:", datetime.now())
            endt = time.time()
            if endt - startt > 300:
                retry_times = N_failed
            else:
                time.sleep(10)
                retry_times += 1
        except:
            traceback.print_exc()
            retry_times += 1
            time.sleep(10)
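# worker_process() assumes the usual TF1 ClusterSpec/Server wiring. A minimal,
# hypothetical driver sketch (the ps_hosts/worker_hosts/job_name flags are
# assumptions; only task_index appears in the code above):
def driver():
    cluster = tf.train.ClusterSpec({'ps': FLAGS.ps_hosts.split(','),
                                    'worker': FLAGS.worker_hosts.split(',')})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == 'ps':
        server.join()  # parameter servers only serve variables
    else:
        worker_process(cluster, server)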