import os
from os.path import join

import numpy as np
import tensorflow as tf  # TF 1.x API

# feature_dim, models_dir, models_file_prefix, mode and pretrain_dataset are
# module-level globals assumed to be defined elsewhere in the project, as are
# train_operation and get_train_data.

def train_main(config):
    bsz = config.batch_size
    tf.set_random_seed(config.seed)

    X = tf.placeholder(tf.float32, shape=(bsz, config.input_steps, feature_dim))
    Y_label = tf.placeholder(tf.int32, [None, config.num_classes])
    Y_bbox = tf.placeholder(tf.float32, [None, 3])
    Index = tf.placeholder(tf.int32, [bsz + 1])
    LR = tf.placeholder(tf.float32)

    optimizer, loss, trainable_variables = \
        train_operation(X, Y_label, Y_bbox, Index, LR, config)
    model_saver = tf.train.Saver(var_list=trainable_variables, max_to_keep=2)

    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=False))
    tf.global_variables_initializer().run()

    # initialize parameters or restore from a previous model
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    if os.listdir(models_dir) == [] or config.initialize:
        init_epoch = 0
        print("Initializing Network")
    else:
        init_epoch = int(config.steps)
        restore_checkpoint_file = join(models_dir,
                                       'model-ep-' + str(config.steps - 1))
        model_saver.restore(sess, restore_checkpoint_file)

    batch_train_dataX, batch_train_gt_label, batch_train_gt_info, batch_train_index = \
        get_train_data(config, mode, pretrain_dataset, True)
    num_batch_train = len(batch_train_dataX)

    for epoch in range(init_epoch, config.training_epochs):
        loss_info = []
        for idx in range(num_batch_train):
            feed_dict = {
                X: batch_train_dataX[idx],
                Y_label: batch_train_gt_label[idx],
                Y_bbox: batch_train_gt_info[idx],
                Index: batch_train_index[idx],
                LR: config.learning_rates[epoch]
            }
            _, out_loss = sess.run([optimizer, loss], feed_dict=feed_dict)
            loss_info.append(out_loss)
        print("Training epoch ", epoch, " loss: ", np.mean(loss_info))
        # keep checkpoints only for the last two epochs
        if epoch >= config.training_epochs - 2:
            model_saver.save(sess, models_file_prefix, global_step=epoch)
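# The loop above delegates graph construction to `train_operation`, which must
# return (train_op, loss, trainable_variables). A minimal sketch of that
# contract, assuming a plain softmax classification head; `build_network` is a
# hypothetical stand-in, and the real graph presumably also consumes Y_bbox
# and Index for a localization term:
def train_operation_sketch(X, Y_label, Y_bbox, Index, LR, config):
    logits = build_network(X, config)  # hypothetical feature-to-logits model
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.cast(Y_label, tf.float32), logits=logits))
    trainable_variables = tf.trainable_variables()
    train_op = tf.train.AdamOptimizer(LR).minimize(loss, var_list=trainable_variables)
    return train_op, loss, trainable_variables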
import os
import pickle
from itertools import chain

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm, trange
from transformers import AdamW, get_linear_schedule_with_warmup

# load_model_and_tokenizer, get_train_data, count_labels, UdaDataset and the
# uppercase defaults (TRAIN_BATCH_SIZE, LEARNING_RATE,
# GRADIENT_ACCUMULATION_STEPS, NUM_TRAIN_EPOCHS, WARMUP_PROPORTION) are
# project-level names assumed to be defined elsewhere in this module.

def train(args):
    global TRAIN_BATCH_SIZE, LEARNING_RATE
    TRAIN_BATCH_SIZE = args.batch if args.batch else TRAIN_BATCH_SIZE
    print("train batch size", TRAIN_BATCH_SIZE)
    LEARNING_RATE = args.lr if args.lr else LEARNING_RATE
    print("learning_rate", LEARNING_RATE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print(device, n_gpu)

    model, tokenizer = load_model_and_tokenizer(lm=args.lm, model_dir=args.model_dir)

    # pad args.num_examples with None so it lines up with args.data
    all_exs = []
    all_data = []
    if not args.num_examples:
        args.num_examples = [None] * len(args.data)
    else:
        args.num_examples += [None] * (len(args.data) - len(args.num_examples))
    print(args.data, args.num_examples)
    for data_source, num_exs in zip(args.data, args.num_examples):
        exs, data = get_train_data(data_source, tokenizer, lm=args.lm,
                                   num_examples=num_exs, mask=args.mask,
                                   distant_source=args.distant_source)
        all_exs.append(exs)
        all_data.append(data)

    '''
    if args.unsup:
        if args.unsup.endswith(".pkl"):
            inputs = pickle.load(open(args.unsup, 'rb'))
            u_exs = inputs['exs']
            u_data = inputs['old_data']
            u_new_data = inputs['new_data']
        else:
            assert args.unsup in set(["matres", "udst"])
            u_exs, u_data = get_train_data(args.unsup, lm=args.lm,
                                           num_examples=args.unsup_num_examples,
                                           mask=args.mask)
        print(len(u_exs), "unsup examples loaded")
        UNSUP_BATCH_SIZE = args.unsup_batch if args.unsup_batch else int(TRAIN_BATCH_SIZE / 2)
        uda_dataset = UdaDataset(u_exs, UNSUP_BATCH_SIZE)
    '''

    OUTPUT_DIR = args.output_dir if args.output_dir else "models/scratch/"
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    if len(all_exs) == 0:
        print("no dataset specified")
    elif len(all_exs) == 1:
        print("one dataset specified")
        exs = all_exs[0]
        data = all_data[0]
    else:
        # concatenate the tensors of all datasets field by field
        print("using multiple data sources")
        inputs = []
        for i in range(len(all_data[0].tensors)):
            inputs.append(torch.cat([d.tensors[i] for d in all_data]))
        exs = list(chain(*all_exs))
        data = TensorDataset(*inputs)

    data_sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=data_sampler, batch_size=TRAIN_BATCH_SIZE)
    print(len(data), len(exs), "examples loaded")

    num_train_optimization_steps = int(
        len(data) / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS
    print(num_train_optimization_steps, "optimization steps")
    num_warmup_steps = WARMUP_PROPORTION * num_train_optimization_steps

    model.to(device)

    # Prepare optimizer. Hack: remove the pooler, which is not used and would
    # otherwise produce None grads that break apex.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    if args.serialize:
        inputs = {"exs": exs, "data": data}
        pickle.dump(inputs, open(OUTPUT_DIR + "inputs.pkl", 'wb'))

    logfile = open(OUTPUT_DIR + "/log.txt", "w+")
    print(args, file=logfile)
    print(len(data), len(exs), "examples loaded", file=logfile)
    count_labels(exs, file=logfile)
    count_labels(exs)
    print("learning_rate", LEARNING_RATE, file=logfile)

    global_step = 0
    num_epochs = args.epochs if args.epochs is not None else int(NUM_TRAIN_EPOCHS)
    if num_epochs == 0:
        exit()
    model.train()
    exs_cpy = exs
    for ep in trange(num_epochs, desc="Epoch"):
        last_loss_kldiv = 0
        for step, batch in enumerate(tqdm(dataloader, desc="Iteration " + str(ep),
                                          disable=args.disable_tqdm)):
            bbatch = tuple(t.to(device) for t in batch)
            loss, _, _ = model(*bbatch)
            '''
            if args.unsup:
                loss_kldiv = unsup_loss(model)
                if loss_kldiv:
                    last_loss_kldiv = loss_kldiv.item()
                    loss += loss_kldiv
            '''
            loss.backward()
            if step % 100 == 0:
                print("Loss: %.3f at step %d" % (loss.item(), step), file=logfile)
                # if args.unsup and last_loss_kldiv:
                #     print("Unsup Loss: %.3f at step %d" % (last_loss_kldiv, step), file=logfile)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

        # Save a trained model, configuration and tokenizer after each epoch
        model_output_dir = OUTPUT_DIR + "/output_" + str(ep) + "/"
        if not os.path.exists(model_output_dir):
            os.makedirs(model_output_dir)
        model.save_pretrained(model_output_dir)
        tokenizer.save_pretrained(model_output_dir)
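# A minimal sketch of the CLI wiring this trainer appears to expect. The flag
# names mirror the attributes read above (args.batch, args.lr, args.data, ...);
# the types and defaults are assumptions, not taken from the original script.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch", type=int)
    parser.add_argument("--lr", type=float)
    parser.add_argument("--lm", type=str)
    parser.add_argument("--model_dir", type=str)
    parser.add_argument("--data", nargs="+", default=[])
    parser.add_argument("--num_examples", nargs="+", type=int)
    parser.add_argument("--mask", action="store_true")
    parser.add_argument("--distant_source", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--serialize", action="store_true")
    parser.add_argument("--epochs", type=int)
    parser.add_argument("--disable_tqdm", action="store_true")
    train(parser.parse_args())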
import pickle
import time

import tensorflow as tf  # TF 1.x API
import keras.backend.tensorflow_backend as KTF
from keras.callbacks import History, LearningRateScheduler
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical

# ShuffleNet_V2, get_train_data, get_valid_data and lr_scheduler are
# project-level helpers assumed to be defined elsewhere.

def train():
    X_train, y_train = get_train_data()
    X_test, y_test = get_valid_data()

    # data generators: shift/flip augmentation for training, rescale only for test
    train_gen = ImageDataGenerator(rescale=1.0 / 255,
                                   horizontal_flip=True,
                                   width_shift_range=4.0 / 32.0,
                                   height_shift_range=4.0 / 32.0)
    test_gen = ImageDataGenerator(rescale=1.0 / 255)
    # train_gen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True,
    #                                zca_whitening=True, horizontal_flip=True)
    # test_gen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True,
    #                               zca_whitening=True)

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # load network
    model = ShuffleNet_V2()
    opt = SGD(0.01, momentum=0.9)
    # model.compile(Adam(0.001), "categorical_crossentropy", ["accuracy"])
    model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=['accuracy'])
    # model.compile(SGD(0.01, momentum=0.9), "categorical_crossentropy",
    #               ["acc", "top_k_categorical_accuracy"])
    model.summary()

    # set up the GPU session with memory growth enabled
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    session.run(tf.global_variables_initializer())
    KTF.set_session(session)

    # training settings
    batch_size = 128
    scheduler = LearningRateScheduler(lr_scheduler)
    hist = History()

    start_time = time.time()
    # model.fit_generator(train_gen.flow(X_train, y_train, batch_size, shuffle=True),
    #                     steps_per_epoch=X_train.shape[0] // batch_size,
    #                     validation_data=test_gen.flow(X_test, y_test, batch_size, shuffle=False),
    #                     validation_steps=X_test.shape[0] // batch_size,
    #                     callbacks=[scheduler, hist], max_queue_size=5, epochs=100)
    model.fit_generator(train_gen.flow(X_train, y_train, batch_size, shuffle=True),
                        steps_per_epoch=X_train.shape[0] // batch_size,
                        validation_data=test_gen.flow(X_test, y_test, batch_size, shuffle=False),
                        validation_steps=X_test.shape[0] // batch_size,
                        callbacks=[scheduler, hist], max_queue_size=5, epochs=50)
    elapsed = time.time() - start_time
    print('training time', elapsed)

    history = hist.history
    history["elapsed"] = elapsed
    with open("shuffle_v2_002_glp.pkl", "wb") as fp:
        pickle.dump(history, fp)
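# `LearningRateScheduler(lr_scheduler)` above expects a callback
# `lr_scheduler(epoch)` that is not shown in this file. A minimal sketch,
# assuming a conventional step-decay schedule; the decay epochs and factors
# here are assumptions, not taken from the original code:
def lr_scheduler(epoch):
    lr = 0.01      # matches the SGD initial rate above
    if epoch >= 25:
        lr *= 0.1  # hypothetical first decay point
    if epoch >= 40:
        lr *= 0.1  # hypothetical second decay point
    return lr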
import hyppar     # project module with hyperparameters and data paths
import load_data  # project module with CSV and geometry readers

def loadRawData():
    '''
    Load the global variables describing the systems. Each variable is a
    list with one value per system; the Coulomb matrices follow the same
    indexing order.
    '''
    # Spacegroup of the systems
    global spacegrp
    # Number of atoms
    global Natoms
    # Percentage of Al / Ga / In
    global pc_al
    global pc_ga
    global pc_in
    # Lattice vector parameters and angles (degrees)
    global lv_alpha
    global lv_beta
    global lv_gamma
    global lvadeg
    global lvbdeg
    global lvgdeg
    # Formation energies (labels)
    global Ef
    # Band gap energies (labels)
    global Eg
    # Training set atom coordinates
    global xyz_Train
    # Training set atom elements (str)
    global elements_Train
    # Training set lattice vectors
    global lattices_Train

    filename = hyppar.datapath + 'data/train.csv'
    spacegrp, Natoms, pc_al, pc_ga, pc_in, lv_alpha, lv_beta, lv_gamma, \
        lvadeg, lvbdeg, lvgdeg, Ef, Eg = load_data.get_train_data(filename)
    xyz_Train, elements_Train, lattices_Train = load_data.get_geometry(
        hyppar.Ndata, hyppar.datapath + 'data')
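# A minimal usage sketch (example_usage is a hypothetical helper, illustrative
# only): hyppar.datapath and hyppar.Ndata are assumed to have been configured
# before the call, after which the module-level globals can be read directly.
def example_usage():
    loadRawData()
    print(len(Ef), "formation energy labels,", len(xyz_Train), "training geometries")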