def preprocess(self, query):
    """Coerce a prediction query into model-ready features.

    Accepts a dict or DataFrame containing a 'y' column, fits the feature
    pipeline on the full X, then searches for a bootstrap subset whose
    transformed width matches the full transform (one-hot columns can be
    missing in small samples).

    Returns:
        (subset_query, subset_indices, y): transformed subset features,
        the raw sampled rows, and the full target column.

    Raises:
        Exception: if `query` is neither a dict nor a DataFrame.
    """
    ## input checks
    if isinstance(query, dict):
        query = pd.DataFrame(query)
    elif isinstance(query, pd.DataFrame):
        pass
    else:
        raise Exception(f"ERROR - FAIL:(model_evaluation) - invalid input. {type(query)} was given")
    X = query.drop(['y'], axis=1)
    y = query[['y']]
    # Reference transform over the full data; its column count is the target
    # width for the subset search below.  # get from model trainer log
    query_train = DataLoader().feature_pipeline(self.numerical, self.categorical).fit(X).transform(X)
    # Grow the bootstrap sample (10, 30, 50, ...) until every categorical
    # level is represented, i.e. transformed widths match.
    # NOTE(review): if no size in the range ever matches, the LAST subset is
    # returned anyway — confirm this fallback is intended.
    for n_samples in range(10, len(X), 20):
        # `subset_indices` is actually a sampled DataFrame, not an index array.
        subset_indices = X.sample(n=n_samples, replace=True)
        subset_query = DataLoader().feature_pipeline(self.numerical, self.categorical).fit(subset_indices).transform(subset_indices)
        if subset_query.shape[1] != query_train.shape[1]:
            continue
        else:
            break
    print(f'n_ : {n_samples}')
    print(f'y_ : {subset_query.shape[1]}')
    return subset_query, subset_indices, y
def train(epochs=10, batch_size=50, lr=1e-3):
    """Train the unconditional handwriting-stroke model with RMSprop.

    Args:
        epochs: number of training iterations (one batch per epoch here).
        batch_size: kept for interface compatibility; batch size is
            currently determined by ``dataloader.generate_batch()``.
        lr: learning rate for RMSprop.

    Side effects:
        Saves the trained weights to 'unconditional.pt'.
    """
    dataloader = DataLoader('/data/strokes.npy')
    net = model.Model()
    # BUG FIX: nn.Module exposes .parameters(), not .params()
    optimizer = torch.optim.RMSprop(net.parameters(), lr=lr)
    for epoch in range(epochs):
        x, y = dataloader.generate_batch()
        x = torch.from_numpy(np.array(x))
        y = torch.from_numpy(np.array(y))
        # Split targets: end-of-stroke flag and the two pen offsets.
        y1 = y[:, :, 0]
        y2 = y[:, :, 1]
        # BUG FIX: this previously re-assigned y2, leaving y3 undefined and
        # losing the second coordinate target.
        y3 = y[:, :, 2]
        hidden = autograd.Variable(torch.randn(1, x.size(0), 121))
        e, pi, mu1, mu2, sigma1, sigma2, corr, hidden = net(x, hidden)
        loss_val = loss(e, pi, mu1, mu2, sigma1, sigma2, corr, y1, y2, y3)
        optimizer.zero_grad()
        # BUG FIX: backward() must be called on the computed loss tensor,
        # not on the `loss` function object.
        loss_val.backward()
        optimizer.step()
    # BUG FIX: Module serialization uses state_dict(), not save_dict().
    # NOTE(review): saved once after training; confirm per-epoch checkpoints
    # were not intended.
    torch.save(net.state_dict(), 'unconditional.pt')
def test_dataset():
    """Check QM9Dataset samples and batching, with and without bond orders.

    Both configurations must yield the same first sample (length, target,
    mask) under a fixed seed; only the adjacency maximum differs (1 for
    plain connectivity, 2 when bond orders are encoded).
    """
    def _check_first_sample(qm9, expected_adj_max):
        # One deterministic sample plus a batching smoke test.
        np.random.seed(0)
        sample = qm9[0]
        assert sample.length == 19
        assert sample.targets == np.array(['N'])
        assert np.array_equal(sample.target_mask,
                              np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
        assert sample.adj.max() == expected_adj_max
        dl = DataLoader(qm9, batch_size=10)
        next(iter(dl))

    _check_first_sample(QM9Dataset('data/adjacency_matrix_train.pkl'), 1)
    _check_first_sample(QM9Dataset('data/adjacency_matrix_train.pkl', bond_order=True), 2)
def run_nn_dmi(args):
    """Train a DMI classifier (MLP backbone) and return its fit results."""
    set_global_seeds(args['seed'])
    loader = DataLoader(args['dataset'], args)
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)
    network = MLP(
        feature_dim=X_train.shape[-1],
        hidsizes=args['hidsize'],
        dropout=args['dropout'],
        outputs=2,
    )
    classifier = DMIClassifier(model=network, learning_rate=args['lr'])
    # Only log when running a single seed; multi-seed sweeps stay quiet.
    run_logger = logger if args['seeds'] == 1 else None
    return classifier.fit(
        X_train, y_train, X_test, y_test,
        batchsize=args['batchsize'],
        episodes=args['episodes'],
        logger=run_logger,
    )
def run_pam(args):
    """Train a margin perceptron and return its test-set score."""
    set_global_seeds(args['seed'])
    splits = DataLoader(args['dataset']).prepare_train_test_val(args)
    X_train, X_test, X_val, y_train, y_test, y_val = splits
    clf = Perceptron(feature_dim=X_train.shape[-1], margin=args['margin'])
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
def run_c_svm(args):
    """Train a class-weighted SVM and return its test-set score."""
    set_global_seeds(args['seed'])
    splits = DataLoader(args['dataset'], args).prepare_train_test_val(args)
    X_train, X_test, X_val, y_train, y_test, y_val = splits
    # Weight the positive class by C1; negatives keep unit weight.
    clf = SVC(gamma='auto', class_weight={0: 1., 1: args['C1']})
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
def main_experiment(train_data, valid_data):
    """ Question 8: """
    train_loader = DataLoader(train_data, batch_size=32)
    # Validate on the whole set in a single batch.
    dev_loader = DataLoader(valid_data, batch_size=len(valid_data))
    log_path = os.path.join(constants.Circles.RESULTS_DIR, 'circles_log.txt')
    classifier = MLPClassifier(constants.Circles.INPUT_DIM,
                               constants.Circles.N_CLASSES,
                               10, 0.05, 50)
    classifier.train(train_loader, dev_loader, log=log_path)
def find_best_c1(args):
    """Sweep positive-class weights and return validation scores per weight."""
    set_global_seeds(args['seed'])
    splits = DataLoader(args['dataset'], args).prepare_train_test_val(args)
    X_train, X_test, X_val, y_train, y_test, y_val = splits
    scores = []
    for weight in CLASS_WEIGHTS:
        clf = SVC(gamma='auto', class_weight={0: 1., 1: weight})
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_val, y_val))
    return scores
def find_best_margin(args):
    """ return `best_margin / 0.1` """
    set_global_seeds(args['seed'])
    splits = DataLoader(args['dataset']).prepare_train_test_val(args)
    X_train, X_test, X_val, y_train, y_test, y_val = splits
    # One validation score per candidate margin, in MARGINS order.
    # NOTE(review): despite the docstring, this returns the full score list,
    # not best_margin / 0.1 — callers appear to pick the argmax themselves.
    scores = []
    for candidate in MARGINS:
        clf = Perceptron(feature_dim=X_train.shape[-1], margin=candidate)
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_val, y_val))
    return scores
def load_data(self, subset=False):
    """Load, validate, split and transform the configured dataset.

    Populates self.train_dataset/test_dataset and the transformed
    self.X_train/y_train/X_test/y_test.

    Args:
        subset: if True, bootstrap-sample a fraction of the training set
            (for faster unit tests), resampling until the transformed
            feature width matches the full training transform.

    Raises:
        Exception: if schema validation fails.
    """
    LOG.info(f'loading {self.config.data.path} dataset .....')
    self.dataset = DataLoader().load_data(self.config.data)
    LOG.info("..... validating all data")
    try:
        validate = DataLoader().validate_schema(self.dataset)
        # NOTE(review): validate_schema signals success by returning None.
        if validate is None:
            LOG.info("PASS: data validation passed.")
    except:
        LOG.critical("FAIL: data validation failed.")
        raise Exception(
            "CRITICAL - FAIL:(dataloader) - invalid data schema")
        # sys.exit(100)  # exit if using log and no raise exception
    # self.X, self.y = DataLoader().split_feature_target(self.dataset, self.target)
    # self.X_train, self.X_test, self.y_train ,self.y_test = DataLoader().preprocess_data(self.X, self.y, self.test_size, self.random_state)
    self.train_dataset, self.test_dataset = DataLoader().preprocess_data(
        self.dataset, self.test_size, self.random_state)
    # Reference transform: its column count defines the expected width.
    train_shape = DataLoader().feature_pipeline(self.numerical, self.categorical) \
        .fit(self.train_dataset).transform(self.train_dataset)
    # subset the data to enable faster unittests
    if subset:
        # Seed with an impossible shape so the loop runs at least once;
        # resample until all categorical levels appear in the subset.
        subset_query = np.empty(shape=(1, 1), dtype=object)
        while subset_query.shape[1] != train_shape.shape[1]:
            subset_indices = self.train_dataset.sample(
                frac=self.subset_n_frac, replace=True)
            subset_query = DataLoader().feature_pipeline(
                self.numerical, self.categorical).fit(
                subset_indices).transform(subset_indices)
        self.train_dataset = subset_indices
    # Fit feature/target pipelines separately on train and test splits.
    # NOTE(review): fitting the pipeline on the TEST split (rather than
    # reusing the train-fitted pipeline) risks train/test column mismatch —
    # confirm this is intended.
    self.X_train= DataLoader().feature_pipeline(self.numerical, self.categorical) \
        .fit(self.train_dataset).transform(self.train_dataset)
    self.y_train = DataLoader().target_pipeline(self.target).fit(self.train_dataset[self.target]) \
        .transform(self.train_dataset[self.target])
    self.X_test= DataLoader().feature_pipeline(self.numerical, self.categorical).fit(self.test_dataset) \
        .transform(self.test_dataset)
    self.y_test = DataLoader().target_pipeline(self.target).fit(self.test_dataset[self.target]) \
        .transform(self.test_dataset[self.target])
def main(argv):
    """Run prediction (or evaluation) with the ngram-feature model."""
    log.info('Beginning prediction')
    pkl_path = os.path.join(FLAGS.resources, '{}.pkl'.format(FLAGS.function))
    funcs = pd.read_pickle(pkl_path)['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)
    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')
    data = DataLoader(filename=FLAGS.inputfile)
    # Both branches share the same iterator configuration.
    iter_kwargs = dict(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                       dataloader=data, functype=FLAGS.function,
                       featuretype='ngrams')
    if FLAGS.evaluate:
        test_dataiter = DataIterator(**iter_kwargs)
        predict_evaluate(test_dataiter, 0.2, FLAGS.modelsdir)
    else:
        test_dataiter = DataIterator(test=True, **iter_kwargs)
        predict(test_dataiter, 0.2, FLAGS.modelsdir, funcs)
def test_data_validation(self):
    """Validate the evaluation dataset: schema, container type, features.

    Raises:
        Exception: on schema failure, unsupported input type, or a feature
            set that differs from the training dataset.
    """
    ## schema checks
    try:
        validate = DataLoader().validate_schema(self.test_dataset)
        # validate_schema signals success by returning None.
        if validate is None:
            LOG.info("PASS: Test data validation passed.")
    # BUG FIX: was a bare `except:` that swallowed the original error
    # (including KeyboardInterrupt); narrow it and chain the cause.
    except Exception as err:
        raise Exception(
            "ERROR - FAIL:(model_evaluation) - invalid input schema.") from err
    ## input checks
    if isinstance(self.test_dataset, dict):
        self.test_dataset = pd.DataFrame(self.test_dataset)
    elif isinstance(self.test_dataset, pd.DataFrame):
        pass
    else:
        # BUG FIX: report the offending TYPE, not the whole object, matching
        # the message style used by preprocess().
        raise Exception(
            f"ERROR - FAIL:(model_evaluation) - invalid input. {type(self.test_dataset)} was given"
        )
    ## features check
    test_features = sorted(self.test_dataset.columns.drop(['y']).tolist())
    data_features = sorted(self.dataset.columns.drop(['y']).tolist())
    if test_features != data_features:
        print(f"test features: {','.join(test_features)}")
        raise Exception(
            "ERROR - FAIL:(model_evaluation) - invalid features present")
def data_loader(self, batch_size=10, num_workers=4, shuffle=False, pin_memory=False):
    """Wrap this dataset in a DataLoader with the given loading options."""
    return DataLoader(
        self,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
def decision_boundaries(train_data, valid_data):
    """
    Question 5:
    Train the neural network using gradient descent on the two circles
    dataset. Plot the decision regions for several different values of the
    hyperparameters (weight decay, number of hidden units, early stopping)
    so as to illustrate their effect on the capacity of the model.
    """
    from itertools import product  # flattens the hyperparameter grid

    # raw data is only used to plot the decision boundary
    # BUG FIX: pass the path directly instead of an open() handle that was
    # never closed (np.loadtxt accepts a filename).
    raw_data = np.loadtxt(constants.Circles.DATA_PATH)
    X = raw_data[:, :2]
    y = raw_data[:, -1]

    # hyperparameter grid
    HIDDEN_DIM_SET = [8, 14]
    NUM_EPOCH_SET = [30]
    LEARNING_RATE_SET = [0.05]
    L1_WEIGH_DECAY = [0, 0.005]
    L2_WEIGH_DECAY = [0, 0.005]

    trainloader = DataLoader(train_data, batch_size=32)
    devloader = DataLoader(valid_data, batch_size=len(valid_data))

    # product() preserves the original nested-loop ordering:
    # h -> lr -> l1 -> l2 -> n_epoch.
    grid = product(HIDDEN_DIM_SET, LEARNING_RATE_SET,
                   L1_WEIGH_DECAY, L2_WEIGH_DECAY, NUM_EPOCH_SET)
    for i, (h, lr, l1, l2, n_epoch) in enumerate(grid):
        print('\nhidden_dim: {}, lr: {}, l1: {}, l2: {}'.format(h, lr, l1, l2))
        mlp = MLPClassifier(constants.Circles.INPUT_DIM,
                            constants.Circles.N_CLASSES,
                            h, lr, n_epoch, l1, l2, l1, l2)
        mlp.train(trainloader, devloader)
        figure_name = 'decision_boundaries_{}.png'.format(i)
        visualize.plot_decision(
            X, y,
            path=os.path.join(constants.Circles.FIGURES_DIR, figure_name),
            model=mlp,
            param=[h, lr, n_epoch, l1, l2, l1, l2]
        )
def __init__(self, args):
    """Build the sequence-tagging training pipeline from CLI args.

    Loads train/dev token streams, builds word/char vocabularies and
    vectorizers, label encoders for POS/chunk/NER tags, pretrained word
    embeddings, and finally the compiled model.
    """
    self.args = args
    train = DataLoader(self.args.trainpath)
    dev = DataLoader(self.args.devpath)
    self.train_words, self.train_poss, self.train_chunks, self.train_labels = train.get_all_train_tokens(
    )
    self.train_max_sentence_len, self.train_max_word_len = train.get_required_max_len(
    )
    self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = dev.get_all_train_tokens(
    )
    self.dev_max_sentence_len, self.dev_max_word_len = dev.get_required_max_len(
    )
    # Vocabulary is built from TRAIN words only; dev OOV handling is up to
    # the Vectorizer.
    vocabulary = Vocabulary(self.train_words)
    self.vocab = vocabulary.get_word_vocab()
    self.char_vocab = vocabulary.get_char_vocab()
    # Both vectorizers use the TRAIN max lengths so shapes match the model.
    self.train_vect = Vectorizer(self.train_max_sentence_len,
                                 self.train_max_word_len, self.vocab,
                                 self.char_vocab, self.train_words)
    self.dev_vect = Vectorizer(self.train_max_sentence_len,
                               self.train_max_word_len, self.vocab,
                               self.char_vocab, self.dev_words)
    self.poss_vect = LabelEncoderModel(self.train_poss,
                                       self.train_max_sentence_len)
    self.chunks_vect = LabelEncoderModel(self.train_chunks,
                                         self.train_max_sentence_len)
    # NOTE(review): original comment here said "st wrong here" — the label
    # encoding for NER tags may be suspect; verify before relying on it.
    self.labels_vect = LabelEncoderModel(self.train_labels,
                                         self.train_max_sentence_len)
    self.pos_emb_weights = self.poss_vect.get_emb_weights()
    self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
    self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
        self.vocab, self.args.pretrained_path).pretrained_embedder()
    # sum(list_of_lists, []) flattens to count distinct tags per task.
    self.model = ModelTraining(
        self.args.dropout, self.args.lr,
        len(set(sum(self.train_labels, []))), len(self.vocab),
        len(self.char_vocab), self.train_max_word_len,
        len(set(sum(self.train_poss, []))),
        len(set(sum(self.train_chunks, []))),
        word_emb_dimensions=self.word_emb_dimensions,
        word_emb_weights=self.word_emb_weights,
        pos_emb_weights=self.pos_emb_weights,
        chunk_emb_weights=self.chunk_emb_weights).model_build()
def test_transformer_forward_cuda():
    """Smoke-test a single-sample TransformerModel forward pass on GPU."""
    qm9 = QM9Dataset('data/adjacency_matrix_train.pkl')
    # Fix both RNGs so the masked sample (and thus the prediction) is
    # deterministic.
    np.random.seed(0)
    torch.manual_seed(0)
    dl = DataLoader(qm9, batch_size=1)
    sample = next(iter(dl))
    transformer = TransformerModel().cuda()
    # NOTE(review): presumably moves the batch's tensors to GPU in place
    # (return value is discarded) — confirm against the batch class.
    sample.cuda()
    out = transformer(sample)
    assert torch.equal(out['prediction'][sample.target_mask], torch.tensor([0]).cuda())
def deploy(path):
    """Run a saved PaddlePaddle inference model on one test image and
    display the input next to the translated output.

    Args:
        path: directory containing the exported inference model.
    """
    assert os.path.exists(path), f'{path} not found : ('
    dataset = 'YOUR_DATASET_NAME'
    img_size = 256
    test_transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        # Normalize to [-1, 1]; undone by img_postprocess below.
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    testA = ImageFolder(os.path.join('dataset', dataset, 'testA'), test_transform)
    # Dygraph guard only for loading one real sample as a numpy array.
    with fluid.dygraph.guard():
        testA_loader = DataLoader(testA, batch_size=1, shuffle=False)
        real_A, _ = next(iter(testA_loader))
        in_np = real_A.numpy()
    # load model
    # NOTE(review): static-graph inference is assumed to run OUTSIDE the
    # dygraph guard — confirm indentation against the original script.
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    program, feed_vars, fetch_vars = fluid.io.load_inference_model(path, exe)
    # inference
    fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars)

    def img_postprocess(img):
        """Map a normalized CHW BGR array back to a displayable HWC RGB image."""
        assert isinstance(img, np.ndarray), type(img)
        img = img * 0.5 + 0.5
        img = img.squeeze(0).transpose((1, 2, 0))
        # BGR to RGB
        img = img[:, :, ::-1]
        return img

    in_img = img_postprocess(in_np)
    out_img = img_postprocess(fetch)
    plt.subplot(121)
    plt.title('real A')
    plt.imshow(in_img)
    plt.subplot(122)
    plt.title('A to B')
    plt.imshow(out_img)
    plt.show()
def finite_difference_check(dataset, batch_size):
    """
    Computes the gradients for a single example, and checks that the
    gradient is correct using the finite difference method.
    Answers to questions 1, 2, and 4.
    """
    dataloader = DataLoader(dataset, batch_size)
    # NOTE(review): next() is called on the DataLoader directly — assumes
    # this project's DataLoader implements __next__ itself; confirm it is
    # not a torch-style loader (which would need iter() first).
    inputs, targets = next(dataloader)
    mlp = MLPClassifier(constants.Circles.INPUT_DIM, constants.Circles.N_CLASSES)
    # gradHats: finite-difference estimates; grads: backprop gradients.
    gradHats, grads, param_names = mlp.finite_difference_check(inputs, targets)
    figure_name = 'finite_difference_check_batch_size_{}.png'.format(batch_size)
    visualize.plot_gradient(
        gradHats, grads, param_names,
        legend=['finite differences approx.', 'backpropagation'],
        path=os.path.join(constants.Circles.FIGURES_DIR, figure_name)
    )
def test_transformer_forward_cpu():
    """CPU forward-pass test: predictions, target packing, and loss parity."""
    qm9 = QM9Dataset('data/adjacency_matrix_train.pkl', epsilon_greedy=0.5)
    # Fixed seeds make the epsilon-greedy masking and model init deterministic.
    np.random.seed(0)
    torch.manual_seed(0)
    dl = DataLoader(qm9, batch_size=2)
    sample = next(iter(dl))
    transformer = TransformerModel()
    out = transformer(sample)
    assert torch.equal(out['prediction'][sample.target_mask], torch.tensor([0, 4, 0, 4, 0, 0, 0]))
    criterion = CrossEntropyLoss()
    targets = sample.targets_num
    assert torch.equal(targets, torch.tensor([[2, 1, 3, 2, 1, 1], [1, 0, 0, 0, 0, 0]]))
    # Drop padding (0) and shift class ids to 0-based for CrossEntropyLoss.
    targets = targets[targets != 0]
    targets -= 1
    assert torch.equal(targets, torch.tensor([1, 0, 2, 1, 0, 0, 0]))
    loss = criterion(out['out'][sample.target_mask], targets)
    # Mean-reduced loss must equal the mean of the per-element losses.
    assert torch.equal(loss, cross_entropy(out['out'][sample.target_mask], targets, reduction='none').mean())
default='Transformer') parser.add_argument('--gamma', default=1, type=float) parser.add_argument('--bond_order', default=False, type=bool) parser.add_argument('--dataset', default='zinc', choices=['qm9', 'zinc']) args = parser.parse_args() train_file = f'data/{args.dataset}/adjacency_matrix_train_scaffold.pkl' if args.scaffold else f'data/{args.dataset}/adjacency_matrix_train.pkl' validation_file = f'data/{args.dataset}/adjacency_matrix_validation_scaffold.pkl' if args.scaffold else f'data/{args.dataset}/adjacency_matrix_validation.pkl' training = QM9Dataset(data=train_file, num_masks=args.num_masks, epsilon_greedy=args.epsilon_greedy, num_fake=args.num_fake, bond_order=args.bond_order) train_dl = DataLoader(training, batch_size=args.batch_size) # Create multiple validation dlators, one for 25, 50 and 75% masked atoms val_dls = [] if args.num_fake == 0: for masks in range(1, 6): val_set = QM9Dataset(data=validation_file, num_masks=masks, bond_order=args.bond_order) val_dl = DataLoader(val_set, batch_size=args.batch_size) val_dls.append(val_dl) if args.num_masks == 0: for fakes in range(1, 6):
def __init__(self, isInjector=True): self.isInjector = isInjector # Input shape cube_shape = config['cube_shape'] self.img_rows = config['cube_shape'][1] self.img_cols = config['cube_shape'][2] self.img_depth = config['cube_shape'][0] self.channels = 1 self.num_classes = 5 self.img_shape = (self.img_rows, self.img_cols, self.img_depth, self.channels) # Configure data loader if self.isInjector: self.dataset_path = config['unhealthy_samples'] self.modelpath = config['modelpath_inject'] else: self.dataset_path = config['healthy_samples'] self.modelpath = config['modelpath_remove'] self.dataloader = DataLoader(dataset_path=self.dataset_path, normdata_path=self.modelpath, img_res=(self.img_rows, self.img_cols, self.img_depth)) # Calculate output shape of D (PatchGAN) patch = int(self.img_rows / 2**4) self.disc_patch = (patch, patch, patch, 1) # Number of filters in the first layer of G and D self.gf = 100 self.df = 100 optimizer = Adam(0.0002, 0.5) optimizer_G = Adam(0.000001, 0.5) # Build and compile the discriminator self.discriminator = self.build_discriminator() self.discriminator.summary() self.discriminator.compile(loss='mse', optimizer=optimizer_G, metrics=['accuracy']) # ------------------------- # Construct Computational # Graph of Generator # ------------------------- # Build the generator self.generator = self.build_generator() self.generator.summary() # Input images and their conditioning images img_A = Input(shape=self.img_shape) img_B = Input(shape=self.img_shape) # By conditioning on B generate a fake version of A fake_A = self.generator([img_B]) # For the combined model we will only train the generator self.discriminator.trainable = False # Discriminators determines validity of translated images / condition pairs valid = self.discriminator([fake_A, img_B]) self.combined = Model(inputs=[img_A, img_B], outputs=[valid, fake_A]) self.combined.compile(loss=['mse', 'mae'], loss_weights=[1, 100], optimizer=optimizer)
class Trainer:
    """3D pix2pix-style GAN trainer for CT-cube injection/removal.

    Builds a U-Net generator and a PatchGAN discriminator, trains them
    adversarially, and periodically saves models and progress images.
    """

    def __init__(self, isInjector=True):
        """Configure paths, data loader, and build/compile both networks."""
        self.isInjector = isInjector
        # Input shape
        cube_shape = config['cube_shape']
        # NOTE: cube_shape is (depth, rows, cols) — indices below reflect that.
        self.img_rows = config['cube_shape'][1]
        self.img_cols = config['cube_shape'][2]
        self.img_depth = config['cube_shape'][0]
        self.channels = 1
        self.num_classes = 5
        self.img_shape = (self.img_rows, self.img_cols, self.img_depth, self.channels)
        # Configure data loader
        if self.isInjector:
            self.dataset_path = config['unhealthy_samples']
            self.modelpath = config['modelpath_inject']
        else:
            self.dataset_path = config['healthy_samples']
            self.modelpath = config['modelpath_remove']
        self.dataloader = DataLoader(dataset_path=self.dataset_path,
                                     normdata_path=self.modelpath,
                                     img_res=(self.img_rows, self.img_cols, self.img_depth))
        # Calculate output shape of D (PatchGAN): 4 stride-2 convs halve each dim.
        patch = int(self.img_rows / 2**4)
        self.disc_patch = (patch, patch, patch, 1)
        # Number of filters in the first layer of G and D
        self.gf = 100
        self.df = 100
        # Separate optimizers; the discriminator's learning rate is far smaller.
        optimizer = Adam(0.0002, 0.5)
        optimizer_G = Adam(0.000001, 0.5)
        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.summary()
        self.discriminator.compile(loss='mse',
                                   optimizer=optimizer_G,
                                   metrics=['accuracy'])
        # -------------------------
        #  Construct Computational
        #    Graph of Generator
        # -------------------------
        # Build the generator
        self.generator = self.build_generator()
        self.generator.summary()
        # Input images and their conditioning images
        img_A = Input(shape=self.img_shape)
        img_B = Input(shape=self.img_shape)
        # By conditioning on B generate a fake version of A
        fake_A = self.generator([img_B])
        # For the combined model we will only train the generator
        self.discriminator.trainable = False
        # Discriminators determines validity of translated images / condition pairs
        valid = self.discriminator([fake_A, img_B])
        # Adversarial (mse) + L1 reconstruction (mae), weighted 1:100.
        self.combined = Model(inputs=[img_A, img_B], outputs=[valid, fake_A])
        self.combined.compile(loss=['mse', 'mae'],
                              loss_weights=[1, 100],
                              optimizer=optimizer)

    def build_generator(self):
        """U-Net Generator"""

        def get_crop_shape(target, refer):
            # Per-axis crop amounts to align `target` with `refer` before
            # concatenation (handles odd differences by splitting unevenly).
            # depth, the 4rth dimension
            cd = (target.get_shape()[3] - refer.get_shape()[3]).value
            assert (cd >= 0)
            if cd % 2 != 0:
                cd1, cd2 = int(cd / 2), int(cd / 2) + 1
            else:
                cd1, cd2 = int(cd / 2), int(cd / 2)
            # width, the 3rd dimension
            cw = (target.get_shape()[2] - refer.get_shape()[2]).value
            assert (cw >= 0)
            if cw % 2 != 0:
                cw1, cw2 = int(cw / 2), int(cw / 2) + 1
            else:
                cw1, cw2 = int(cw / 2), int(cw / 2)
            # height, the 2nd dimension
            ch = (target.get_shape()[1] - refer.get_shape()[1]).value
            assert (ch >= 0)
            if ch % 2 != 0:
                ch1, ch2 = int(ch / 2), int(ch / 2) + 1
            else:
                ch1, ch2 = int(ch / 2), int(ch / 2)
            return (ch1, ch2), (cw1, cw2), (cd1, cd2)

        def conv3d(layer_input, filters, f_size=4, bn=True):
            """Layers used during downsampling"""
            d = Conv3D(filters, kernel_size=f_size, strides=2, padding='same')(layer_input)
            d = LeakyReLU(alpha=0.2)(d)
            if bn:
                d = BatchNormalization(momentum=0.8)(d)
            return d

        def deconv3d(layer_input, skip_input, filters, f_size=4, dropout_rate=0.5):
            """Layers used during upsampling"""
            u = UpSampling3D(size=2)(layer_input)
            u = Conv3D(filters, kernel_size=f_size, strides=1, padding='same', activation='relu')(u)
            if dropout_rate:
                u = Dropout(dropout_rate)(u)
            u = BatchNormalization(momentum=0.8)(u)
            # u = Concatenate()([u, skip_input])
            # Crop to the skip connection's spatial size before concatenating.
            ch, cw, cd = get_crop_shape(u, skip_input)
            crop_conv4 = Cropping3D(cropping=(ch, cw, cd), data_format="channels_last")(u)
            u = Concatenate()([crop_conv4, skip_input])
            return u

        # Image input
        d0 = Input(shape=self.img_shape, name="input_image")
        # Downsampling
        d1 = conv3d(d0, self.gf, bn=False)
        d2 = conv3d(d1, self.gf * 2)
        d3 = conv3d(d2, self.gf * 4)
        d4 = conv3d(d3, self.gf * 8)
        d5 = conv3d(d4, self.gf * 8)
        # Upsampling with skip connections
        u3 = deconv3d(d5, d4, self.gf * 8)
        u4 = deconv3d(u3, d3, self.gf * 4)
        u5 = deconv3d(u4, d2, self.gf * 2)
        u6 = deconv3d(u5, d1, self.gf)
        u7 = UpSampling3D(size=2)(u6)
        # tanh output matches inputs normalized to [-1, 1]
        output_img = Conv3D(self.channels, kernel_size=4, strides=1, padding='same', activation='tanh')(u7)
        return Model(inputs=[d0], outputs=[output_img])

    def build_discriminator(self):
        """PatchGAN discriminator over (image, condition) pairs."""

        def d_layer(layer_input, filters, f_size=4, bn=True):
            """Discriminator layer"""
            d = Conv3D(filters, kernel_size=f_size, strides=2, padding='same')(layer_input)
            d = LeakyReLU(alpha=0.2)(d)
            if bn:
                d = BatchNormalization(momentum=0.8)(d)
            return d

        img_A = Input(shape=self.img_shape)
        img_B = Input(shape=self.img_shape)
        # Concatenate image and conditioning image by channels to produce input
        model_input = Concatenate(axis=-1)([img_A, img_B])
        d1 = d_layer(model_input, self.df, bn=False)
        d2 = d_layer(d1, self.df * 2)
        d3 = d_layer(d2, self.df * 4)
        d4 = d_layer(d3, self.df * 8)
        # Per-patch validity map (no sigmoid: trained against mse targets).
        validity = Conv3D(1, kernel_size=4, strides=1, padding='same')(d4)
        return Model([img_A, img_B], validity)

    def train(self, epochs, batch_size=1, sample_interval=50):
        """Alternate discriminator/generator updates over batches.

        Saves both models at the start of every epoch after the first, and
        progress images every `sample_interval` batches.
        """
        start_time = datetime.datetime.now()
        # Adversarial loss ground truths
        # NOTE(review): 'valid' is zeros and 'fake' is ones — inverted from
        # the usual convention; downstream losses assume this labeling.
        valid = np.zeros((batch_size, ) + self.disc_patch)
        fake = np.ones((batch_size, ) + self.disc_patch)
        for epoch in range(epochs):
            # save model
            if epoch > 0:
                print("Saving Models...")
                self.generator.save(os.path.join(
                    self.modelpath, "G_model.h5"))  # creates a HDF5 file
                self.discriminator.save(
                    os.path.join(
                        self.modelpath,
                        "D_model.h5"))  # creates a HDF5 file 'my_model.h5'
            for batch_i, (imgs_A, imgs_B) in enumerate(
                    self.dataloader.load_batch(batch_size)):
                # ---------------------
                #  Train Discriminator
                # ---------------------
                # Condition on B and generate a translated version
                fake_A = self.generator.predict([imgs_B])
                # Train the discriminators (original images = real / generated = Fake)
                d_loss_real = self.discriminator.train_on_batch(
                    [imgs_A, imgs_B], valid)
                d_loss_fake = self.discriminator.train_on_batch(
                    [fake_A, imgs_B], fake)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                # -----------------
                #  Train Generator
                # -----------------
                # Train the generators
                g_loss = self.combined.train_on_batch([imgs_A, imgs_B],
                                                      [valid, imgs_A])
                elapsed_time = datetime.datetime.now() - start_time
                # Plot the progress
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [D loss: %f, acc: %3d%%] [G loss: %f] time: %s"
                    % (epoch, epochs, batch_i, self.dataloader.n_batches,
                       d_loss[0], 100 * d_loss[1], g_loss[0], elapsed_time))
                # If at save interval => save generated image samples
                if batch_i % sample_interval == 0:
                    self.show_progress(epoch, batch_i)

    def show_progress(self, epoch, batch_i):
        """Save a 3x3 grid (condition / generated / original) as a PNG."""
        filename = "%d_%d.png" % (epoch, batch_i)
        if self.isInjector:
            savepath = os.path.join(config['progress'], "injector")
        else:
            savepath = os.path.join(config['progress'], "remover")
        os.makedirs(savepath, exist_ok=True)
        r, c = 3, 3
        imgs_A, imgs_B = self.dataloader.load_data(batch_size=3, is_testing=True)
        fake_A = self.generator.predict([imgs_B])
        gen_imgs = np.concatenate([imgs_B, fake_A, imgs_A])
        # Rescale images 0 - 1
        gen_imgs = 0.5 * gen_imgs + 0.5
        titles = ['Condition', 'Generated', 'Original']
        fig, axs = plt.subplots(r, c)
        cnt = 0
        for i in range(r):
            for j in range(c):
                # Show the middle-ish axial slice (index 16) of each cube.
                axs[i, j].imshow(gen_imgs[cnt].reshape(
                    (self.img_rows, self.img_cols, self.img_depth))[16, :, :])
                axs[i, j].set_title(titles[i])
                axs[i, j].axis('off')
                cnt += 1
        fig.savefig(os.path.join(savepath, filename))
        plt.close()
path=constants.TRAIN_PATH, input_features=constants.INPUT_FEATURES, output_features=constants.OUTPUT_FEATURES, header=0, transform=lambda X: [x / 255 for x in X] ) valid_data = Dataset( path=constants.VALID_PATH, input_features=constants.INPUT_FEATURES, output_features=constants.OUTPUT_FEATURES, header=0, transform=lambda X: [x / 255 for x in X] ) trainloader = DataLoader(train_data, batch_size=constants.BATCH_SIZE) devloader = DataLoader(valid_data, batch_size=1000) mlp = MLPClassifier( input_size=constants.INPUT_DIM, hidden_size=constants.HIDDEN_DIM, output_size=constants.N_CLASSES, learning_rate=constants.LEARNING_RATE, num_epochs=constants.NUM_EPOCHS ) loss_storage, acc_storage = mlp.train( trainloader, devloader, log=os.path.join(constants.RESULTS_DIR, 'mnist_log.txt') )
def main(argv):
    """Train (or load) the CHARCNN + hierarchical GO decoder, then test.

    If FLAGS.predict names a saved model, training is skipped; otherwise a
    new model is trained with early stopping on validation F1.
    """
    funcs = pd.read_pickle(os.path.join(FLAGS.resources, '{}.pkl'.format(FLAGS.function)))['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)
    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')
    data = DataLoader(filename=FLAGS.inputfile)
    modelsavename = 'savedmodels_{}_{}'.format(__processor__, int(time.time()))
    if FLAGS.predict != '':
        # Prediction-only path: reuse the named saved model, skip training.
        modelsavename = FLAGS.predict
        bestthres = 0.1
        log.info('no training')
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize,
                                      size=FLAGS.validationsize,
                                      dataloader=data,
                                      functype=FLAGS.function,
                                      featuretype='onehot')
        train_iter = DataIterator(batchsize=FLAGS.batchsize,
                                  size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen,
                                  dataloader=data,
                                  numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                  functype=FLAGS.function,
                                  featuretype='onehot')
        # Prime the iterators (advances file handles/buffers).
        next(valid_dataiter)
        next(train_iter)
    else:
        with tf.Session() as sess:
            valid_dataiter = DataIterator(batchsize=FLAGS.batchsize,
                                          size=FLAGS.validationsize,
                                          dataloader=data,
                                          functype=FLAGS.function,
                                          featuretype='onehot')
            train_iter = DataIterator(batchsize=FLAGS.batchsize,
                                      size=FLAGS.trainsize,
                                      seqlen=FLAGS.maxseqlen,
                                      dataloader=data,
                                      numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                      functype=FLAGS.function,
                                      featuretype='onehot')
            # +1 reserves index 0 for padding in the amino-acid vocabulary.
            encoder = CHARCNNEncoder(vocab_size=len(FeatureExtractor.aminoacidmap) + 1,
                                     inputsize=train_iter.expectedshape).build()
            log.info('built encoder')
            decoder = HierarchicalGODecoder(funcs, encoder.outputs, FLAGS.function).build(GODAG)
            log.info('built decoder')
            init = tf.global_variables_initializer()
            init.run(session=sess)
            chkpt = tf.train.Saver(max_to_keep=4)
            train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train', sess.graph)
            test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
            step = 0
            maxwait = 1  # early-stopping patience (validation rounds)
            wait = 0
            bestf1 = -1
            metagraphFlag = True
            log.info('starting epochs')
            for epoch in range(FLAGS.num_epochs):
                for x, y in train_iter:
                    if x.shape[0] != y.shape[0]:
                        raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))
                    _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                                feed_dict={decoder.ys_: y,
                                                           encoder.xs_: x,
                                                           decoder.threshold: [0.2]})
                    train_writer.add_summary(summary, step)
                    log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                    step += 1
                # Validate after every epoch (guard kept from the original).
                if True:
                    log.info('beginning validation')
                    prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
                    # Pick the threshold index with the best (rounded) F1.
                    thres = np.argmax(np.round(f1, 2))
                    log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                                     np.round(prec, 2)[thres],
                                                                                     np.round(recall, 2)[thres],
                                                                                     np.round(f1, 2)[thres]))
                    log.info('precision mat {}'.format(str(np.round(prec, 2))))
                    log.info('recall mat {}'.format(str(np.round(recall, 2))))
                    log.info('f1 mat {}'.format(str(np.round(f1, 2))))
                    log.info('selected threshold is {}'.format(thres/10 + 0.1))
                    if f1[thres] > (bestf1 + 1e-3):
                        bestf1 = f1[thres]
                        # NOTE(review): if F1 never improves, `bestthres` is
                        # only defined in the FLAGS.predict branch — the test
                        # block below could raise NameError. Verify.
                        bestthres = THRESHOLD_RANGE[thres]
                        wait = 0
                        chkpt.save(sess, os.path.join(FLAGS.outputdir,
                                                      modelsavename,
                                                      'model_{}_{}'.format(FLAGS.function, step)),
                                   global_step=step, write_meta_graph=metagraphFlag)
                        metagraphFlag = False
                    else:
                        wait += 1
                        if wait > maxwait:
                            log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                            break
                    step += 1
                train_iter.reset()
    # Final held-out evaluation at the best threshold found.
    log.info('testing model')
    test_dataiter = DataIterator(batchsize=FLAGS.batchsize,
                                 size=FLAGS.testsize,
                                 dataloader=data,
                                 functype=FLAGS.function,
                                 featuretype='onehot')
    prec, recall, f1 = predict_evaluate(test_dataiter, [bestthres],
                                        os.path.join(FLAGS.outputdir, modelsavename))
    log.info('test results')
    log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
    data.close()
def main(argv):
    """Train the MultiCharCNN encoder + GO-embedding RNN decoder, then test.

    Prepends a STOPGO row to the label-embedding matrix (index 0), shifting
    every GO id up by one, trains with negative sampling and early stopping
    on validation F1, and finally evaluates on the test split.
    """
    goids = GODAG.initialize_idmap(None, None)
    labelembedding = load_labelembedding(os.path.join(FLAGS.resources, 'goEmbeddings.txt'), goids)
    assert(labelembedding.shape[0] == (len(goids))) , 'label embeddings and known go ids differ'
    ## Add a row of zeros to refer to NOGO or STOPGO
    labelembedding = np.vstack([np.zeros(labelembedding.shape[1]), labelembedding]).astype(np.float32)
    labelembeddingsize = labelembedding.shape[1]
    # shift all goids by 1, to allow STOPGO
    GODAG.idmap = {key: (val + 1) for key, val in GODAG.idmap.items()}
    log.info('min go index - {}'.format(min(list(GODAG.idmap.values()))))
    GODAG.idmap['STOPGO'] = 0
    GODAG.GOIDS.insert(0, 'STOPGO')
    # Sanity check: id 1 must map back to the first original GO id.
    log.info('first from main-{}, from goids-{}, from idmap-{}, by reversemap-{}'.format(goids[0],
                                                                                         GODAG.GOIDS[1],
                                                                                         GODAG.id2node(1),
                                                                                         GODAG.get_id(goids[0])))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')
    data = DataLoader(filename=FLAGS.inputfile)
    modelsavename = FLAGS.predict
    if FLAGS.predict == "":
        modelsavename = 'savedmodels_{}'.format(int(time.time()))
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                      dataloader=data, functype=FLAGS.function, featuretype='onehot',
                                      onlyLeafNodes=True, numfuncs=FLAGS.maxnumfuncs)
        train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen, dataloader=data,
                                  numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                  functype=FLAGS.function, featuretype='onehot',
                                  onlyLeafNodes=True, numfuncs=FLAGS.maxnumfuncs)
        #encoder = CNNEncoder(vocab_size=len(FeatureExtractor.ngrammap) + 1, inputsize=train_iter.expectedshape).build()
        # +1 reserves index 0 for padding in the amino-acid vocabulary.
        encoder = MultiCharCNN(vocab_size=len(FeatureExtractor.aminoacidmap) + 1,
                               inputsize=train_iter.expectedshape,
                               with_dilation=False, charfilter=32,
                               poolsize=80, poolstride=48).build()
        log.info('built encoder')
        decoder = GORNNDecoder(encoder.outputs, labelembedding, numfuncs=FLAGS.maxnumfuncs,
                               trainlabelEmbedding=FLAGS.trainlabel,
                               distancefunc=FLAGS.distancefunc, godag=GODAG).build()
        log.info('built decoder')
        init = tf.global_variables_initializer()
        init.run(session=sess)
        chkpt = tf.train.Saver(max_to_keep=4)
        train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
        step = 0
        maxwait = 2  # early-stopping patience (validation rounds)
        wait = 0
        bestf1 = 0
        bestthres = 0
        metagraphFlag = True
        log.info('starting epochs')
        log.info('params - trainsize-{}, validsie-{}, rootfunc-{}, batchsize-{}'.format(FLAGS.trainsize,
                                                                                        FLAGS.validationsize,
                                                                                        FLAGS.function,
                                                                                        FLAGS.batchsize))
        for epoch in range(FLAGS.num_epochs):
            for x, y in train_iter:
                if x.shape[0] != y.shape[0]:
                    raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))
                # 10 negative GO samples per example for contrastive training.
                negatives = get_negatives(y, 10)
                _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                            feed_dict={decoder.ys_: y[:, :FLAGS.maxnumfuncs],
                                                       encoder.xs_: x,
                                                       decoder.negsamples: negatives,
                                                       decoder.istraining: [True]})
                train_writer.add_summary(summary, step)
                log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                step += 1
            log.info('beginning validation')
            prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
            log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                             np.round(prec, 2),
                                                                             np.round(recall, 2),
                                                                             np.round(f1, 2)))
            if np.round(f1,2) >= (bestf1):
                bestf1 = np.round(f1,2)
                wait = 0
                log.info('saving meta graph')
                #ipdb.set_trace()
                chkpt.save(sess, os.path.join(FLAGS.outputdir, modelsavename,
                                              'model_{}_{}'.format(FLAGS.function, step)),
                           global_step=step, write_meta_graph=metagraphFlag)
                metagraphFlag = True
            else:
                wait += 1
                if wait > maxwait:
                    log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                    break
            # Re-run validation over the training iterator to log train error.
            train_iter.reset()
            prec, recall, f1 = validate(train_iter, sess, encoder, decoder, None)
            log.info('training error,epoch-{}, precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                                         np.round(prec, 2),
                                                                                         np.round(recall, 2),
                                                                                         np.round(f1, 2)))
            train_iter.reset()
        log.info('testing model')
        test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                     dataloader=data, functype=FLAGS.function,
                                     featuretype='onehot', onlyLeafNodes=True,
                                     numfuncs=FLAGS.maxnumfuncs)
        prec, recall, f1 = predict_evaluate(test_dataiter, os.path.join(FLAGS.outputdir, modelsavename))
        log.info('test results')
        log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
    data.close()
# Fine-tuning setup for a BERT-based emoji classifier.
from utils.bert import get_config, BertModel, BertForEmoji, set_learned_params
from torch import optim, nn
import torch
from utils.dataloader import DataLoader
from utils.train import train_model

# Build the data pipeline (sequence length 256, mini-batches of 32).
train_dl, val_dl, TEXT, dataloaders_dict = DataLoader(max_length=256, batch_size=32)

# Load the model-configuration JSON file into a config object.
config = get_config(file_path="./weights/bert_config.json")

# Create the base BERT model.
net_bert = BertModel(config)

# Load the pretrained parameters into the BERT model.
net_bert = set_learned_params(net_bert, weights_path="./weights/pytorch_model.bin")

net = BertForEmoji(net_bert)

# Switch to training mode.
net.train()

# Compute gradients only for the last BertLayer module and the added
# classification head: first freeze everything, then unfreeze the final
# encoder layer. (The classifier head is presumably unfrozen in code that
# follows below this chunk — confirm.)
for name, param in net.named_parameters():
    param.requires_grad = False

for name, param in net.bert.encoder.layer[-1].named_parameters():
    param.requires_grad = True
def main(argv):
    """Train and evaluate the CNN-encoder / GORNN-decoder GO-annotation model.

    Loads GO label embeddings and feature maps from FLAGS.data, trains with
    negative sampling and F1-based early stopping, checkpoints the best model
    under FLAGS.outputdir, and finally evaluates on the test split.
    """
    goids = GODAG.initialize_idmap(None, None)
    # GO_MAT = GODAG.get_fullmat(goids)
    # log.info('GO Matrix shape - {}'.format(GO_MAT.shape))
    # GO_MAT = np.vstack([np.zeros(GO_MAT.shape[1]), GO_MAT])
    labelembedding = load_labelembedding(os.path.join(FLAGS.data, 'goEmbeddings.txt'), goids)
    # +1: embedding table presumably carries an extra row (padding/unknown id) — confirm.
    assert(labelembedding.shape[0] == (len(goids) + 1)) , 'label embeddings and known go ids differ'
    labelembeddingsize = labelembedding.shape[1]
    FeatureExtractor.load(FLAGS.data)
    log.info('Loaded amino acid and ngram mapping data')
    data = DataLoader()
    # Timestamped directory name so repeated runs don't overwrite checkpoints.
    modelsavename = 'savedmodels_{}'.format(int(time.time()))
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                      dataloader=data, functype=FLAGS.function, featuretype='ngrams',
                                      onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)
        # numfiles is derived from train size * batch size assuming ~250k examples per file.
        train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen, dataloader=data,
                                  numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                  functype=FLAGS.function, featuretype='ngrams',
                                  onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)
        encoder = CNNEncoder(vocab_size=len(FeatureExtractor.ngrammap) + 1, inputsize=train_iter.expectedshape).build()
        log.info('built encoder')
        decoder = GORNNDecoder(encoder.outputs, labelembedding, numfuncs=FLAGS.maxnumfuncs).build()
        log.info('built decoder')
        init = tf.global_variables_initializer()
        init.run(session=sess)
        chkpt = tf.train.Saver(max_to_keep=4)
        train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
        # Early-stopping state: stop after `maxwait` validations without improvement.
        step = 0
        maxwait = 1
        wait = 0
        bestf1 = 0
        bestthres = 0
        metagraphFlag = True
        log.info('starting epochs')
        log.info('params - trainsize-{}, validsie-{}, rootfunc-{}, batchsize-{}'.format(FLAGS.trainsize, FLAGS.validationsize, FLAGS.function, FLAGS.batchsize))
        for epoch in range(FLAGS.num_epochs):
            for x, y in train_iter:
                # Guard against mismatched feature/label batches.
                if x.shape[0] != y.shape[0]:
                    raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))
                # 10 negative samples per example for the negative-sampling loss.
                negatives = get_negatives(y, 10)
                _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                            feed_dict={decoder.ys_: y, encoder.xs_: x,
                                                       decoder.negsamples: negatives})
                train_writer.add_summary(summary, step)
                log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                step += 1
            log.info('beginning validation')
            prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
            log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch, np.round(prec, 2), np.round(recall, 2), np.round(f1, 2)))
            # Checkpoint only on meaningful (>1e-3) F1 improvement; the meta graph
            # is written just once (first save), hence metagraphFlag flips to False.
            if f1 > (bestf1 + 1e-3):
                bestf1 = f1
                wait = 0
                chkpt.save(sess, os.path.join(FLAGS.outputdir, modelsavename, 'model_{}_{}'.format(FLAGS.function, step)), global_step=step, write_meta_graph=metagraphFlag)
                metagraphFlag = False
            else:
                wait += 1
                if wait > maxwait:
                    log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                    break
            train_iter.reset()
        log.info('testing model')
        test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                     dataloader=data, functype=FLAGS.function, featuretype='ngrams',
                                     onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)
        # NOTE(review): bestthres is never updated from 0 in this function — verify
        # that predict_evaluate's threshold argument is intended to stay at 0.
        prec, recall, f1 = predict_evaluate(test_dataiter, [bestthres], os.path.join(FLAGS.outputdir, modelsavename))
        log.info('test results')
        log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
        data.close()
# Training entry script for a Transformer seq2seq model.
from utils.dataloader import DataLoader
import torch
from model import model_utils
from optimizer.optimizer import NoamOpt
from train.trainer import Trainer

# Model hyper-parameters.
hidden_size = 256
num_encoder = 6
num_decoder = 6
n_head = 8
pf_dim = 1024
drop_out = 0.5

# Use the GPU when available. FIX: the original had a leftover
# `device = 'cpu'` debug override on the next line that unconditionally
# clobbered this detection and forced CPU execution — removed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataloader = DataLoader(device)
# Batch size 64 for train/valid/test iterators.
train_iterator, valid_iterator, test_iterator = dataloader.load_data(64)

model = model_utils.create_model(dataloader.src_vocab_size(), dataloader.trg_vocab_size(),
                                 hidden_size, num_encoder, num_decoder, n_head, pf_dim,
                                 drop_out, dataloader.get_pad_idx(), device)
print(model_utils.count_parameters(model))
model_utils.init(model)

# Adam wrapped in a Noam-style warmup schedule (presumably per
# "Attention Is All You Need": lr=0 here because NoamOpt drives the rate).
optimizer = NoamOpt(hidden_size, 1, 2000,
                    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

trainer = Trainer(train_iterator, valid_iterator, model, optimizer,
                  dataloader.get_pad_idx(), device)
# Train for 5 epochs.
trainer.train(5)

# for i, batch in enumerate(train_iterator):
#     src = batch.src.permute(1, 0).to(device)
#     trg = batch.trg.permute(1, 0).to(device)
# NOTE(review): the next two lines are the tail of a metrics helper whose `def`
# is above this chunk — they close a call and return per-length and global
# metrics; left untouched.
targets_global, predictions_global)
    return metric_per_length, lengths, metric_global

# Evaluation setup: build test DataLoaders with increasing corruption levels.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

batch_size = 248

# One test iterator per number of masked entries.
val_iters_mask = []
num_corrupted = [1, 2, 3, 4, 5, 20]
for masks in num_corrupted:
    test_set = QM9Dataset(data='data/adjacency_matrix_test.pkl', num_masks=masks)
    test_dl = DataLoader(test_set, batch_size=batch_size)
    val_iters_mask.append(test_dl)

# One test iterator per number of fake (corrupted) entries.
val_iters_fake = []
for fakes in num_corrupted:
    test_set = QM9Dataset(data='data/adjacency_matrix_test.pkl', num_fake=fakes)
    test_dl = DataLoader(test_set, batch_size=batch_size)
    val_iters_fake.append(test_dl)

# Checkpoint filenames encode their training hyper-parameters.
# NOTE(review): this list continues past this chunk — it is closed below.
model_names = [
    "Transformer_num_masks=1_num_fake=0_num_same=0_num_layers=4_num_heads=3_embedding_dim=64_dropout=0.0_lr=0.001_edge_encoding=1_epsilon_greedy=0.2.pt",
    "BagOfWords_num_masks=1_num_fake=0_num_same=0_num_layers=4_embedding_dim=64_lr=0.0005_epsilon_greedy=0.2_bow_type=1.pt",
    "BagOfWords_num_masks=1_num_fake=0_num_same=0_num_layers=4_embedding_dim=64_lr=0.0005_epsilon_greedy=0.2_bow_type=2.pt",
def main(argv):
    """Train and evaluate the KerasDeepGO sequence model for one GO function type.

    Reads the function list for FLAGS.function from FLAGS.resources, trains
    with checkpointing (best val_loss) and early stopping, saves the model
    architecture as JSON plus weights as HDF5 under FLAGS.outputdir, and
    reports precision/recall/F1 on the test split.
    """
    funcs = pd.read_pickle(os.path.join(FLAGS.resources, '{}.pkl'.format(FLAGS.function)))['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)
    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')
    # Optional pretrained ngram embedding; switches the feature type to ngrams.
    pretrained = None
    featuretype = 'onehot'
    if FLAGS.pretrained != '':
        log.info('loading pretrained embedding')
        pretrained, ngrammap = load_pretrained_embedding(FLAGS.pretrained)
        FeatureExtractor.ngrammap = ngrammap
        featuretype = 'ngrams'
    with tf.Session() as sess:
        data = DataLoader(filename=FLAGS.inputfile)
        log.info('initializing validation data')
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                      dataloader=data, functype=FLAGS.function, featuretype='ngrams',
                                      numfuncs=len(funcs), all_labels=False, autoreset=True)
        log.info('initializing train data')
        train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen, dataloader=data, numfiles=4,
                                  numfuncs=len(funcs), functype=FLAGS.function, featuretype='ngrams',
                                  all_labels=False, autoreset=True)
        # Vocabulary size depends on the chosen feature type (+1 presumably for
        # a padding/unknown index — confirm against FeatureExtractor).
        vocabsize = ((len(FeatureExtractor.ngrammap) + 1) if featuretype == 'ngrams'
                     else (len(FeatureExtractor.aminoacidmap) + 1))
        model = KerasDeepGO(funcs, FLAGS.function, GODAG, train_iter.expectedshape,
                            vocabsize, pretrained_embedding=pretrained).build()
        log.info('built encoder')
        log.info('built decoder')
        keras.backend.set_session(sess)
        log.info('starting epochs')

        model_path = FLAGS.outputdir + 'models/model_seq_' + FLAGS.function + '.h5'
        # Keep only the weights of the best model by validation loss.
        checkpointer = keras.callbacks.ModelCheckpoint(filepath=model_path, verbose=1,
                                                       save_best_only=True, save_weights_only=True)
        earlystopper = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1)

        model_jsonpath = FLAGS.outputdir + 'models/model_{}.json'.format(FLAGS.function)
        # FIX: use a context manager so the file handle is closed even if
        # to_json()/write() raises (original used open/write/close).
        with open(model_jsonpath, 'w') as f:
            f.write(model.to_json())

        model.fit_generator(train_iter, steps_per_epoch=FLAGS.trainsize, epochs=5,
                            validation_data=valid_dataiter, validation_steps=FLAGS.validationsize,
                            max_queue_size=128, callbacks=[checkpointer, earlystopper])
        valid_dataiter.close()
        train_iter.close()

        log.info('initializing test data')
        test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                     seqlen=FLAGS.maxseqlen, dataloader=data, numfiles=4,
                                     numfuncs=len(funcs), functype=FLAGS.function,
                                     featuretype='ngrams', all_labels=True)
        prec, recall, f1 = predict_evaluate(test_dataiter, model_jsonpath, model_path)
        log.info('testing error, prec-{}, recall-{}, f1-{}'.format(
            np.round(prec, 3), np.round(recall, 3), np.round(f1, 3)))
        data.close()