def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    """Generate test data, then train and/or evaluate a Simon classifier.

    When ``should_train`` is true, a fresh ``Encoder`` is built from the
    categories listed in ``Categories.txt``, the model is trained, and the
    resulting configuration (encoder + best checkpoint) is saved under
    ``pretrained_models/``. Otherwise the encoder (and, if needed, the
    checkpoint name) is loaded from ``execution_config`` and the model is
    only evaluated.

    Args:
        checkpoint: Checkpoint passed to ``Classifier.load_weights``. If it
            is ``None`` in evaluation mode, ``config['checkpoint']`` from
            the loaded configuration is used instead.
        data_count: First element of the size tuple handed to
            ``DataGenerator.gen_test_data``.
        data_cols: Second element of that size tuple.
        should_train: Train (and save config) when true; evaluate a
            previously saved configuration when false.
        nb_epoch: Number of training epochs; also used to derive the
            optimizer's decay rate.
        null_pct: When greater than 0, ``DataGenerator.add_nulls_uniform``
            is applied to the raw data with this value.
        try_reuse_data: Forwarded to ``DataGenerator.gen_test_data``.
        batch_size: Per-GPU batch size; it is multiplied by the number of
            GPUs before being passed to training.
        execution_config: Configuration name passed to
            ``Simon.load_config``; required when ``should_train`` is false.

    Raises:
        TypeError: If ``should_train`` is false and ``execution_config`` is
            ``None``.
    """
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    checkpoint_dir = "pretrained_models/"
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with open('Categories.txt', 'r') as f:
        categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    categories = sorted(categories)
    print(categories)

    raw_data, header = DataGenerator.gen_test_data(
        (data_count, data_cols), try_reuse_data)
    print(raw_data)

    # transpose the data and normalise it to lower-case unicode strings
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if null_pct > 0:
        # return value is ignored; presumably this mutates raw_data in place
        # -- TODO confirm against DataGenerator
        DataGenerator.add_nulls_uniform(raw_data, null_pct)

    config = {}
    if should_train:
        encoder = Encoder(categories=categories)
        encoder.process(raw_data, max_cells)
    else:
        if execution_config is None:
            raise TypeError(
                "execution_config is required when should_train is False")
        config = Simon({}).load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        if checkpoint is None:
            checkpoint = config['checkpoint']

    # encode the data
    X, y = encoder.encode_data(raw_data, header, maxlen)
    # the encoder's own cell limit takes precedence over the local default
    max_cells = encoder.cur_max_cells

    classifier = Simon(encoder=encoder)
    if should_train:
        data = classifier.setup_test_sets(X, y)
    else:
        # ad-hoc holder exposing X_test / y_test as attributes for evaluation
        data = type('data_type', (object,), {'X_test': X, 'y_test': y})

    # Debug sample: show row 2 when it exists, otherwise the last row, so a
    # small data set does not abort the run with an IndexError.
    if len(X) > 0:
        sample_idx = min(2, len(X) - 1)
        print('Sample chars in X:{}'.format(X[sample_idx, 0:10]))
        print('y:{}'.format(y[sample_idx]))

    # need to know number of fixed categories to create model
    category_count = y.shape[1]
    print('Number of fixed categories is :')
    print(category_count)

    model = classifier.generate_model(maxlen, max_cells, category_count)
    classifier.load_weights(checkpoint, config, model, checkpoint_dir)

    # using multi-gpu capabilities
    ngpus = 2  # specify number of GPUS
    parallel_model = classifier.multi_gpu_model(model, gpus=ngpus)

    learning_rate = 0.001 * ngpus
    # decay_rate = learning_rate / epochs; fall back to no decay when the
    # epoch count is 0/None (e.g. evaluation-only runs) instead of raising.
    decay_rate = learning_rate / nb_epoch if nb_epoch else 0.0
    adam = keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999,
                                 epsilon=1e-08, decay=decay_rate)
    batch_size = ngpus * batch_size
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=adam, metrics=['binary_accuracy'])

    if should_train:
        start = time.time()
        history = classifier.train_model(
            batch_size, checkpoint_dir, parallel_model, nb_epoch, data)
        end = time.time()
        print("Time for training is %f sec" % (end - start))
        # persist the encoder together with the best checkpoint so a later
        # evaluation-only run can reload both via execution_config
        config = {
            'encoder': encoder,
            'checkpoint': classifier.get_best_checkpoint(checkpoint_dir),
        }
        classifier.save_config(config, checkpoint_dir)
        classifier.plot_loss(history)  # comment out on docker images...

    pred_headers = classifier.evaluate_model(
        max_cells, parallel_model, data, encoder, p_threshold)
    print("DEBUG::The predicted headers are:")
    print(pred_headers)
    print("DEBUG::The actual headers are:")
    print(header)