def main(args):
    """Train the CIFAR-10 classifier described by the CLI arguments.

    Creates timestamped model/log directories, loads train/test image data,
    trains with SGD (TensorBoard + early stopping) and saves the model.

    :param args: parsed argparse namespace (fields echoed below)
    """
    print('---------------------ARGS---------------------')
    print('--train_batch_size : %d' % args.train_batch_size)
    print('--train_epoch : %d' % args.train_epoch)
    print('--model_base_dir : %s' % args.model_base_dir)
    print('--log_base_dir : %s' % args.log_base_dir)
    print('--learning_rate : %f' % args.learning_rate)
    print('--width : %d' % args.width)
    print('--height : %d' % args.height)
    print('--save_steps : %d' % args.save_steps)
    print('--val_steps : %d' % args.val_steps)
    print('--log_steps : %d' % args.log_steps)
    print('--train_imgs_dir : %s' % args.train_imgs_dir)
    print('--test_imgs_dir : %s' % args.test_imgs_dir)
    print('---------------------END----------------------')

    # Timestamped sub-directories so repeated runs never overwrite each other.
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir_today = os.path.join(os.path.expanduser(args.log_base_dir), subdir)
    if not os.path.isdir(log_dir_today):
        os.makedirs(log_dir_today)
    model_dir = os.path.join(os.path.expanduser(args.model_base_dir), subdir)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    print('Model directory: %s' % model_dir)
    print('Log directory : %s' % log_dir_today)

    print('Load train and validation data...')
    train_imgs_dir_list = [os.path.join(args.train_imgs_dir, l)
                           for l in os.listdir(os.path.expanduser(args.train_imgs_dir))]
    test_imgs_dir_list = [os.path.join(args.test_imgs_dir, l)
                          for l in os.listdir(os.path.expanduser(args.test_imgs_dir))]
    # BUG FIX: was `... != 0 or ... != 0`, which passed when one of the two
    # directories was empty; both must contain images.
    assert len(train_imgs_dir_list) != 0 and len(test_imgs_dir_list) != 0
    train_x, train_y = utils.prepare_dataset(train_imgs_dir_list)
    test_x, test_y = utils.prepare_dataset(test_imgs_dir_list)
    train_ds = utils.create_inputs(train_x, train_y, batch_size=args.train_batch_size)
    test_ds = utils.create_inputs(test_x, test_y, batch_size=args.train_batch_size,
                                  train_data=False)

    print('Create model...')
    model = create_model()

    print('Callbacks for tensorboard...')
    tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir_today, histogram_freq=1,
                                        write_images=True)
    st = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

    print('Compile model...')
    # BUG FIX: `learning_rate` is not a Model.compile() argument; it must be
    # configured on the optimizer instance itself.
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=args.learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    print('Train model with args...')
    # BUG FIX: the EarlyStopping callback was created but never registered.
    model.fit(train_ds,
              epochs=args.train_epoch,
              validation_data=test_ds,
              callbacks=[tb, st])

    # NOTE(review): 'cafar10.h5' looks like a 'cifar10' typo, but it is a
    # runtime filename — kept for compatibility with downstream loaders.
    print('Done, save model to: %s' % os.path.join(model_dir, 'cafar10.h5'))
    model.save(os.path.join(model_dir, 'cafar10.h5'))
def plot_main_graph(ticker):
    """Build the main price chart for *ticker*: history plus a 7-day forecast."""
    history_window = 30
    forecast_horizon = 7

    frame = load_ticker_data(ticker, update=True, start_history='2012-01-01')
    closes = frame['adjclose']

    net = keras.models.load_model('../models/keras_tuned_model.h5')
    scaled, _, col_scaler = prepare_dataset(frame, 'adjclose')

    # Predict from the most recent complete window of the series.
    last_seq = scaled.shape[0] - 1 - history_window
    batch = prepare_test_batch(scaled, last_seq, None, history_window).take(1)
    forecast = return_original_scale(net.predict(batch)[0], col_scaler['adjclose'])

    # Forecast dates begin the day after the last known row, skipping
    # weekends and US federal holidays.
    trading_days = CustomBusinessDay(calendar=USFederalHolidayCalendar())
    try:
        first_day = datetime.strptime(frame.index[-1], '%Y-%m-%d') + timedelta(days=1)
    except TypeError:
        # Index already holds datetime-like values rather than strings.
        first_day = frame.index[-1] + timedelta(days=1)
    future_index = pd.date_range(start=first_day, periods=forecast_horizon,
                                 freq=trading_days)
    forecast_series = pd.Series(data=forecast.flatten(), index=future_index)

    closes.index = pd.DatetimeIndex(closes.index)
    return plot_ticker_ts(ticker, closes, forecast_series)
def prepare_dataset(self):
    """Build the validation dataset for the configured source/target language pair."""
    src_file = os.path.join(self.data_path,
                            "validate_{}".format(self.languages["source"]))
    tgt_file = os.path.join(self.data_path,
                            "validate_{}".format(self.languages["target"]))
    # Delegate tokenisation/conversion to the shared utility, passing the
    # special symbol ids this instance was configured with.
    return utils.prepare_dataset(src_file, tgt_file,
                                 self.source_converter, self.target_converter,
                                 self.GO, self.EOS, self.UNK)
def classify(**args):
    """
    Main method that prepares dataset, builds an averaged ensemble of MLPs,
    executes training and displays results.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the test set
    """
    # only allow print-outs if execution has no repetitions
    allow_print = args['repetitions'] == 1
    # determine classification targets and parameters to construct datasets properly
    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(args['dataset_choice'], cls_target, args['batch_size'])

    print('\n\tTask: Classify «{}» using «{}»\n'.format(cls_str, d['data_str']))
    print_dataset_info(d)

    # build ensemble: all sub-models share a single input layer
    inputs = Input(shape=(7810, ))
    models = [
        build_model(i, d['num_classes'], inputs=inputs)
        for i in range(args['num_models'])
    ]

    # combine outputs of all models by averaging their predictions
    y = Average()([m.outputs[0] for m in models])
    model = Model(inputs, outputs=y, name='multiple')
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    if allow_print:
        model.summary()
        print('')
        plot_model(model, to_file='img/multiple_mlp.png')

    model.fit(d['train_data'],
              steps_per_epoch=d['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=d['class_weights'])

    # evaluation model
    print('Evaluate ...')
    # BUG FIX: the model was re-compiled here with identical settings, which
    # needlessly resets optimizer state; evaluate the trained model as-is.
    model.evaluate(d['eval_data'], steps=d['test_steps'], verbose=1)

    # predict on testset and calculate classification report and confusion matrix for diagnosis
    print('Test ...')
    pred = model.predict(d['test_data'], steps=d['test_steps'])

    if allow_print:
        diagnose_output(d['test_labels'], pred.argmax(axis=1), d['classes_trans'])

    return balanced_accuracy_score(d['test_labels'], pred.argmax(axis=1))
def run(train_csv, imagedir, model_path):
    """Evaluate a trained Mask R-CNN on its own training set and report mAP."""
    rows = utils.load(train_csv)
    class_names = utils.determine_classes(rows)
    dataset = prepare_dataset(rows, imagedir, class_names)

    config = PredictionConfig()
    net = MaskRCNN(mode="inference", model_dir=imagedir, config=config)
    net.load_weights(model_path, by_name=True)

    score = evaluate_model(dataset, net, config, class_names, model_path)
    print("Train mAP: %.3f" % score)
def classify(**args):
    """
    Prepare the dataset, fit a DecisionTreeClassifier and report its results.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the test set
    """
    # print-outs (and tree visualisation) only for single, non-repeated runs
    allow_print = args['repetitions'] == 1
    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(args['dataset_choice'],
                        cls_target,
                        args['batch_size'],
                        train_shuffle_repeat=False,
                        categorical_labels=False)

    print('\n\tTask: Classify «{}» using «{}» with DecisionTreeClassifier\n'.
          format(cls_str, d['data_str']))
    print_dataset_info(d)

    def unbatch(batches, steps, desc):
        # Drain a batch generator into a flat sample list. Careful with RAM.
        return [s for b in tqdm(batches, total=steps, desc=desc) for s in b[0]]

    tree = DecisionTreeClassifier(class_weight='balanced')

    train_samples = unbatch(d['train_data'], d['train_steps'], 'prep_train')
    tree.fit(train_samples, d['train_labels'])
    del train_samples

    test_samples = unbatch(d['test_data'], d['test_steps'], 'prep_test')
    pred = tree.predict(test_samples)
    del test_samples

    if allow_print:
        # visualise decision tree, from datacamp.com/community/tutorials/decision-tree-classification-python
        buffer = StringIO()
        export_graphviz(tree, out_file=buffer, filled=True, rounded=True,
                        special_characters=True)
        pydotplus.graph_from_dot_data(buffer.getvalue()).write_pdf('img/decision_tree.pdf')
        diagnose_output(d['test_labels'], pred, d['classes_trans'])

    return balanced_accuracy_score(d['test_labels'], pred)
def run(train_csv, test_csv, imagedir, model_path):
    """Evaluate a trained Mask R-CNN on the held-out test set."""
    # Classes come from the training csv so train/test label maps agree.
    train_rows = utils.load(train_csv)
    class_names = utils.determine_classes(train_rows)

    test_rows = utils.load(test_csv, is_train=False)
    dataset = prepare_dataset(test_rows, imagedir, class_names)

    config = PredictionConfig()
    net = MaskRCNN(mode="inference", model_dir=imagedir, config=config)
    net.load_weights(model_path, by_name=True)

    evaluate_model(dataset, net, config, class_names, model_path)
    print("Done!")
def run(csv, model_dir, model_file, img_name):
    """Visualise Mask R-CNN predictions for a single image file."""
    rows = utils.load(csv)
    class_names = utils.determine_classes(rows)
    dataset = prepare_dataset(rows, model_dir, class_names)

    config = PredictionConfig()
    net = MaskRCNN(mode="inference", model_dir=model_dir, config=config)
    net.load_weights(model_file, by_name=True)

    plot(dataset, net, config, f"{model_dir}/{img_name}", class_names)
    pyplot.show()
def run(csv, model_dir, model_file, img_name):
    """Plot predictions for the dataset row whose image column matches *img_name*."""
    rows = utils.load(csv)
    class_names = utils.determine_classes(rows)
    dataset = prepare_dataset(rows, model_dir, class_names)

    config = PredictionConfig()
    net = MaskRCNN(mode="inference", model_dir=model_dir, config=config)
    net.load_weights(model_file, by_name=True)

    # Look up the row index of the requested image inside the loaded csv.
    target_idx = rows[rows["image"] == img_name].index[0]
    plot(dataset, net, config, class_names, 0, target_idx)
    pyplot.show()
def load_dataset():
    """Load the train/validation splits and one-hot encode their labels."""
    train_set, valid_set, _ = prepare_dataset(dataset_dir)
    train_x, train_y = train_set
    valid_x, valid_y = valid_set
    # Labels become one-hot vectors over the 10 classes.
    return {
        "train_x": train_x,
        "train_y": get_one_hot(train_y, 10),
        "valid_x": valid_x,
        "valid_y": get_one_hot(valid_y, 10)
    }
def __init__(self, data_type, config, vocab, is_train):
    """Load and prepare the CoNLL-2003 split named by *data_type*."""
    self.config = config
    self.vocab = vocab
    self.is_train = is_train

    filename = DatasetConll2003.get_data_file(data_type, config)
    raw_sentences = load_sentences(os.path.join(config.data_dir, filename),
                                   config.label_type)
    self.data = prepare_dataset(raw_sentences, self.vocab, config)

    # Iteration bookkeeping.
    self.i = 0
    self.epoch = 0
    self.iterations = 0
def classify(**args):
    """
    Prepare the dataset, train the baseline MLP and report its results.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the test set
    """
    # Print-outs only make sense for single, non-repeated runs.
    allow_print = args['repetitions'] == 1

    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(args['dataset_choice'], cls_target,
                        args['batch_size'], args['norm_choice'])
    print('\n\tTask: Classify «{}» using «{}»\n'.format(cls_str, d['data_str']))
    print_dataset_info(d)

    model = build_model(0, d['num_classes'], name='baseline_mlp', new_input=True)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    if allow_print:
        model.summary()
        print('')

    # train and evaluate
    model.fit(d['train_data'],
              steps_per_epoch=d['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=d['class_weights'])
    model.evaluate(d['eval_data'], steps=d['test_steps'], verbose=1)

    # predict on testset and calculate classification report and confusion matrix for diagnosis
    pred = model.predict(d['test_data'], steps=d['test_steps'])
    if allow_print:
        diagnose_output(d['test_labels'], pred.argmax(axis=1), d['classes_trans'])

    return balanced_accuracy_score(d['test_labels'], pred.argmax(axis=1))
def run(path_to_csv="train.csv"):
    """Train Mask R-CNN heads on the annotated dataset.

    Hyper-parameters come from wandb.config when wandb is available,
    otherwise from TrainingConfig / the local fallback below.

    :param path_to_csv: csv file with the training annotations
    """
    data = utils.load(path_to_csv)
    classes = utils.determine_classes(data)
    train, test = train_test_split(data, test_size=0.2)
    train_set = prepare_dataset(train, "./train/", classes)
    valid_set = prepare_dataset(test, "./train/", classes)
    print('Train: %d' % len(train_set.image_ids))
    print('Test: %d' % len(valid_set.image_ids))

    config = TrainingConfig()
    callbacks = []
    # Fallback epoch count for the non-wandb path.
    # NOTE(review): value chosen as a sane default — confirm against sweeps.
    epochs = 10
    if wandb_found:
        # BUG FIX: wandb.init() was called unconditionally before this guard,
        # and EPOCHS was read from wandb.config even when wandb was missing,
        # which crashed the non-wandb path.
        wandb.init()
        callbacks.append(WandbCallback())
        config.STEPS_PER_EPOCH = wandb.config.STEPS_PER_EPOCH
        config.LEARNING_RATE = wandb.config.LEARNING_RATE
        config.LEARNING_MOMENTUM = wandb.config.LEARNING_MOMENTUM
        config.WEIGHT_DECAY = wandb.config.WEIGHT_DECAY
        epochs = wandb.config.EPOCHS
    else:
        # configure params through directly editing the TrainingConfig class
        pass

    model = MaskRCNN(mode="training", model_dir="train", config=config)
    # COCO weights minus the head layers, which depend on our class count.
    model.load_weights("mask_rcnn_coco.h5",
                       by_name=True,
                       exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
                                "mrcnn_bbox", "mrcnn_mask"])
    model.train(train_set,
                valid_set,
                learning_rate=config.LEARNING_RATE,
                epochs=epochs,
                layers="heads",
                custom_callbacks=callbacks)
def classify(**args):
    """
    Prepare the heatmap dataset, train the 64-shot MLP and compute per-shot
    accuracy heatmaps.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the test set
    """
    # Print-outs only make sense for single, non-repeated runs.
    allow_print = args['repetitions'] == 1

    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(args['dataset_choice'],
                        cls_target,
                        args['batch_size'],
                        args['norm_choice'],
                        mp_heatmap=True)
    print('\n\tTask: Classify «{}» using «{}»\n'.format(cls_str, d['data_str']))
    print_dataset_info(d)

    model = build_model(0, d['num_classes'], name='64shot_mlp', new_input=True)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    if allow_print:
        model.summary()
        print('')

    # train and evaluate
    model.fit(x=d['train_data'],
              steps_per_epoch=d['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=d['class_weights'])
    model.evaluate(d['eval_data'], steps=d['test_steps'], verbose=1)

    probs = model.predict(d['test_data'], steps=d['test_steps'], verbose=1)
    # instead of argmax, reduce list to only on-target predictions to see how
    # accurate the model judged each shot
    target_preds = [probs[idx][label] for idx, label in enumerate(d['test_labels'])]
    compute_accuracy_heatmaps(d, target_preds, cls_target, args['epochs'])

    return balanced_accuracy_score(d['test_labels'], probs.argmax(axis=1))
def main():
    """Extract fc7 features from several pretrained nets and pickle them
    (train and test splits of the current dataset)."""
    root = '.'
    current_dataset = 'indoors'
    out_dir_tr = os.path.join(root, 'out', current_dataset, 'feature_data')
    out_dir_test = os.path.join(root, 'out', current_dataset, 'feature_data_test')
    batch_size = 1

    nets_and_features = create_dict_nets_and_features()
    dataset, stats, number_of_classes = misc(root, current_dataset)
    dataset_train_temp, dataset_test_temp = prepare_dataset(dataset)

    # BUG FIX: `xrange` is Python-2 only; `range` behaves identically here.
    for j in range(1):
        dataset_train = dataset_train_temp + '_' + str(j)
        dataset_test = dataset_test_temp + '_' + str(j)
        list_of_net_names = ['resnet18', 'resnet152', 'densenet121', 'densenet201']
        dataset_size_train, dataset_size_test = dataset_size(current_dataset)[0], dataset_size(current_dataset)[1]

        for i, net_type in enumerate(list_of_net_names):
            net_name = net_type.split("_")[0]
            net, feature_size = fe.get_net_info(net_name, number_of_classes, nets_and_features)
            train_loader = prepare_loader_val(dataset_train, stats, batch_size)
            test_loader = prepare_loader_val(dataset_test, stats, batch_size)

            # Densenets need dense=1, resnets dense=0; the two original call
            # branches were otherwise identical, so fold them into one flag.
            dense = 1 if net_type[:3] == 'den' else 0
            fc7_features_tr, labels_tr, net_tr, fnames_tr = fe.extract_features_train(
                net, feature_size, dataset_size_train, train_loader, dense=dense)
            fc7_features_test, labels_test, net_test, fnames_test = fe.extract_features_train(
                net, feature_size, dataset_size_test, test_loader, dense=dense)

            # store the name of the net, the dataset on which we are going to
            # use it, and the extracted features / labels / filenames
            net_info_tr = [net_name, labels_tr, fc7_features_tr, fnames_tr]
            with open(os.path.join(out_dir_tr, net_name + '_' + str(j) + '.pickle'), 'wb') as f:
                pickle.dump(net_info_tr, f, pickle.HIGHEST_PROTOCOL)

            net_info_test = [net_name, labels_test, fc7_features_test, fnames_test]
            with open(os.path.join(out_dir_test, net_name + '_' + str(j) + '.pickle'), 'wb') as f:
                pickle.dump(net_info_test, f, pickle.HIGHEST_PROTOCOL)
def run_mlp(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train an MLP via Lasagne-style iteration functions, printing per-epoch
    losses and validation accuracy.

    Relies on the module-level ``num_epochs`` for the stopping criterion.
    """
    # BUG FIX: Python-2 `print` statements replaced with print() calls so the
    # function is valid under Python 3 (output is unchanged under Python 2).
    print('PREPARING DATASET')
    dataset = utils.prepare_dataset(X_train, y_train, X_val, y_val, X_test, y_test)

    print('BUILDING MODEL')
    output_layer = build_model(
        input_dim=dataset['input_dim'],
        output_dim=dataset['output_dim'],
    )

    print('CREATING ITER FUNCS')
    iter_funcs = create_iter_functions(dataset, output_layer)

    for epoch in train(iter_funcs, dataset):
        print("Epoch %d of %d" % (epoch['number'], num_epochs))
        print("\ttraining loss:\t\t%.6f" % epoch['train_loss'])
        print("\tvalidation loss:\t\t%.6f" % epoch['valid_loss'])
        print("\tvalidation accuracy:\t\t%.2f %%" % (epoch['valid_accuracy'] * 100))
        if epoch['number'] >= num_epochs:
            break
def classify(**args):
    """
    Prepare the dataset, train the baseline MLP and print diagnostics.

    :param args: keyword arguments passed from cli parser
    """
    batch_size = 64

    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(args['dataset_choice'], cls_target, batch_size)
    print('\n\tTask: Classify «{}» using «{}»\n'.format(cls_str, d['data_str']))
    print_dataset_info(d)

    model = build_model(0, d['num_classes'], name='baseline_mlp', new_input=True)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # train and evaluate
    model.fit(d['train_data'],
              steps_per_epoch=d['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=d['class_weights'])
    print('Evaluate ...')
    model.evaluate(d['eval_data'], steps=d['test_steps'], verbose=1)

    # predict on testset and calculate classification report and confusion matrix for diagnosis
    print('Test ...')
    pred = model.predict(d['test_data'], steps=d['test_steps'])
    diagnose_output(d['test_labels'], pred.argmax(axis=1), d['classes_trans'])
def __init__(self, indneed):
    """Load the best saved mixture-density network for run *indneed*, predict
    on the test split and compute RMSE / probability diagnostics.

    :param indneed: index into the module-level ``rmsess`` table selecting
        which trained model (and its parameter file) to load
    """
    self.indneed = indneed
    # Best hyper-parameters saved at training time; entries are strings:
    # [0]=feature-set name, [1]=n_components, [4]=first hidden size,
    # [-2]/[-1]=logsigma bounds.
    # NOTE(review): layout inferred from the uses below — confirm against the training script.
    self.bestparas = np.loadtxt(model_path+rmsess[indneed][0]+'/paras/parameters.txt',dtype='str')
    X_train, X_test, y_train, self.y_test = prepare_dataset(featurename=self.bestparas[0])
    #print("Size of features in training data: {}".format(X_train.shape))
    #print("Size of output in training data: {}".format(y_train.shape))
    #print("Size of features in test data: {}".format(X_test.shape))
    #print("Size of output in test data: {}".format(y_test.shape))
    # z-score each array separately, remembering each fitted scaler;
    # scalers[3] (the y_test scaler) is reused below to invert predictions.
    scalers = {}
    datas = [X_train, X_test, y_train, self.y_test]
    for i in range(4):
        datas[i],scalers[i] = preprocess_zscore(datas[i])
    X_train_, X_test_, y_train_, y_test_ = datas
    # Rebuild the network with the saved architecture and restore its weights.
    model = MixtureDensityNetwork(n_input=X_train.shape[1], n_output=2, n_components = int(self.bestparas[1]),n_hiddens=[int(self.bestparas[4]), int(self.bestparas[4])-2, int(self.bestparas[4])-4],logsigma_min=int(self.bestparas[-2]), logsigma_max=int(self.bestparas[-1]))
    model.load_state_dict(torch.load(model_path+rmsess[indneed][0]+'/model'))
    logpi_pred, logsigma_pred, mu_pred = model(torch.Tensor(X_test_))
    #logpi_pred.size(), logsigma_pred.size(), mu_pred.size()
    # Convert the log-space network outputs back into mixture weights,
    # sigmas and means.
    self.pi_reversed, self.sigma_reversed, self.mu_reversed = get_original_parameters(logpi_pred, logsigma_pred, mu_pred)
    #print (self.pi_reversed.shape, self.sigma_reversed.shape, self.mu_reversed.shape)
    # Per-sample point predictions (deterministic and random draw) plus the
    # probability assigned to the deterministic prediction.
    # NOTE(review): the hard-coded 250 assumes the test split has exactly
    # 250 rows — confirm against prepare_dataset.
    self.prediction_xy = np.ndarray([250,2])
    self.prediction_xy_random = np.ndarray([250,2])
    self.probabes = np.ndarray([250])
    for i in tqdm(range(250)):
        _, pred = get_prediction(self.pi_reversed[i], self.mu_reversed[i],self.sigma_reversed[i],int(self.bestparas[1]))
        self.probabes[i]=get_probability(pred,self.pi_reversed[i],self.mu_reversed[i],self.sigma_reversed[i],int(self.bestparas[1]),scalers[3])
        self.prediction_xy[i]=pred
        self.prediction_xy_random[i]=get_prediction_random(self.pi_reversed[i], self.mu_reversed[i], self.sigma_reversed[i],int(self.bestparas[1]))
    # Undo the z-scoring so errors are in original target units.
    self.prediction_xy_reverse = scalers[3].inverse_transform(self.prediction_xy)
    self.prediction_xy_random_reverse = scalers[3].inverse_transform(self.prediction_xy_random)
    # Euclidean distance between prediction and ground truth, per sample.
    self.rmseall = np.sum((self.prediction_xy_reverse-self.y_test)**2,axis=1)**0.5
    self.rmseall_random = np.sum((self.prediction_xy_random_reverse-self.y_test)**2,axis=1)**0.5
    # Standardised copies for comparing error magnitude against probability.
    scaler = StandardScaler()
    self.rmseall_ = scaler.fit_transform(self.rmseall.reshape(-1,1)).ravel()
    self.probabes_ = scaler.fit_transform(self.probabes.reshape(-1,1)).ravel()
    # Tail of the model-builder (head starts above this chunk): binary
    # softmax output head, persist the model to `file` and return it.
    model.add(Dense(2))
    model.add(Activation('softmax'))
    model.save(file)
    return model

## Configuration to run (dataset)
# Dataset to analyse, chosen on the command line.
class_subset = str(sys.argv[1])  # 'CN-AD' 'CN-MCI' 'MDD' 'ADHD' 'ABIDE'
# Start from a full 264x264 connectivity mask (no connections pruned yet).
mask_initial = np.ones((264, 264))
(subject_groups_, X__, Y__, hidden_layer_size, cnn_model) = prepare_dataset(class_subset, mask_initial)
# Feature-source and training toggles for this run.
flags = {
    'USE_CORRELATION_FEATURES': True,
    'USE_DEEPLIFT_FEATURES': True,
    'PERFORM_PRUNING': True,
    'EARLY_STOPPING': True,
    'TRAIN_FROM_SCRATCH': True,
}
print(flags)

## Directory Setup
SOURCE_DIRECTORY = '../../data/cluster_mask/' + class_subset + '/'
TARGET_DIRECTORY = './results/model_results_'
kernel_regularizer=reg( 0.0001) #params.Choice(f'dense_reg_lb_last', reg_lambas)) )(net) net = Dense(num_classes, activation='softmax')(net) model = Model(inputs=inputs, outputs=net, name=model_name) model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']) return model batch_size = 64 # determine classification targets and parameters to construct datasets properly num_classes, cls_target, cls_str = set_classification_targets(0) train_data, test_data, train_labels, test_labels, epoch_steps, _ = prepare_dataset( 2, cls_target, num_classes, batch_size) # list of "class" names used for confusion matrices and validity testing. Not always classes, also subgroups or minerals class_names = [i for i in range(num_classes)] class_weights = class_weight.compute_class_weight('balanced', class_names, train_labels) tuner = RandomSearch( build_model, objective='val_accuracy', max_trials=150, executions_per_trial=3, directory='/home/ben/Dropbox/uni/3_semester/ml/libs-pewpew/results', project_name='tuned_mlp') tuner.search_space_summary(extended=True)
        # Tail of the optimizer loop (head starts above this chunk): query the
        # callback for the current loss and mean gradient magnitude, then
        # apply the two stopping criteria.
        loss_val, avg_grad = self.callback_fn(parameters)
        # Stop when the gradient has effectively vanished.
        if avg_grad < 2e-6:
            break
        # Stop when parameters have converged or the iteration budget is spent.
        if sum(abs(parameters - previous_params)) <= self.epsilon or iteration == self.iterations:
            break
    return parameters


if __name__ == '__main__':
    # Regularisation strength for the CRF training run.
    LAMBDA = 1e-2

    X_train, y_train = prepare_dataset("train_sgd.txt")
    X_test, y_test = prepare_dataset("test_sgd.txt")

    # Results file is keyed by optimizer name and lambda.
    filepath = "%s_%s.txt" % ('GIBBS', LAMBDA)
    callback = Callback(X_train, y_train, filepath, LAMBDA)
    gibbs = SamplingOptimizer(LAMBDA, callback_fn=callback.callback_fn_return_vals)
    opt_params = gibbs.train(X_train)

    # Unpack the flat parameter vector into the CRF weight matrices.
    W = matricize_W(opt_params)
    T = matricize_Tij(opt_params)

    # Decode on the training set and report word/char-level accuracy.
    y_preds = decode_crf(X_train, W, T)
    word_acc, char_acc = compute_word_char_accuracy_score(y_preds, y_train)
    print("Final train accuracy :", "Word =", word_acc, "Char =", char_acc)
# return (loss_avg, jaccard_avg) if __name__ == "__main__": torch.manual_seed(0) np.random.seed(0) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False max_len = 36 epochs = 10 batch_size = 128 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") path = "../oos-eval/data/data_full.json" data, intents_dict = prepare_dataset(path) num_train_steps = int(len(data.query("datatype == 'train'").index) * epochs / batch_size) + 1 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = IntentClassificationModel() model.to(device) train_dataset = IntentData(data.query("datatype == 'train'").reset_index(drop=True), tokenizer, max_len=max_len) valid_dataset = IntentData(data.query("datatype == 'val'").reset_index(drop=True), tokenizer, max_len=max_len) test_dataset = IntentData(data.query("datatype == 'test'").reset_index(drop=True), tokenizer,
def classify(**args):
    """
    Train an MLP on synthetic data, then transfer it to handheld data by
    feeding its pre-classification layer into a second, concatenated model.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the handheld test set
    """
    # only allow print-outs if execution has no repetitions
    allow_print = args['repetitions'] == 1
    # determine classification targets and parameters to construct datasets properly
    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(
        0,  # any synthetic
        cls_target,
        args['batch_size'])

    print('\n\tTask: Classify «{}» using «{}»\n'.format(cls_str, d['data_str']))
    print_dataset_info(d)

    model = build_model(1, d['num_classes'], name='concat_mlp', new_input=True)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    if allow_print:
        model.summary()
        plot_model(model, to_file='img/concat_mlp.png')

    # train and evaluate on the synthetic dataset
    model.fit(d['train_data'],
              steps_per_epoch=d['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=d['class_weights'])
    model.evaluate(d['eval_data'], steps=d['test_steps'], verbose=1)
    del d

    # load handheld dataset for evaluation
    d = prepare_dataset(
        2,  # any handheld
        cls_target,
        args['batch_size'])
    print_dataset_info(d)

    # build model for handheld data, concatenates the output of the last
    # pre-classification layer of the synthetic network
    concat_model = build_model_concat(2, d['num_classes'], concat_model=model)
    concat_model.compile(optimizer='adam', loss='categorical_crossentropy',
                         metrics=['accuracy'])
    if allow_print:
        concat_model.summary()
        plot_model(concat_model, to_file='img/concat_mlp.png')

    concat_model.fit(d['train_data'],
                     steps_per_epoch=d['train_steps'],
                     epochs=args['epochs'],
                     verbose=1,
                     class_weight=d['class_weights'])

    # predict on test set and calculate classification report and confusion matrix for diagnosis
    # BUG FIX: predictions previously came from the untransferred synthetic
    # `model`; the freshly trained `concat_model` is what must be evaluated.
    pred = concat_model.predict(d['test_data'], steps=d['test_steps'])

    if allow_print:
        diagnose_output(d['test_labels'], pred.argmax(axis=1), d['classes_trans'])

    return balanced_accuracy_score(d['test_labels'], pred.argmax(axis=1))
def main(args):
    """
    End-to-end driver for GSN experiments.

    Seeds all RNGs, prepares the dataset and substructure-count encodings,
    then dispatches on args['mode']:
      - 'isomorphism_test': run a randomly-initialized model and count
        non-isomorphic pairs it fails to distinguish;
      - 'train': (cross-validated) training with per-fold metric logging;
      - 'test': evaluate a stored checkpoint per fold.

    :param args: dict of experiment options (parsed CLI arguments)
    :return: dict of summary scores when mode == 'train' and
             args['return_scores'] is set, otherwise None
    """

    ## ----------------------------------- argument processing
    args, extract_ids_fn, count_fn, automorphism_fn, loss_fn, prediction_fn, perf_opt = process_arguments(args)
    evaluator = Evaluator(args['dataset_name']) if args['dataset'] == 'ogb' else None

    ## ----------------------------------- infrastructure
    # Fix every RNG (torch CPU/GPU, cudnn, numpy, python, hashing) so runs are reproducible.
    torch.manual_seed(args['seed'])
    torch.cuda.manual_seed(args['seed'])
    torch.cuda.manual_seed_all(args['seed'])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(args['np_seed'])
    os.environ['PYTHONHASHSEED'] = str(args['seed'])
    random.seed(args['seed'])
    print('[info] Setting all random seeds {}'.format(args['seed']))
    torch.set_num_threads(args['num_threads'])

    if args['GPU']:
        device = torch.device(
            "cuda:" + str(args['device_idx']) if torch.cuda.is_available() else "cpu")
        print('[info] Training will be performed on {}'.format(
            torch.cuda.get_device_name(args['device_idx'])))
    else:
        device = torch.device("cpu")
        print('[info] Training will be performed on cpu')

    if args['wandb']:
        import wandb
        wandb.init(sync_tensorboard=False,
                   project=args['wandb_project'],
                   reinit=False,
                   config=args,
                   entity=args['wandb_entity'])
        print('[info] Monitoring with wandb')

    ## ----------------------------------- datasets: prepare and preprocess (count or load subgraph counts)
    path = os.path.join(args['root_folder'], args['dataset'], args['dataset_name'])
    subgraph_params = {
        'induced': args['induced'],
        'edge_list': args['custom_edge_list'],
        'directed': args['directed'],
        'directed_orbits': args['directed_orbits']
    }
    graphs_ptg, num_classes, orbit_partition_sizes = prepare_dataset(
        path, args['dataset'], args['dataset_name'], args['id_scope'],
        args['id_type'], args['k'], args['regression'], extract_ids_fn,
        count_fn, automorphism_fn, args['multiprocessing'],
        args['num_processes'], **subgraph_params)

    # OGB-specifics: different feature collections
    if args['dataset'] == 'ogb':
        if args['features_scope'] == 'simple':
            # only retain the top two node/edge features
            print('[info] (OGB) Using simple node and edge features')
            simple_graphs = []
            for graph in graphs_ptg:
                # copy every attribute, then truncate node/edge feature matrices
                new_data = Data()
                for attr in graph.__iter__():
                    name, value = attr
                    setattr(new_data, name, value)
                setattr(new_data, 'x', graph.x[:, :2])
                setattr(new_data, 'edge_features', graph.edge_features[:, :2])
                simple_graphs.append(new_data)
            graphs_ptg = simple_graphs
        else:
            print('[info] (OGB) Using full node and edge features')

    ## ----------------------------------- node and edge feature dimensions
    # a 1-D feature tensor counts as a single feature column
    if graphs_ptg[0].x.dim() == 1:
        num_features = 1
    else:
        num_features = graphs_ptg[0].num_features

    if hasattr(graphs_ptg[0], 'edge_features'):
        if graphs_ptg[0].edge_features.dim() == 1:
            num_edge_features = 1
        else:
            num_edge_features = graphs_ptg[0].edge_features.shape[1]
    else:
        num_edge_features = None

    if args['dataset'] == 'chemical' and args['dataset_name'] == 'ZINC':
        # ZINC ships precomputed feature-type cardinalities for its encoders
        d_in_node_encoder, d_in_edge_encoder = torch.load(
            os.path.join(path, 'processed', 'num_feature_types.pt'))
        d_in_node_encoder, d_in_edge_encoder = [d_in_node_encoder], [d_in_edge_encoder]
    else:
        d_in_node_encoder = [num_features]
        d_in_edge_encoder = [num_edge_features]

    ## ----------------------------------- encode ids and degrees (and possibly edge features)
    degree_encoding = args['degree_encoding'] if args['degree_as_tag'][0] else None
    id_encoding = args['id_encoding'] if args['id_encoding'] != 'None' else None
    encoding_parameters = {
        'ids': {
            'bins': args['id_bins'],
            'strategy': args['id_strategy'],
            'range': args['id_range'],
        },
        'degree': {
            'bins': args['degree_bins'],
            'strategy': args['degree_strategy'],
            'range': args['degree_range']
        }
    }
    print("Encoding substructure counts and degree features... ", end='')
    graphs_ptg, encoder_ids, d_id, encoder_degrees, d_degree = encode(
        graphs_ptg, id_encoding, degree_encoding, **encoding_parameters)
    print("Done.")

    assert args['mode'] in [
        'isomorphism_test', 'train', 'test'
    ], "Unknown mode. Supported options are 'isomorphism_test', 'train','test'"

    ## ----------------------------------- graph isomorphism testing
    ##
    ## We use GSN with random weights, so no training is performed
    ##
    if args['mode'] == 'isomorphism_test':
        eps = args['isomorphism_eps']
        loader = DataLoader(graphs_ptg,
                            batch_size=args['batch_size'],
                            shuffle=False,
                            worker_init_fn=random.seed(args['seed']),
                            num_workers=args['num_workers'])
        model = GNNSubstructures(in_features=num_features,
                                 out_features=num_classes,
                                 encoder_ids=encoder_ids,
                                 d_in_id=d_id,
                                 in_edge_features=num_edge_features,
                                 d_in_node_encoder=d_in_node_encoder,
                                 d_in_edge_encoder=d_in_edge_encoder,
                                 encoder_degrees=encoder_degrees,
                                 d_degree=d_degree,
                                 **args)
        model = model.to(device)
        print("Instantiated model:\n{}".format(model))
        # count model params
        params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("[info] Total number of parameters is: {}".format(params))
        mm, num_not_distinguished = test_isomorphism(loader, model, device, eps=eps)
        print('Total pairs: {}'.format(len(mm)))
        print('Number of non-isomorphic pairs that are not distinguised: {}'.
              format(num_not_distinguished))
        print('Failure Percentage: {:.2f}%'.format(
            100 * num_not_distinguished / len(mm)))
        if args['wandb']:
            wandb.run.summary['num_not_distinguished'] = num_not_distinguished
            wandb.run.summary['total pairs'] = len(mm)
            wandb.run.summary[
                'failure_percentage'] = num_not_distinguished / len(mm)
        return

    ## ----------------------------------- training
    ##
    ## Unified training code for all the datasets.
    ## Please use args['onesplit'] = True if cross-validation is not required.
    ##
    print("Training starting now...")
    train_losses_folds = []
    train_accs_folds = []
    test_losses_folds = []
    test_accs_folds = []
    val_losses_folds = []
    val_accs_folds = []
    results_folder_init = os.path.join(path, 'results', args['results_folder'])
    # -1 marks the single-split case; otherwise iterate the requested folds
    fold_idxs = [-1] if args['onesplit'] else args['fold_idx']
    for fold_idx in fold_idxs:
        print('############# FOLD NUMBER {:01d} #############'.format(fold_idx))
        # prepare result folder
        results_folder = os.path.join(results_folder_init, str(fold_idx), args['model_name'])
        if not os.path.exists(results_folder):
            os.makedirs(results_folder)
        # prepare folder for model checkpoints
        checkpoint_path = os.path.join(results_folder, 'checkpoints')
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        # save parameters of the training job
        with open(os.path.join(results_folder, 'params.json'), 'w') as fp:
            saveparams = copy.deepcopy(args)
            json.dump(saveparams, fp)
        # split data into training/validation/test
        # (assumes args['split'] is 'random' or 'given'; anything else would
        # leave the dataset variables unbound)
        if args['split'] == 'random':  # use a random split
            dataset_train, dataset_test = separate_data(
                graphs_ptg, args['split_seed'], fold_idx)
            dataset_val = None
        elif args['split'] == 'given':  # use a precomputed split
            dataset_train, dataset_test, dataset_val = separate_data_given_split(
                graphs_ptg, path, fold_idx)
        # instantiate data loaders
        loader_train = DataLoader(dataset_train,
                                  batch_size=args['batch_size'],
                                  shuffle=args['shuffle'],
                                  worker_init_fn=random.seed(args['seed']),
                                  num_workers=args['num_workers'])
        loader_test = DataLoader(dataset_test,
                                 batch_size=args['batch_size'],
                                 shuffle=False,
                                 worker_init_fn=random.seed(args['seed']),
                                 num_workers=args['num_workers'])
        if dataset_val is not None:
            loader_val = DataLoader(dataset_val,
                                    batch_size=args['batch_size'],
                                    shuffle=False,
                                    worker_init_fn=random.seed(args['seed']),
                                    num_workers=args['num_workers'])
        else:
            loader_val = None
        # instantiate model
        if args['model_name'] == 'MLP':
            Model = MLPSubstructures
        else:
            if args['dataset'] == 'ogb':
                Model = GNN_OGB
            else:
                Model = GNNSubstructures
        model = Model(in_features=num_features,
                      out_features=num_classes,
                      encoder_ids=encoder_ids,
                      d_in_id=d_id,
                      in_edge_features=num_edge_features,
                      d_in_node_encoder=d_in_node_encoder,
                      d_in_edge_encoder=d_in_edge_encoder,
                      encoder_degrees=encoder_degrees,
                      d_degree=d_degree,
                      **args)
        model = model.to(device)
        print("Instantiated model:\n{}".format(model))
        # count model params
        params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("[info] Total number of parameters is: {}".format(params))

        if args['mode'] == 'train':
            # optimizer and lr scheduler
            optimizer, scheduler = setup_optimization(model, **args)
            # logging
            if args['wandb']:
                wandb.watch(model)
            checkpoint_filename = os.path.join(
                checkpoint_path, args['checkpoint_file'] + '.pth.tar')
            if args['resume']:
                start_epoch = resume_training(checkpoint_filename, model,
                                              optimizer, scheduler, device)
            else:
                start_epoch = 0
            # train (!)
            metrics = train(loader_train,
                            loader_test,
                            model,
                            optimizer,
                            loss_fn,
                            loader_val=loader_val,
                            prediction_fn=prediction_fn,
                            evaluator=evaluator,
                            scheduler=scheduler,
                            min_lr=args['min_lr'],
                            fold_idx=fold_idx,
                            start_epoch=start_epoch,
                            n_epochs=args['num_epochs'],
                            n_iters=args['num_iters'],
                            n_iters_test=args['num_iters_test'],
                            eval_freq=args['eval_frequency'],
                            checkpoint_file=checkpoint_filename,
                            wandb_realtime=args['wandb_realtime'] and args['wandb'])
            # log results of training
            train_losses, train_accs, test_losses, test_accs, val_losses, val_accs = metrics
            train_losses_folds.append(train_losses)
            train_accs_folds.append(train_accs)
            test_losses_folds.append(test_losses)
            test_accs_folds.append(test_accs)
            val_losses_folds.append(val_losses)
            val_accs_folds.append(val_accs)
            # model selection on validation when available, otherwise on test
            best_idx = perf_opt(val_accs) if loader_val is not None else perf_opt(test_accs)
            print("Training complete!")
            print("\tbest train accuracy {:.4f}\n\tbest test accuracy {:.4f}".
                  format(train_accs[best_idx], test_accs[best_idx]))

        elif args['mode'] == 'test':
            checkpoint_filename = os.path.join(
                checkpoint_path, args['checkpoint_file'] + '.pth.tar')
            print('Loading checkpoint from file {}... '.format(
                checkpoint_filename), end='')
            checkpoint_dict = torch.load(checkpoint_filename, map_location=device)
            model.load_state_dict(checkpoint_dict['model_state_dict'])
            print('Done.')
            if args['dataset'] == 'ogb':
                _, train_acc = test_ogb(loader_train, model, loss_fn, device, evaluator)
                _, test_acc = test_ogb(loader_test, model, loss_fn, device, evaluator)
            else:
                _, train_acc = test(loader_train, model, loss_fn, device, prediction_fn)
                _, test_acc = test(loader_test, model, loss_fn, device, prediction_fn)
            train_accs_folds.append(train_acc)
            test_accs_folds.append(test_acc)
            if dataset_val is not None:
                # FIX: use the OGB evaluator path for validation as well; the
                # original called plain `test` in both branches, inconsistent
                # with the train/test evaluation just above.
                if args['dataset'] == 'ogb':
                    _, val_acc = test_ogb(loader_val, model, loss_fn, device, evaluator)
                else:
                    _, val_acc = test(loader_val, model, loss_fn, device, prediction_fn)
                val_accs_folds.append(val_acc)
            print("Evaluation complete!")
            if dataset_val is not None:
                print(
                    "\ttrain accuracy {:.4f}\n\ttest accuracy {:.4f}\n\tvalidation accuracy {:.4f}"
                    .format(train_acc, test_acc, val_acc))
            else:
                print("\ttrain accuracy {:.4f}\n\ttest accuracy {:.4f}".format(
                    train_acc, test_acc))
        else:
            raise NotImplementedError(
                'Mode {} is not currently supported.'.format(args['mode']))

    # log metrics
    if args['mode'] == 'train':
        # stack per-fold curves into (num_folds, num_epochs) arrays
        train_accs_folds = np.array(train_accs_folds)
        test_accs_folds = np.array(test_accs_folds)
        train_losses_folds = np.array(train_losses_folds)
        test_losses_folds = np.array(test_losses_folds)
        train_accs_mean = np.mean(train_accs_folds, 0)
        train_accs_std = np.std(train_accs_folds, 0)
        test_accs_mean = np.mean(test_accs_folds, 0)
        test_accs_std = np.std(test_accs_folds, 0)
        train_losses_mean = np.mean(train_losses_folds, 0)
        test_losses_mean = np.mean(test_losses_folds, 0)
        if val_losses_folds[0] is not None:
            val_accs_folds = np.array(val_accs_folds)
            val_losses_folds = np.array(val_losses_folds)
            val_accs_mean = np.mean(val_accs_folds, 0)
            val_accs_std = np.std(val_accs_folds, 0)
            val_losses_mean = np.mean(val_losses_folds, 0)
        # best epoch chosen on validation curve when available, else test curve
        best_index = perf_opt(
            test_accs_mean) if val_losses_folds[0] is None else perf_opt(
                val_accs_mean)
        if not args['wandb_realtime'] and args['wandb']:
            for epoch in range(len(train_accs_mean)):
                # log scores for each fold in the current epoch
                # NOTE(review): rows are indexed by the fold-id value itself —
                # assumes fold_idxs are 0..n-1 (or the single -1); verify for
                # non-contiguous fold selections.
                for fold_idx in fold_idxs:
                    log_corpus = {
                        'train_accs_fold_' + str(fold_idx):
                        train_accs_folds[fold_idx, epoch],
                        'train_losses_fold_' + str(fold_idx):
                        train_losses_folds[fold_idx, epoch],
                        'test_accs_fold_' + str(fold_idx):
                        test_accs_folds[fold_idx, epoch],
                        'test_losses_fold_' + str(fold_idx):
                        test_losses_folds[fold_idx, epoch]
                    }
                    if val_losses_folds[0] is not None:
                        log_corpus['val_accs_fold_' + str(fold_idx)] = val_accs_folds[fold_idx, epoch]
                        log_corpus['val_losses_fold_' + str(fold_idx)] = val_losses_folds[fold_idx, epoch]
                    wandb.log(log_corpus, step=epoch)
                # log epoch score means across folds
                log_corpus = {
                    'train_accs_mean': train_accs_mean[epoch],
                    'train_accs_std': train_accs_std[epoch],
                    'test_accs_mean': test_accs_mean[epoch],
                    'test_accs_std': test_accs_std[epoch],
                    'train_losses_mean': train_losses_mean[epoch],
                    'test_losses_mean': test_losses_mean[epoch]
                }
                if val_losses_folds[0] is not None:
                    log_corpus['val_accs_mean'] = val_accs_mean[epoch]
                    log_corpus['val_accs_std'] = val_accs_std[epoch]
                    log_corpus['val_losses_mean'] = val_losses_mean[epoch]
                wandb.log(log_corpus, step=epoch)
        if args['wandb']:
            wandb.run.summary['best_epoch_val'] = best_index
            wandb.run.summary['best_train_mean'] = train_accs_mean[best_index]
            wandb.run.summary['best_train_std'] = train_accs_std[best_index]
            wandb.run.summary['best_train_loss_mean'] = train_losses_mean[best_index]
            wandb.run.summary['last_train_std'] = train_accs_std[-1]
            wandb.run.summary['last_train_mean'] = train_accs_mean[-1]
            wandb.run.summary['best_test_mean'] = test_accs_mean[best_index]
            wandb.run.summary['best_test_std'] = test_accs_std[best_index]
            wandb.run.summary['best_test_loss_mean'] = test_losses_mean[best_index]
            wandb.run.summary['last_test_std'] = test_accs_std[-1]
            wandb.run.summary['last_test_mean'] = test_accs_mean[-1]
            if val_losses_folds[0] is not None:
                wandb.run.summary['best_validation_std'] = val_accs_std[best_index]
                wandb.run.summary['best_validation_mean'] = val_accs_mean[best_index]
                wandb.run.summary['best_validation_loss_mean'] = val_losses_mean[best_index]
                wandb.run.summary['last_validation_std'] = val_accs_std[-1]
                wandb.run.summary['last_validation_mean'] = val_accs_mean[-1]
                wandb.run.summary['performance_at_best_epoch'] = val_accs_mean[best_index]
            else:
                wandb.run.summary['performance_at_best_epoch'] = test_accs_mean[best_index]
        print("Best train mean: {:.4f} +/- {:.4f}".format(
            train_accs_mean[best_index], train_accs_std[best_index]))
        print("Best test mean: {:.4f} +/- {:.4f}".format(
            test_accs_mean[best_index], test_accs_std[best_index]))
        if args['return_scores']:
            scores = dict()
            scores['best_train_mean'] = train_accs_mean[best_index]
            scores['best_train_std'] = train_accs_std[best_index]
            scores['last_train_std'] = train_accs_std[-1]
            scores['last_train_mean'] = train_accs_mean[-1]
            scores['best_test_mean'] = test_accs_mean[best_index]
            scores['best_test_std'] = test_accs_std[best_index]
            scores['last_test_std'] = test_accs_std[-1]
            scores['last_test_mean'] = test_accs_mean[-1]
            if val_losses_folds[0] is not None:
                scores['best_validation_std'] = val_accs_std[best_index]
                scores['best_validation_mean'] = val_accs_mean[best_index]
                scores['last_validation_std'] = val_accs_std[-1]
                scores['last_validation_mean'] = val_accs_mean[-1]

    if args['mode'] == 'test' and not args['onesplit']:
        # cross-validated checkpoint evaluation: report mean +/- std over folds
        train_acc_mean = np.mean(train_accs_folds)
        test_acc_mean = np.mean(test_accs_folds)
        train_acc_std = np.std(train_accs_folds)
        test_acc_std = np.std(test_accs_folds)
        print("Train accuracy: {:.4f} +/- {:.4f}".format(
            train_acc_mean, train_acc_std))
        print("Test accuracy: {:.4f} +/- {:.4f}".format(
            test_acc_mean, test_acc_std))
        if dataset_val is not None:
            val_acc_mean = np.mean(val_accs_folds)
            val_acc_std = np.std(val_accs_folds)
            print("Validation accuracy: {:.4f} +/- {:.4f}".format(
                val_acc_mean, val_acc_std))

    if args['mode'] == 'train' and args['return_scores']:
        return scores
    else:
        return None
# --- TensorFlow (1.x) graph construction and session setup for maximum-
# --- likelihood training. The objects referenced here (`tg`, `tvars`,
# --- `args`) are defined outside this chunk — presumably the training
# --- graph, its trainable variables, and parsed CLI args; TODO confirm.

# mean ML cost over the batch
tg_ml_cost = tf.reduce_mean(tg.ml_cost)

global_step = tf.Variable(0, trainable=False, name="global_step")
lr = tf.Variable(args.lr, trainable=False, name="lr")

ml_opt_func = tf.train.AdamOptimizer(learning_rate=lr)
# clip the global gradient norm to 1.0 before applying updates
ml_grads, _ = tf.clip_by_global_norm(tf.gradients(tg_ml_cost, tvars),
                                     clip_norm=1.0)
ml_op = ml_opt_func.apply_gradients(zip(ml_grads, tvars),
                                    global_step=global_step)

# stash hyper-parameters in graph collections (so they travel with the graph)
tf.add_to_collection('n_skip', args.n_skip)
tf.add_to_collection('n_hidden', args.n_hidden)

train_set, valid_set, test_set = utils.prepare_dataset(args)

init_op = tf.global_variables_initializer()
save_op, best_save_op = utils.init_savers(args)

# summary ops for training and validation evaluation
with tf.name_scope("tr_eval"):
    tr_summary = utils.get_summary('ce cr image'.split())
with tf.name_scope("val_eval"):
    val_summary = utils.get_summary('ce cr fer image'.split())

with tf.Session() as sess:
    sess.run(init_op)
    summary_writer = tf.summary.FileWriter(args.logdir,
                                           sess.graph,
                                           flush_secs=5.0)
    # NOTE(review): chunk is truncated here — the training loop presumably follows.
from torch import optim from torch.utils import data as utilsdata import torch.nn.functional as F from torch.distributions import Categorical from sklearn.model_selection import train_test_split from scipy.stats import multivariate_normal from scipy.signal import argrelmax import scipy import numpy as np from utils import mdn_loss, mdn_logp from tqdm import tqdm from utils import prepare_dataset, preprocess, preprocess_zscore, get_original_parameters, report_metrics, mdn_logp, mdn_loss from model import MixtureDensityNetwork, IsotropicGaussianMixture X_train, X_test, y_train, y_test = prepare_dataset(featurename=args.data_name) print(args.logsigmamin) model = MixtureDensityNetwork( n_input=X_train.shape[1], n_output=2, n_components=args.n_components, n_hiddens=[args.nhidden1, args.nhidden2, args.nhidden3], logsigma_min=args.logsigmamin, logsigma_max=args.logsigmamax) optimizer = optim.Adam(model.parameters()) # create data loaders scalers = {} datas = [X_train, X_test, y_train, y_test] for i in range(4):
def main():
    """
    Load the elected-officials datasets, validate/trim each to its expected
    columns, and write one bar-chart PNG per (dataframe, column) analysis.

    Relies on the module-level `utils` helpers and the `dataset_path_output`
    output directory prefix.
    """
    # Part 1 : Load dataset
    candidates_house_of_commons_df, representatives_df, representatives_house_of_commons_df, \
        quebec_assemblee_nationale_df, conseil_municipal_de_montreal_df = utils.prepare_dataset()

    # Part 2 : Data Process
    # One spec per frame: (dataframe, expected columns, human-readable label).
    # Replaces five copy-pasted proces_data call sites with a single loop.
    process_specs = [
        # 2.1 Check Candidates : House of Commons
        (candidates_house_of_commons_df,
         ['District name', 'Primary role', 'Name', 'First name', 'Last name',
          'Party name'],
         'Candidates : House of Commons'),
        # 2.2 Check Representatives : All elected officials
        (representatives_df,
         ['Organization', 'District name', 'Primary role', 'Name', 'First name',
          'Last name', 'Gender', 'Party name', 'Office type'],
         'Representatives : All elected officials'),
        # 2.3 Check Representative : House of Commons
        (representatives_house_of_commons_df,
         ['District name', 'Primary role', 'Name', 'First name', 'Last name',
          'Party name', 'Office type'],
         'Representative : House of Commons'),
        # 2.4 Check Provincial legislatures : Assemblée nationale du Québec
        (quebec_assemblee_nationale_df,
         ['District name', 'Primary role', 'Name', 'First name', 'Last name',
          'Party name', 'Office type'],
         'Provincial legislatures : Assemblée nationale du Québec'),
        # 2.5 Check Quebec councils : Conseil municipal de Montréal
        (conseil_municipal_de_montreal_df,
         ['District name', 'Primary role', 'Name', 'First name', 'Last name',
          'Gender', 'Party name', 'Office type'],
         'Quebec councils : Conseil municipal de Montréal'),
    ]
    (candidates_house_of_commons_df, representatives_df,
     representatives_house_of_commons_df, quebec_assemblee_nationale_df,
     conseil_municipal_de_montreal_df) = [
         utils.proces_data(df, columns, label)
         for df, columns, label in process_specs
     ]

    # Part 3 : Data Analysis
    # One spec per chart: (dataframe, group-by column, chart title,
    # output file name, top10 flag — 1 restricts to the ten largest groups).
    # Replaces twelve copy-pasted data_ana call sites; titles and file names
    # are preserved byte-for-byte.
    analysis_specs = [
        # 3.1 Analyze Candidates : House of Commons
        (candidates_house_of_commons_df, 'Party name',
         'Candidates house of commons',
         'Candidates house of commons.png', 0),
        (candidates_house_of_commons_df, 'District name',
         'Top 10 District of Candidates <House of Commons>',
         'Top 10 District of Candidates House of Commons.png', 1),
        # 3.2 Analyze Representatives : All elected officials
        (representatives_df, 'Organization',
         'Top 10 Organizations of Representatives <All elected officials>',
         'Top 10 Organizations of Representatives All elected officials.png', 1),
        (representatives_df, 'District name',
         'Top 10 Districts of Representatives <All elected officials>',
         'Top 10 districts of representatives of All elected officials.png', 1),
        (representatives_df, 'Primary role',
         'Primary Role of Representatives <All elected officials>',
         'Primary role of representatives of All elected officials.png', 0),
        (representatives_df, 'Office type',
         'Office Type of Representatives <All elected officials>',
         'Office type of representatives of All elected officials.png', 0),
        # 3.3 Analyze Representatives : House of Commons
        (representatives_house_of_commons_df, 'Party name',
         'Party name of Representatives <House of Commons>',
         'Party name of Representatives House of Commons.png', 0),
        # 3.4 Analyze Provincial legislatures : Assemblée nationale du Québec
        (quebec_assemblee_nationale_df, 'Party name',
         'Party name of legislatures <Assemblée nationale du Québec>',
         'Party name of legislatures Assemblée nationale du Québec.png', 0),
        (quebec_assemblee_nationale_df, 'Office type',
         'Office type of legislatures <Assemblée nationale du Québec>',
         'Office type of legislatures Assemblée nationale du Québec.png', 0),
        # 3.5 Analyze Quebec councils : Conseil municipal de Montréal
        (conseil_municipal_de_montreal_df, 'Primary role',
         'Primary role of councils of Conseil municipal de Montréal',
         'Primary role of councils of Conseil municipal de Montréal.png', 0),
        (conseil_municipal_de_montreal_df, 'Party name',
         'Party name of councils of Conseil municipal de Montréal',
         'Party name of councils of Conseil municipal de Montréal.png', 0),
        (conseil_municipal_de_montreal_df, 'Gender',
         'Gender of councils of Conseil municipal de Montréal',
         'Gender of councils of Conseil municipal de Montréal.png', 0),
    ]
    for df, column, title_pic, file_name, top10 in analysis_specs:
        utils.data_ana(df, column, title_pic, dataset_path_output + file_name,
                       top10)
def train():
    """
    Train a character-based sequence-labelling model (TensorFlow 1.x).

    Builds or reloads the character maps, prepares train/test/dev batches,
    configures the GPU session, then runs 50 epochs of mini-batch training,
    evaluating on dev and test and checkpointing whenever the test score
    improves. All configuration comes from the module-level FLAGS.
    """
    # load training sentences plus the character dictionary and id maps
    train_sentences, dico, char_to_id, id_to_char = load_sentence(
        FLAGS.train_file)
    # build the char maps on first run (optionally augmented with pretrained
    # embeddings) and cache them as a pickle; afterwards just reload it
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico.copy(),
                FLAGS.emb_file,
            )
        else:
            sentences, dico, char_to_id, id_to_char = load_sentence(
                FLAGS.train_file)
        print(train_sentences[0])
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char = pickle.load(f)

    # map sentences to ids; this prepare_dataset returns the
    # train/test/dev splits
    train_data, test_data, dev_data = prepare_dataset(train_sentences,
                                                      char_to_id)
    print(train_data[0])
    print(test_data[0])
    print(dev_data[0])
    print(len(train_data), len(dev_data), len(test_data))

    # batch managers: training uses FLAGS.batch_size, evaluation a fixed 100
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, 100)
    dev_manager = BatchManager(dev_data, 100)
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id)
        save_config(config, FLAGS.config_file)
    # NOTE(review): duplicate of the make_path(FLAGS) call above —
    # harmless if the helper is idempotent, otherwise remove one.
    make_path(FLAGS)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # expose up to 4 GPUs; cap per-process memory at 90% and grow on demand
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        best = 0  # best test score seen so far; gates checkpointing
        # sess.graph.finalize()
        for i in range(50):  # fixed number of training epochs
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    # NOTE(review): accumulated batch losses are cleared here
                    # without ever being logged — likely a dropped loss report.
                    loss = []
            # per-epoch evaluation: dev first, then test; save a checkpoint
            # whenever the test score improves
            Acc_result = evaluate(sess, model, "dev", dev_manager,
                                  logger)
            logger.info("Acc{}".format(Acc_result))
            logger.info("test")
            # precision, recall, f1_score = model.evaluete_(sess,test_manager)
            # logger.info("P, R, F,{},{},{}".format(precision, recall, f1_score))
            test_result = evaluate(sess, model, "test", test_manager, logger)
            if test_result > best:
                best = test_result
                save_model(sess, model, FLAGS.ckpt_path, logger)
def classify(**args):
    """
    Train a baseline MLP on synthetic data, transfer it to handheld data by
    freezing every layer except a freshly attached softmax head, and report
    test-set diagnostics.

    :param args: keyword arguments passed from cli parser
    :return: balanced accuracy score on the handheld test set
    """
    # printing/plotting is only meaningful for single (non-repeated) runs
    allow_print = args['repetitions'] == 1

    # resolve the classification target and its display string
    cls_target, cls_str = set_classification_targets(args['cls_choice'])

    # synthetic dataset (id 0) for the pre-transfer baseline
    data = prepare_dataset(0, cls_target, args['batch_size'], args['norm_choice'])
    print('\n\tTask: Classify «{}» using «{}»'.format(cls_str, data['data_str']))
    print_dataset_info(data)

    model = build_model(0, data['num_classes'], name='baseline_mlp', new_input=True)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # train and evaluate - pre-transfer
    model.fit(data['train_data'],
              steps_per_epoch=data['train_steps'],
              epochs=args['epochs'],
              verbose=1,
              class_weight=data['class_weights'])
    print('Evaluate ...')
    model.evaluate(data['eval_data'], steps=data['test_steps'], verbose=1)
    del data

    # handheld dataset (id 1) for the transfer stage
    data = prepare_dataset(
        1,  # HH12
        cls_target,
        args['batch_size'],
        args['norm_choice'])
    print_dataset_info(data)

    # freeze everything except the classification layer, which gets replaced
    for frozen_layer in model.layers[:-1]:
        frozen_layer.trainable = False
    if allow_print:
        plot_model(model, to_file='img/transfer_mlp_pre.png')

    # attach a new softmax head on the last pre-classification layer
    transfer_head = Dense(data['num_classes'], activation='softmax',
                          name='dense_transfer')(model.layers[-2].output)
    model = Model(inputs=model.inputs, outputs=transfer_head, name='transfer_model')
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    if allow_print:
        model.summary()
        print('')
        plot_model(model, to_file='img/transfer_mlp_post.png')

    # train and evaluate - post-transfer (twice the baseline epoch budget)
    model.fit(data['train_data'],
              steps_per_epoch=data['train_steps'],
              epochs=args['epochs'] * 2,
              verbose=1,
              class_weight=data['class_weights'])
    print('Evaluate ...')
    model.evaluate(data['eval_data'], steps=data['test_steps'], verbose=1)

    # predict on testset and calculate classification report and confusion matrix for diagnosis
    print('Test ...')
    predictions = model.predict(data['test_data'], steps=data['test_steps'])
    diagnose_output(
        data['test_labels'],
        predictions.argmax(axis=1),
        data['classes_trans'],
        show=False,
        file_name=
        f'heatmap_transfer_{datetime.now().hour}_{datetime.now().minute}')

    return balanced_accuracy_score(data['test_labels'], predictions.argmax(axis=1))
from torchtext.vocab import Vocab from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from config import * from utils import read_conll_sentence, prepare_dataset, train, evaluate from models import BiLSTMTagger, BiLSTMCRFTagger if __name__ == '__main__': # load a list of sentences, where each word in the list is a tuple containing the word and the label train_data = list(read_conll_sentence(TRAIN_DATA)) train_word_counter = Counter([word for sent in train_data for word in sent[0]]) train_label_counter = Counter([label for sent in train_data for label in sent[1]]) word_vocab = Vocab(train_word_counter, specials=(UNK, PAD), min_freq=2) label_vocab = Vocab(train_label_counter, specials=(), min_freq=1) train_data = prepare_dataset(train_data, word_vocab, label_vocab) print('Train word vocab:', len(word_vocab), 'symbols.') print('Train label vocab:', len(label_vocab), f'symbols: {list(label_vocab.stoi.keys())}') valid_data = list(read_conll_sentence(VALID_DATA)) valid_data = prepare_dataset(valid_data, word_vocab, label_vocab) print('Train data:', len(train_data), 'sentences.') print('Valid data:', len(valid_data)) print(' '.join([word_vocab.itos[i.item()] for i in train_data[0][0]])) print(' '.join([label_vocab.itos[i.item()] for i in train_data[0][1]])) print(' '.join([word_vocab.itos[i.item()] for i in valid_data[1][0]])) print(' '.join([label_vocab.itos[i.item()] for i in valid_data[1][1]])) rnn_tagger = BiLSTMTagger(len(word_vocab), len(label_vocab), 128, 256)\ .to(device)