def load_full_dataset():
    """Load the numeric training matrix, augment it and return it with labels.

    When ``../data/X_train.mtx`` does not exist yet, the numeric and
    time-spent-by-station features are read from CSV and written out with
    ``mmwrite``.  Otherwise only the numeric CSV is re-read.  In both cases
    the numeric matrix is stacked column-wise with the output of
    ``extract_missing`` and the categorical matrix read from
    ``../data/train_categorical``.

    Returns:
        tuple: ``(X_train, y_train)`` where ``X_train`` is the stacked matrix
        in CSR format and ``y_train`` is the second value returned by
        ``pre.load_dataset``.
    """
    # If X_train.mtx have not been generated yet
    if not os.path.exists('../data/X_train.mtx'):
        X_train, y_train = pre.load_dataset("../data/train_numeric.csv",
                                            batch=100000)
        X_date = pre.load_time_spent_by_station_features(
            "../data/train_date.csv", batch=100000)
        # TODO: generate the categorical matrix here as well
        mmwrite('../data/X_train', X_train)
        mmwrite('../data/train_date', X_date)
        # mmwrite('../data/train_categorical',X_train_cat)
    else:
        X_train, y_train = pre.load_dataset("../data/train_numeric.csv",
                                            batch=500000)
        #X_train = mmread('../data/X_train')
        #y_train = pre.load_labels("../data/train_numeric.csv")
        #X_date = mmread('../data/train_date')

    # Read the categorical matrix for both branches: it used to be bound only
    # in the ``else`` branch, so the first run died with a NameError below.
    X_train_cat = mmread('../data/train_categorical')

    # Only the header row is needed; the context manager closes the file.
    with open("../data/train_numeric.csv") as numeric_file:
        header = next(csv.reader(numeric_file), None)

    X_train = scipy.sparse.hstack(
        (X_train, extract_missing(X_train, header[1:-1]),
         X_train_cat)).tocsr()
    #X_train = scipy.sparse.hstack((X_train,X_date)).tocsr()
    return X_train, y_train
def __getitem__(self, idx):
    """Return batch number ``idx`` as a pair of NumPy arrays.

    The matching slices of ``self.image_filenames`` and ``self.labels`` are
    passed to ``load_dataset``; both of its results are wrapped in
    ``np.array`` before being returned.
    """
    first = idx * self.batch_size
    last = (idx + 1) * self.batch_size
    window = slice(first, last)

    i_batch_x, o_batch_y = load_dataset(self.image_filenames[window],
                                        self.labels[window])
    return np.array(i_batch_x), np.array(o_batch_y)
def run(args, meta):
    """Predict the validation indices of every fold and pickle the results.

    Each sub-directory of ``args["split_dir"]`` is treated as one fold whose
    directory name is the integer fold number.  For each fold the weights at
    ``args["model_hdf5"] % fold`` are loaded, the indices in ``val.txt`` are
    predicted, and predictions plus true labels are stored at position
    ``fold`` of the result lists.  The result dict is written to
    ``predictions.pkl`` two directory levels above ``args["model_hdf5"]``.

    Args:
        args: mapping with at least ``split_dir``, ``model_hdf5`` and
            ``batch_size``.
        meta: object whose ``meta["label"].values`` holds the labels.
    """
    main.prerun(args, run_dir=False, exp=False)

    labels = meta["label"].values
    m = model.load_model(args, load_weights=False)
    data = preprocessing.load_hdf5_to_memory(args, labels)

    fold_dirs = [entry for entry in Path(args["split_dir"]).iterdir()
                 if entry.is_dir()]
    print(fold_dirs)

    n_folds = len(fold_dirs)
    results = {"pred": [None] * n_folds, "true": [None] * n_folds}

    for fold_dir in fold_dirs:
        # The last path component is the fold number.
        fold = int(fold_dir.parts[-1])
        print(fold)

        m.load_weights(args["model_hdf5"] % fold)
        val_idx = np.loadtxt(Path(fold_dir, 'val.txt'), dtype=int)
        ds, ds_len = preprocessing.load_dataset(data, val_idx, labels, args,
                                                type="pred")

        n_steps = int(np.ceil(ds_len / args["batch_size"]))
        results["pred"][fold] = m.predict(ds, steps=n_steps)
        results["true"][fold] = labels[val_idx]

    out_dir = Path(args["model_hdf5"]).parents[1]
    with open(str(Path(out_dir, "predictions.pkl")), "wb") as pkl:
        pickle.dump(results, pkl)
def run(options):
    """Run the action selected by ``options.action``.

    Supported actions:

    * ``'training'`` - train a ``BaselineModel`` on fold 0 and save it with
      ``mlflow.keras.save_model`` under ``"models"``.
    * ``'optimize'`` - run ``Optimizer.hyper_parameter_opt`` on the data.
    * ``'predict'``  - load the model from ``"models/"``, predict the first
      (up to) 20 validation samples of fold 0 and save one comparison plot
      per sample as ``test{i}.png``.

    Args:
        options: object exposing an ``action`` attribute.
    """
    # load dataset
    dataset = load_dataset(
        os.path.join(abs_output_data_folderpath, "processed_dataset.pkl"))
    mldata = MLDataset(dataset, 10)
    #train_inputs, train_outputs, val_inputs, val_outputs = preprocess_data(dataset)
    mlflow.keras.autolog()

    if options.action == 'training':
        model = BaselineModel()
        model._functional_setup()
        train_inputs, train_outputs, val_inputs, val_outputs = \
            mldata.get_kth_fold(0)
        model.train(train_inputs, train_outputs, val_inputs, val_outputs)
        mlflow.keras.save_model(model.model, "models")
    elif options.action == 'optimize':
        model = BaselineModel()
        optimizer = Optimizer(BaselineModel, mldata)
        optimizer.hyper_parameter_opt()
        best_model = optimizer.best_model  # not used further in this function
    elif options.action == 'predict':
        model = BaselineModel()
        model.load_model("models/")
        # The validation split used to be referenced here without ever being
        # bound (NameError).  Fold 0 is used to mirror the training action.
        # NOTE(review): confirm fold 0 is the split intended for prediction.
        _, _, val_inputs, val_outputs = mldata.get_kth_fold(0)
        preds = model.predict(val_inputs[0:20])
        # Iterate over what was actually predicted, so fewer than 20
        # validation samples no longer raise an IndexError.
        for i, pred in enumerate(preds):
            plt.figure()
            plt.plot(pred, label="pred")
            plt.plot(val_outputs[i], label="real")
            plt.savefig(f"test{i}.png")
            plt.close()  # release the figure once it is on disk
def load_test(self, path):
    """Load and vectorise the test set found at ``path``.

    Column 0 of the matrix returned by ``lp.vectorise_tweets`` is stored in
    ``self.ids_test``; the remaining columns, cast to float, are stored in
    ``self.X_test``.
    """
    tweets_test, _, _, _ = pp.load_dataset(path)
    matrix, _, _ = lp.vectorise_tweets(tweets_test)

    self.ids_test = matrix[:, 0]
    feature_columns = matrix[:, 1:]
    self.X_test = feature_columns.astype('float')
def run(args):
    """Embed the whole dataset with an intermediate layer and pickle the result.

    A sub-model ending at layer ``args["layer"]`` is built from the loaded
    model.  Every embedding it produces is dumped, one ``pickle.dump`` call
    per embedding, into ``embedding.pkl`` next to ``args["model_hdf5"]``.
    """

    def generator(embedder, ds, ds_len):
        # ``ds_len`` is accepted but not used.
        for batch_no, im_batch in enumerate(ds):
            if batch_no % 100 == 0:
                print("Batch %d" % batch_no)
            yield from embedder.predict_on_batch(im_batch)

    m = model.load_model(args)
    layer_output = m.get_layer(args["layer"]).output
    embedder = models.Model(inputs=m.inputs, outputs=layer_output)

    data = preprocessing.load_hdf5_to_memory(args, None)
    ds, ds_len = preprocessing.load_dataset(data, None, None, args,
                                            type="pred")

    target = Path(Path(args["model_hdf5"]).parent, "embedding.pkl")
    with open(str(target), "wb") as pkl:
        for embedding in generator(embedder, ds, ds_len):
            pickle.dump(embedding, pkl)
def load_test(self, path):
    """Load the test set at ``path`` and build its combined feature matrix.

    ``self.ids_test`` receives the ids returned by ``preparing_lexicon``.
    ``self.X_test`` is the horizontal stack of lexicon features, word
    embedding features and the densified vocabulary embedding, in that order.
    """
    test_tweets, _, _, _ = pp.load_dataset(path)

    ids, lexicon_part, _ = self.preparing_lexicon(loading=False,
                                                  tweets=test_tweets)
    self.ids_test = ids

    vocabulary_part = self.transform_test_vocabulary_embedding(test_tweets)
    word_part, _ = self.preparing_word_embedding(test_tweets)

    blocks = (lexicon_part, word_part, vocabulary_part.toarray())
    self.X_test = np.hstack(blocks)
def __init__(self, loading=True, paths_same_folder=False):
    """Prepare the training features and fit the classifier ensemble.

    Args:
        loading: when true, preprocessed tweets, unigram statistics and
            vectorised lexicon features are loaded from disk; otherwise they
            are computed from the raw training files.
        paths_same_folder: when true, the raw training files are looked up
            in the current folder instead of ``semeval-tweets/``.

    Side effects:
        Sets ``lp.unigram_stats`` and ``lp.emojis_dict``, stores the fitted
        chi-squared selector in ``self.ch2`` and the fitted classifiers in
        ``self.estimators``.
    """
    if paths_same_folder:
        train_paths = ["twitter-training-data.txt", "twitter-dev-data.txt"]
    else:
        train_paths = [
            "semeval-tweets/twitter-training-data.txt",
            "semeval-tweets/twitter-dev-data.txt"
        ]

    if loading:
        # Loads preprocessed training tweets
        with open("pickle/tweets_pp.p", "rb") as f:
            tweets = pickle.load(f)
        with open("pickle/unigram_stats.p", "rb") as f:
            unigram_stats = pickle.load(f)
    else:
        # Preprocesses training tweets
        tweets, tweets_raw, unigram_stats, _list = pp.load_dataset(
            train_paths)
        # Balances number of tweets.  The count previously iterated over an
        # undefined name ``dataset`` and raised NameError; the tweets that
        # were just loaded are what has to be counted.
        count_negatives = np.sum(
            [1 for item in tweets.items() if item[1]['sentiment'] == -1])
        tweets = pp.balance_samples(tweets, count_negatives)

    lp.unigram_stats = unigram_stats
    lp.emojis_dict = lp.load_emoji_sentiment_dict()

    if loading:
        # Loads vectorised tweets
        ids, X_train, y_train, headers = lp.load_training_lexicons()
    else:
        # Generates vectorised tweets; column 0 is dropped before training
        X_train, y_train, headers = lp.vectorise_tweets(tweets)
        X_train = X_train[:, 1:].astype('float')

    # Features selection using chi squared test
    self.ch2 = SelectKBest(chi2, k=10)
    X_train = self.ch2.fit_transform(X_train, y_train)

    # Train classifiers
    self.estimators = [
        PassiveAggressiveClassifier(C=0.0001,
                                    loss='squared_hinge',
                                    max_iter=500),
        RidgeClassifier(solver='auto', alpha=1000),
        BaggingClassifier(base_estimator=RidgeClassifier(),
                          n_estimators=5,
                          max_features=1.0,
                          max_samples=0.2)
    ]
    for clf in self.estimators:
        clf.fit(X_train, y_train)
def __init__(self):
    """Populate the training and evaluation id sequences.

    If ``my_file`` exists, the six attributes are restored from
    ``dataset.pkl``.  Otherwise they are built from ``con.data_path`` and
    ``con.eval_path`` and then written to ``dataset.pkl``.

    Attributes set: ``word_to_id``, ``id_to_word``, ``y``, ``x``,
    ``eval_y``, ``eval_x``.  Each ``x`` sequence is a sentence without its
    last element, each ``y`` sequence the same sentence without its first.
    """
    # Order in which the attributes are stored in / read from the pickle file.
    pickled_fields = ("word_to_id", "id_to_word", "y", "x", "eval_y",
                      "eval_x")

    if my_file.is_file():
        print("Using Pickle to restore serialized objects.")
        with open('dataset.pkl', 'rb') as stream:
            for field in pickled_fields:
                setattr(self, field, pickle.load(stream))
        return

    # Create datasets for training and evaluation
    dataset = prep.load_dataset(con.data_path)
    eval_dataset = prep.load_dataset(con.eval_path)

    vocab = prep.create_vocab(con.data_path)
    common_dataset = prep.remove_uncommon_from_dataset(dataset, vocab)
    word_to_id, train_ids = prep.convert_dataset_to_ids(common_dataset, vocab)
    eval_ids = prep.convert_test_dataset_to_ids(eval_dataset, word_to_id)

    self.word_to_id = word_to_id
    self.id_to_word = prep.map_id_to_word(word_to_id)
    self.y = [sentence[1:] for sentence in train_ids]
    self.x = [sentence[:-1] for sentence in train_ids]
    self.eval_y = [sentence[1:] for sentence in eval_ids]
    self.eval_x = [sentence[:-1] for sentence in eval_ids]

    with open('dataset.pkl', 'wb') as stream:
        for field in pickled_fields:
            pickle.dump(getattr(self, field), stream, pickle.HIGHEST_PROTOCOL)
def load_test(self, path):
    """Load the test set at ``path`` and reduce it with the fitted selector.

    ``self.ids_test`` receives column 0 of the vectorised matrix.
    ``self.X_test`` receives the remaining columns, cast to float and passed
    through ``self.ch2.transform``.
    """
    tweets_test, _, _, _ = pp.load_dataset(path)

    # Cleared first so a failure below leaves no previous test matrix behind.
    self.X_test = None

    matrix, _, _ = lp.vectorise_tweets(tweets_test)
    self.ids_test = matrix[:, 0]

    self.X_test = matrix[:, 1:].astype('float')
    self.X_test = self.ch2.transform(self.X_test)
def run(args, meta):
    """Run the saved model over the validation split with a TensorBoard callback.

    The callback logs to ``"tmp"`` and is configured for the layer named
    ``"activation_10"``.  Validation indices are read from ``val.txt`` inside
    ``args["split_dir"]``; the model is loaded from ``args["model_hdf5"]``.
    """
    tb = callbacks.TensorBoard("tmp",
                               embeddings_freq=1,
                               embeddings_layer_names=["activation_10"])

    label_column = meta["label"]
    data = preprocessing.load_hdf5_to_memory(args, label_column)

    split_file = Path(args["split_dir"], "val.txt")
    idx = np.loadtxt(split_file, dtype=int)
    ds, _ = preprocessing.load_dataset(data, idx, label_column, args,
                                       type="val")

    model = models.load_model(args["model_hdf5"])
    model.predict(ds, batch_size=128, callbacks=[tb])
def preprocessing(self, loading=True, paths=None):
    """Return the training tweets and their unigram statistics.

    Args:
        loading: when true, both objects are unpickled from ``pickle/``;
            otherwise they are computed from ``paths`` and the tweets are
            balanced against the number of negative samples.
        paths: input handed to ``pp.load_dataset`` when ``loading`` is false.

    Returns:
        tuple: ``(tweets, unigram_stats)``.
    """
    if not loading:
        # Preprocess the raw training tweets, then balance them
        tweets, _, unigram_stats, _ = pp.load_dataset(paths)
        negatives = np.sum(
            [1 for _, tweet in tweets.items() if tweet['sentiment'] == -1])
        balanced = pp.balance_samples(tweets, negatives)
        return balanced, unigram_stats

    # Load the already preprocessed training tweets
    with open("pickle/tweets_pp.p", "rb") as stream:
        tweets = pickle.load(stream)
    with open("pickle/unigram_stats.p", "rb") as stream:
        unigram_stats = pickle.load(stream)
    return tweets, unigram_stats
def run(args, meta):
    """Predict the full dataset with a saved model and store the result.

    The data is loaded inside a ``/cpu:0`` device scope.  The model is read
    from ``args["model_hdf5"]`` without compiling, and its predictions are
    written to ``predictions.npy`` in the current directory.  ``meta`` is
    not used.
    """
    main.prerun(args, run_dir=False, exp=False)

    with tf.device("/cpu:0"):
        data = preprocessing.load_hdf5_to_memory(args, None)
        ds, ds_len = preprocessing.load_dataset(data, None, None, args,
                                                type="pred")

    m = tf.keras.models.load_model(args["model_hdf5"], compile=False)

    batch_size = args["batch_size"]
    n_steps = int(np.ceil(ds_len / batch_size))
    preds = m.predict(ds, batch_size=batch_size, steps=n_steps)

    np.save("predictions.npy", preds)
def train():
    """Train the RNN bucket by bucket and record validation metrics per epoch.

    For each of 200 epochs every non-empty bucket is fitted once, then the
    model is evaluated on the validation set.  The per-epoch ``loss`` and
    ``acc`` values are pickled to ``history.pickle``.

    NOTE(review): the checkpoint callback monitors ``val_loss`` but ``fit``
    is called without validation data - confirm the callback ever saves.
    """
    import pickle

    checkpoint = ModelCheckpoint(
        'data/model/model.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
        mode='auto',
        period=5)

    X_buckets, ORF_buckets, y_buckets = load_bucket_dataset(
        'data/train/49000-train.h5')
    X_val, ORF_val, y_val = load_dataset('data/train/49000-validation.h5')

    history = []
    model = build_rnn()
    bucket_num = len(X_buckets)
    epochs = 200

    for epoch in range(epochs):
        print('===Epoch', epoch)

        for bucket in range(bucket_num):
            targets = y_buckets[bucket]
            if not len(targets) > 0:
                continue  # nothing to fit in an empty bucket
            model.fit([X_buckets[bucket], ORF_buckets[bucket]],
                      targets,
                      callbacks=[checkpoint],
                      verbose=1)

        loss, acc, sensibility, specificity = model.evaluate(
            [X_val, ORF_val], y_val)
        print('epoch: {}, acc: {}'.format(epoch, acc))
        history.append({'loss': loss, 'acc': acc})

    with open('history.pickle', 'wb') as f:
        pickle.dump(history, f)
def mainFunc(argv):
    """Parse the command line, load the data and run prediction.

    Recognised options:

    * ``-h``                      print usage and exit with status 0.
    * ``-n N`` / ``--num_cores N`` number of cores requested (default -1).

    Unknown options and a non-integer core count print the usage text and
    exit with status 2.

    Args:
        argv: list of command-line arguments without the program name.
    """

    def print_usage():
        print('python3 ac1d/main.py -n <num_cores>')
        print(
            'num_cores = Number of cores requested from the cluster. Set -1 to leave unset.'
        )

    num_cores = -1
    try:
        # "h" must be part of the option string, otherwise ``-h`` is rejected
        # as unknown and the help branch below can never run.
        opts, args = getopt.getopt(argv, "hn:", ['num_cores='])
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_usage()
            sys.exit()
        elif opt in ('-n', '--num_cores'):
            try:
                num_cores = int(arg)
            except ValueError:
                # Report bad input like any other usage error.
                print_usage()
                sys.exit(2)

    print(">> NUMBER OF CORES USED: {} ".format(num_cores))

    tweets_w_labels, dictionary = preprocessing.load_dataset(
        full=conf.use_full_dataset)
    test_tweets = np.transpose(
        np.array(preprocessing.load_test_data(dictionary)))

    if conf.use_word2vec:
        word_embeddings.train_embeddings()

    model = advanced_model.AdvancedModel()
    model.build_graph()
    get_predictions(
        model,
        test_tweets,
        0,
        load_saved=True,
        model_path=
        'logs/exp-1-training-2017-06-26_21-47-32/exp-1-2017-06-26_21-47-32-ep-0.ckpt-1500'
    )
def __init__(self, loading=True, paths_same_folder=False):
    """Set up the lexicon globals and load the saved Keras model.

    Args:
        loading: when true, unigram statistics are unpickled from
            ``pickle/unigram_stats.p``; otherwise they are recomputed from
            the raw training files.
        paths_same_folder: when true, the raw training files are looked up
            in the current folder instead of ``semeval-tweets/``.

    Side effects:
        Sets ``lp.unigram_stats`` and ``lp.emojis_dict``; stores the model
        loaded from ``models/ANN7`` in ``self.ANN``.
    """
    folder = "" if paths_same_folder else "semeval-tweets/"
    train_paths = [
        folder + "twitter-training-data.txt",
        folder + "twitter-dev-data.txt",
    ]

    if not loading:
        # Only the unigram statistics are needed for this model
        _, _, unigram_stats, _ = pp.load_dataset(train_paths)
    else:
        with open("pickle/unigram_stats.p", "rb") as stream:
            unigram_stats = pickle.load(stream)

    lp.unigram_stats = unigram_stats
    lp.emojis_dict = lp.load_emoji_sentiment_dict()

    self.ANN = keras.models.load_model("models/ANN7")
def train_and_eval_model(cfg):
    """
    Load the dataset named in the config and train one BiLSTM per repeat.

    args:
        cfg (YACS YAML config): reads ``TRAINING.DATASET``,
            ``TRAINING.TRAINING_REPEATS``, ``TRAINING.EPOCHS`` and
            ``CONFIG_NAME``.
    """
    # Data preprocessing: CoNLL format (tab-delineated)
    #   column 0: phones
    #   column 1: syllable boundary
    column_layout = {0: "raw_tokens", 1: "boundaries"}
    dataset = {
        "columns": column_layout,
        "label": "boundaries",  # which column we like to predict
    }

    # Load the embeddings and the dataset. Words are padded because the CRF
    # output layer does not support masking.
    loaded = load_dataset(dataset,
                          dataset_name=cfg.TRAINING.DATASET,
                          do_pad_words=True)
    (embeddings, data, mappings, vocab_size, n_class_labels,
     word_length) = loaded

    create_directory(cfg.CONFIG_NAME)
    logger.info(
        f"Starting training of `{cfg.CONFIG_NAME}` on dataset `{dataset}`")

    for training_repeat in range(cfg.TRAINING.TRAINING_REPEATS):
        model = BiLSTM(cfg)
        model.set_vocab(vocab_size, n_class_labels, word_length, mappings)
        model.set_dataset(dataset, data)

        # Path to store performance scores for dev / test
        results_csv = "/".join(
            [PATH, cfg.CONFIG_NAME, str(training_repeat)]) + ".csv"
        model.store_results(results_csv)

        model.fit(epochs=cfg.TRAINING.EPOCHS)
def __init__(self):
    """Populate the sentence-continuation id sequences.

    If ``sentence_continuation_file`` exists, ``sc_y`` and ``sc_x`` are
    restored from ``sentencecontinuation.pkl``.  Otherwise they are built
    from ``con.test_sentence_continuation_path`` using the word-to-id map of
    a fresh ``NLUProject1Dataset`` and then pickled to that file.
    """
    pickle_name = "sentencecontinuation.pkl"

    if sentence_continuation_file.is_file():
        print("Using Pickle to restore serialized objects of sentence continuation.")
        with open(pickle_name, 'rb') as stream:
            self.sc_y = pickle.load(stream)
            self.sc_x = pickle.load(stream)
        return

    nlu_data = NLUProject1Dataset()
    sc_dataset = prep.load_dataset(con.test_sentence_continuation_path)
    sentence_ids = prep.convert_test_dataset_to_ids(sc_dataset,
                                                    nlu_data.word_to_id)

    # Targets drop the first element, inputs drop the last one.
    self.sc_y = [sentence[1:] for sentence in sentence_ids]
    self.sc_x = [sentence[:-1] for sentence in sentence_ids]

    with open(pickle_name, 'wb') as stream:
        for payload in (self.sc_y, self.sc_x):
            pickle.dump(payload, stream, pickle.HIGHEST_PROTOCOL)
def main(argv):
    """Evaluate a saved segmentation model on the test split.

    The checkpoint for ``FLAGS.model`` is loaded, the test metrics (dice,
    IoU, pixel accuracy, loss) are printed, and the predictions are handed
    to ``display_predictions_20``.

    Args:
        argv: command-line arguments; not used.

    Raises:
        ValueError: if no checkpoint path is known for ``FLAGS.model``.
    """
    torch.manual_seed(42)

    checkpoint_paths = {
        'Unet': "models/Unet_model.pt",
        'UnetResNet': "models/UnetResNet_model.pt",
    }
    # Fail early and clearly: other model names used to leave PATH unbound
    # and crash only when the checkpoint was loaded.
    if FLAGS.model not in checkpoint_paths:
        raise ValueError(
            "No saved checkpoint is known for model {!r}".format(FLAGS.model))
    PATH = checkpoint_paths[FLAGS.model]

    #data import
    DATA_PATH = "../dataset_mri/lgg-mri-segmentation/kaggle_3m/"
    dataset = load_dataset(DATA_PATH)
    print(dataset.head())

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    #dataset splitted
    train_loader, test_loader, val_loader = get_train_test_val_sets(dataset)
    print(train_loader)

    model = build_model(FLAGS.model)
    print(model)

    # Load saved model; map_location keeps this working when the checkpoint
    # was written on a device that is not available here.
    model.load_state_dict(torch.load(PATH, map_location=device), strict=False)
    model.eval()

    # Performance evaluation on test data.  ``.to(device)`` follows the CPU
    # fallback chosen above; ``.cuda()`` raised on hosts without a GPU.
    criterion = BCELoss().to(device)
    avg_loss_test, dice_test, iou_test, pixel_acc_test = evaluate_model(
        model, device, test_loader, criterion)
    print("Dice (test): {:.1%}".format(dice_test))
    print("IoU (test): {:.1%}".format(iou_test))
    print("Pixel accuracy (test): {:.1%}".format(pixel_acc_test))
    print("Loss (test): {:1.4f}".format(avg_loss_test))

    predictions = get_predictions_data(model, device, test_loader)
    display_predictions_20(predictions)
epochs = 15


def _str2bool(value):
    """Interpret a command-line flag value as a boolean.

    ``bool("False")`` is ``True``, so passing a flag through ``bool`` cannot
    switch it off.  Here the usual "false" spellings (and the empty string)
    map to ``False``; any other value maps to ``True``.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ("", "0", "false", "f", "no",
                                              "n", "off")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--train_model', type=_str2bool, default=False)
    parser.add_argument('--word2vec', type=_str2bool, default=False)
    parser.add_argument('--predict', type=_str2bool, default=False)
    args = parser.parse_args()

    train = args.train_model
    predict = args.predict
    use_word2vec = args.word2vec

    df_train, df_test = pp.load_dataset()

    # The text is in column 0; apply the cleaning steps in order:
    # clean word, delete punctuation, normalization.
    dataset_train = df_train[0]
    dataset_test = df_test[0]
    for step in (pp.clean_word, pp.clean_punct, pp.normalization):
        dataset_train = dataset_train.apply(step)
        dataset_test = dataset_test.apply(step)

    # get word-index (fitted on the training text only)
    tokenizer = Tokenizer(num_words=2000, oov_token='<OOV>')
    tokenizer.fit_on_texts(list(dataset_train))
    word_index = tokenizer.word_index
def __init__(self, file_path):
    """Load the dataset at ``file_path``.

    The two values returned by ``load_dataset`` are stored, in order, as
    ``self.sql_data`` and ``self.table_data``.
    """
    sql_data, table_data = load_dataset(file_path)
    self.sql_data = sql_data
    self.table_data = table_data
# Exploratory script: fits a TimeRangeClosestLabelsPercentage transformer on
# the date features and then drops into an IPython shell to inspect results.
import sys
sys.path.append('../')  # so that ``preprocessing`` can be imported
import pandas as pd
import numpy as np
import csv
import preprocessing as pre
from IPython import embed
from scipy.sparse import csr_matrix, hstack

# loading closest parts label
# Date features are loaded without labels and converted to COO format.
X_date_numeric = pre.load_dataset("../../data/train_date.csv",
                                  batch=1000,
                                  no_label=True).tocoo()
# Labels come from the numeric file.
x, y = pre.load_dataset("../../data/train_numeric.csv", batch=1000)
clstLabels = pre.TimeRangeClosestLabelsPercentage("../../data/train_date.csv")
clstLabels.fit(X_date_numeric, y)
X_date_closest_y_ratio = clstLabels.transform(X_date_numeric, closeness=3)
# Interactive shell; the module-level names above are available in it.
embed()
# print(X_date.shape)
# embed()
#Reading time spent in stations features
# X_date = pre.load_time_spent_by_station_features("../../data/train_date.csv", batch = 100000)
# print(X_date.shape)
# # Reading labels
# file = open("../data/train_numeric.csv")
# csvreader = csv.reader(file)
# # ignore headers
# next(csvreader, None)
# labels = []
# Baseline experiment script: TF-IDF features + Multinomial Naive Bayes,
# run inside an MLflow run.  The smoothing value is read from the first
# command-line argument.
import sys

import mlflow
import numpy as np
import pandas as pd
from mlflow import log_artifacts, log_metric, log_param
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from baseline_model import base_model
from config import config
# insert . to run from outside
from preprocessing import feature_vectorizer, load_dataset, preprocessing, train_test

# Load dataset from its path
dataset = load_dataset(config.RAW_DATAPATH, config.DATASET_NAME)
# Clean the data
dataset = preprocessing(dataset)
# Train test split the loaded dataset
X_train, X_test, y_train, y_test = train_test(dataset)
# Obtain feature vectors of the dataset
X_train, X_test = feature_vectorizer(X_train, X_test, TfidfVectorizer)

# Raises IndexError when no argument is given and ValueError when it is
# not a number.
alpha = float(sys.argv[1])

with mlflow.start_run():
    # base_model receives the estimator class and alpha; it returns the
    # F1 score and the accuracy.
    f1_score, accuracy = base_model(
        X_train, X_test, y_train, y_test, MultinomialNB, alpha
    )
    print("Multinomial Naive Bayes model")
def load_dataset(filepath):
    """Load the six dataset splits and convert the label sets to int tensors.

    Thin wrapper around ``preprocessing.load_dataset``: features are passed
    through unchanged, while each label collection is wrapped as
    ``Variable(torch.IntTensor(labels), requires_grad=False)``.

    Returns:
        tuple: ``(train_features, train_labels, val_features, val_labels,
        test_features, test_labels)``.
    """

    def as_int_labels(raw_labels):
        return Variable(torch.IntTensor(raw_labels), requires_grad=False)

    splits = preprocessing.load_dataset(filepath)
    (train_features, train_labels, val_features, val_labels, test_features,
     test_labels) = splits

    train_labels = as_int_labels(train_labels)
    val_labels = as_int_labels(val_labels)
    test_labels = as_int_labels(test_labels)

    return (train_features, train_labels, val_features, val_labels,
            test_features, test_labels)
def _print_val_history(val_loss_history, val_dice_history, val_iou_history,
                       val_pixel_acc_history):
    """Print the four validation histories returned by training."""
    print("Dice history (val): ", val_dice_history)
    print("IoU history (val): ", val_iou_history)
    print("Pixel accuracy history (val): ", val_pixel_acc_history)
    print("Loss history (val): ", val_loss_history)


def _evaluate_and_print_test(model, device, test_loader, criterion):
    """Evaluate ``model`` on the test loader and print the four metrics."""
    avg_loss_test, dice_test, iou_test, pixel_acc_test = evaluate_model(
        model, device, test_loader, criterion)
    print("Dice (test): {:.1%}".format(dice_test))
    print("IoU (test): {:.1%}".format(iou_test))
    print("Pixel accuracy (test): {:.1%}".format(pixel_acc_test))
    print("Loss (test): {:1.4f}".format(avg_loss_test))


def _save_history_plot(history, title_name, file_tag, lr):
    """Plot one validation history against its epoch index and save it."""
    # The x axis follows the history's own length, so a run shortened by
    # early stopping no longer makes ``plt.plot`` fail on mismatched sizes.
    plt.plot(range(len(history)), history)
    plt.title(f'Validation {title_name}, lr = {lr}')
    plt.savefig(f'./graphs/{FLAGS.model}-val_{file_tag}_lr_{lr}.png')
    plt.clf()


def main(argv):
    """Train and evaluate a segmentation model in the mode given by FLAGS.

    Modes (``FLAGS.mode``):

    * ``'basic'`` - train once, report validation histories and test
      metrics, optionally display predictions and save the weights.
    * ``'learning_rate_comparison'`` - train a fresh model per learning
      rate and save one graph per validation metric.
    * ``'batch_size_comparison'`` - train with several batch sizes and
      report the metrics for each.
    * ``'k_fold_cross_validation'`` - delegate to ``k_fold_cross_validation``.

    Args:
        argv: command-line arguments; not used.
    """
    # Set torch seed
    torch.manual_seed(42)

    #data import
    DATA_PATH = "../dataset_mri/lgg-mri-segmentation/kaggle_3m/"
    dataset = load_dataset(DATA_PATH)
    print(dataset.head())

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    #dataset splitted
    train_loader, test_loader, val_loader = get_train_test_val_sets(dataset)
    print(train_loader)

    if FLAGS.mode == 'basic':
        model = build_model(FLAGS.model)
        print(model)

        # defining the optimizer and loss function; ``.to(device)`` respects
        # the CPU fallback above, whereas ``.cuda()`` raised without a GPU
        optimizer = Adam(model.parameters(), lr=FLAGS.lr)
        criterion = BCELoss().to(device)

        # Train the model
        print("Training the model...")
        val_loss_history, val_dice_history, val_iou_history, val_pixel_acc_history = train_segmentation(
            model=model,
            device=device,
            train_loader=train_loader,
            val_loader=val_loader,
            optimizer=optimizer,
            criterion=criterion,
            epochs=FLAGS.epochs,
            early_stop=FLAGS.early_stopping)

        # Print loss, dice, IoU and pixel accuracy history (the pixel
        # accuracy line used to print the IoU history by mistake)
        _print_val_history(val_loss_history, val_dice_history,
                           val_iou_history, val_pixel_acc_history)

        # Performance evaluation on test data
        _evaluate_and_print_test(model, device, test_loader, criterion)

        if FLAGS.display_predictions:
            predictions = get_predictions_data(model, device, test_loader)
            display_predictions(predictions)

        # Save model parameters to disk if requested
        if FLAGS.save_model:
            print("Saving model...")
            model.to('cpu')
            torch.save(model.state_dict(),
                       './models/' + FLAGS.model + '_model.pt')

    elif FLAGS.mode == 'learning_rate_comparison':
        lr_list = [1e-4]

        for lr in lr_list:
            model = build_model(FLAGS.model)

            # defining the optimizer and loss function
            print("Learning rate: ", lr)
            optimizer = Adam(model.parameters(), lr=lr)
            criterion = BCELoss().to(device)

            # Train the model
            print("Training the model...")
            val_loss_history, val_dice_history, val_iou_history, val_pixel_acc_history = train_segmentation(
                model=model,
                device=device,
                train_loader=train_loader,
                val_loader=val_loader,
                optimizer=optimizer,
                criterion=criterion,
                epochs=FLAGS.epochs,
                early_stop=FLAGS.early_stopping)

            _print_val_history(val_loss_history, val_dice_history,
                               val_iou_history, val_pixel_acc_history)

            # Save one graph per validation metric
            _save_history_plot(val_loss_history, 'loss', 'loss', lr)
            _save_history_plot(val_dice_history, 'dice', 'dice', lr)
            _save_history_plot(val_iou_history, 'IoU', 'iou', lr)
            _save_history_plot(val_pixel_acc_history, 'pixel accuracy',
                               'pixel_acc', lr)

            # Performance evaluation on test data
            _evaluate_and_print_test(model, device, test_loader, criterion)

    elif FLAGS.mode == 'batch_size_comparison':
        batch_list = [20, 40, 55, 70]

        # Model initialization
        # NOTE(review): model and optimizer are created once, so every batch
        # size continues training the same weights - confirm this is intended.
        model = build_model(FLAGS.model)
        optimizer = Adam(model.parameters(), lr=FLAGS.lr)
        criterion = BCELoss().to(device)

        for batch_size in batch_list:
            # Get proper dataloader for the given batch_size
            print("Batch size: ", batch_size)
            train_loader, test_loader, val_loader = get_train_test_val_sets(
                dataset, batch_size=batch_size)

            # Train the model
            print("Training the model...")
            val_loss_history, val_dice_history, val_iou_history, val_pixel_acc_history = train_segmentation(
                model=model,
                device=device,
                train_loader=train_loader,
                val_loader=val_loader,
                optimizer=optimizer,
                criterion=criterion,
                epochs=FLAGS.epochs,
                early_stop=FLAGS.early_stopping)

            _print_val_history(val_loss_history, val_dice_history,
                               val_iou_history, val_pixel_acc_history)

            # Performance evaluation on test data
            _evaluate_and_print_test(model, device, test_loader, criterion)

    elif FLAGS.mode == 'k_fold_cross_validation':
        # Get split indices and test set loader
        split_indices, test_loader = get_k_splits_test_set(
            dataset, FLAGS.nb_splits)

        # Perform k-fold cross validation
        k_fold_cross_validation(dataset,
                                split_indices,
                                FLAGS.model,
                                device,
                                FLAGS.lr,
                                epochs=FLAGS.epochs,
                                early_stop=FLAGS.early_stopping)
def main(argv):
    """Train the classifier or tune its hyperparameters, depending on FLAGS.mode.

    Modes:

    * ``'basic'`` - train for 20 epochs with fixed SGD settings, then print
      validation and test accuracy / F1-score.
    * ``'optimizer_optimization'`` - search learning rate and momentum with
      ``tune.run`` (30 samples).
    * ``'nn_layers_optimization'`` - grid-search the layer sizes with
      ``tune.run``.

    Args:
        argv: command-line arguments; not used.
    """
    mode = FLAGS.mode

    if mode == 'basic':
        #### Basic training and evaluation of the classification model ####
        # Path to all data
        DATA_PATH = "../dataset_mri/lgg-mri-segmentation/kaggle_3m/"

        # Load dataset
        dataset = load_dataset(DATA_PATH)

        # Separate dataset into train, validation and test dataset
        print("Size of the datasets:")
        train_loader, test_loader, val_loader = get_train_test_val_sets(
            dataset)
        print("\n")

        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")

        model = build_model(6, 4, 20)

        # defining the optimizer and loss function
        optimizer = SGD(model.parameters(), lr=0.00095, momentum=0.874)
        criterion = CrossEntropyLoss()

        # Train the model
        print("Training the model...")
        val_acc, val_f1_score = train_classification(model,
                                                     device,
                                                     train_loader,
                                                     val_loader,
                                                     optimizer,
                                                     criterion,
                                                     epochs=20)
        print("Accuracy (val): {:.1%}".format(val_acc))
        print("F1-score (val): {:.1%}".format(val_f1_score))

        # Performance evaluation on test data
        loss, accuracy, f1_score = evaluate_model(model, device, test_loader,
                                                  optimizer, criterion)
        print("Accuracy (test): {:.1%}".format(accuracy))
        print("F1-score (test): {:.1%}".format(f1_score))

    #### Hyperparameters selection ####
    elif mode == 'optimizer_optimization':
        # Learning rate and momentum
        stop_criteria = {"avg_accuracy": 0.98, "training_iteration": 1}
        trial_resources = {"cpu": 3, "gpu": 0.33}
        search_space = {
            "lr": tune.loguniform(1e-4, 1e-2),
            "momentum": tune.uniform(0.1, 0.9),
        }
        optimizer_analysis = tune.run(hyperparam_optimizer,
                                      metric="avg_accuracy",
                                      mode="max",
                                      stop=stop_criteria,
                                      resources_per_trial=trial_resources,
                                      num_samples=30,
                                      config=search_space)

    elif mode == 'nn_layers_optimization':
        # neural network layers optimization
        stop_criteria = {"avg_accuracy": 0.98, "training_iteration": 2}
        trial_resources = {"cpu": 3, "gpu": 0.33}
        search_space = {
            "conv_out_features": tune.grid_search([4, 5, 6]),
            "conv_kernel_size": tune.grid_search([3, 4, 5]),
            "linear_features": tune.grid_search([5, 10, 15, 20]),
        }
        optimizer_analysis = tune.run(hyperparam_nn_layers,
                                      metric="avg_accuracy",
                                      mode="max",
                                      stop=stop_criteria,
                                      resources_per_trial=trial_resources,
                                      config=search_space)
# Training script for the doc2vec model: loads the abstracts, stems them,
# converts them to index sequences and derives the training triples.
import preprocessing
import utils
import doc2vec
import numpy as np
import os
import pickle

if __name__ == "__main__":
    # NOTE(review): ``config`` is used here but is not among the imports
    # visible above - confirm it is imported somewhere, otherwise this line
    # raises NameError.
    args = config.get_args()

    # make directory
    utils.make_directory_doc(args)

    # load dataset
    abstract, label = preprocessing.load_dataset(args)

    # stemming process. we used Snowball stemming of nltk package.
    abstract = preprocessing.stemming(abstract)

    # convert word text to idx.
    sequences, word2idx, vocab_size, instances = preprocessing.get_sequences(
        abstract, args)

    # get context words, target word and document idx
    context, target, document = preprocessing.get_trainable_data(
        sequences, instances, args)
    # document indices start at 0, so the count is the largest index plus one
    num_document = np.max(document) + 1

    # model load and compile
def main(_):
    """
    Train and evaluate the domain classifier selected by ``MODE``.

    Please before running make sure all the files in the preprocessing code
    are present in the right location and with the right format.

    With ``MODE == "SVM"`` a weighted ``LinearSVC`` is fitted and its
    classification report printed.  Otherwise a Conv1D + LSTM network is
    trained on top of frozen pre-trained embeddings, saved to ``model.h5``
    and evaluated on the development set.
    """
    installation_test()

    classes_strings = ['ANIMALS', 'ART_ARCHITECTURE_AND_ARCHAEOLOGY', 'BIOLOGY',
                       'BUSINESS_ECONOMICS_AND_FINANCE', 'CHEMISTRY_AND_MINERALOGY', 'COMPUTING',
                       'CULTURE_AND_SOCIETY', 'EDUCATION', 'ENGINEERING_AND_TECHNOLOGY', 'FARMING',
                       'FOOD_AND_DRINK', 'GAMES_AND_VIDEO_GAMES', 'GEOGRAPHY_AND_PLACES',
                       'GEOLOGY_AND_GEOPHYSICS', 'HEALTH_AND_MEDICINE',
                       'HERALDRY_HONORS_AND_VEXILLOLOGY', 'HISTORY', 'LANGUAGE_AND_LINGUISTICS',
                       'LAW_AND_CRIME', 'LITERATURE_AND_THEATRE', 'MATHEMATICS', 'MEDIA',
                       'METEOROLOGY', 'MUSIC', 'NUMISMATICS_AND_CURRENCIES',
                       'PHILOSOPHY_AND_PSYCHOLOGY', 'PHYSICS_AND_ASTRONOMY',
                       'POLITICS_AND_GOVERNMENT', 'RELIGION_MYSTICISM_AND_MYTHOLOGY',
                       'ROYALTY_AND_NOBILITY', 'SPORT_AND_RECREATION', 'TEXTILE_AND_CLOTHING',
                       'TRANSPORT_AND_TRAVEL', 'WARFARE_AND_DEFENSE']

    # Class name -> integer label, numbered in list order.
    class2int = {name: index for index, name in enumerate(classes_strings)}

    # Strings are compared with ``==``: ``is`` tests object identity, which
    # is not guaranteed for equal strings.
    x_train, y_train, x_devel, y_devel, embeddings, domain_dict = load_dataset(MAX_DOC_LEN) \
        if MODE == "DNN" else load_for_sklearn()

    print('Training model.')

    if MODE == "SVM":
        # Weight every class by one minus its relative frequency.
        classes_counts = Counter(y_train.tolist())
        svm_domain_dict = {
            label: 1 - count / len(y_train)
            for label, count in classes_counts.items()
        }

        # manually adjust underrepresented classes
        svm_domain_dict[class2int["CULTURE_AND_SOCIETY"]] = 9.0
        svm_domain_dict[class2int["METEOROLOGY"]] = 5.0
        svm_domain_dict[class2int["HISTORY"]] = 5.0
        svm_domain_dict[class2int["ENGINEERING_AND_TECHNOLOGY"]] = 5.0

        x_weights = [svm_domain_dict[label] for label in y_train]

        classifier = svm.LinearSVC()
        classifier.fit(x_train, y_train, sample_weight=x_weights)
        y_pred = classifier.predict(x_devel)
        print(classification_report(y_devel, y_pred, target_names=classes_strings))
        return

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(VOCABULARY_SIZE,
                                EMBEDDING_SIZE,
                                weights=[embeddings],
                                input_length=MAX_DOC_LEN,
                                trainable=False)

    # two Conv1D + MaxPooling1D stages, then an LSTM and a dense head
    input_layer = Input(shape=(MAX_DOC_LEN,), dtype='float32')
    embeddings_output = embedding_layer(input_layer)
    x = Conv1D(128, 5, activation='relu')(embeddings_output)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = LSTM(150, dropout=0.2, recurrent_dropout=0.2)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    prediction_layer = Dense(len(class2int), activation='softmax')(x)

    model = Model(input_layer, prediction_layer)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    model.fit(x_train, y_train,
              batch_size=256,
              epochs=22,
              validation_data=(x_devel, y_devel),
              class_weight=domain_dict)
    model.save("model.h5")

    # The predicted class is the one with the highest probability.
    y_prob = model.predict(x_devel)
    y_pred = y_prob.argmax(axis=-1)
    plot_heat_matrix(confusion_matrix(y_devel, y_pred))
    print(classification_report(y_devel, y_pred, target_names=classes_strings))
pipeline.fit(X_train[idx], y_train[idx]) # evalutate it print(score_mcc(pipeline, X_test, y_test)) # Predicting test data and saving it for submission if (not args.output == ""): # fit train + test positive instances pipeline.fit( scipy.sparse.vstack((X_train[idx], X_test[y_test == 1])).tocsc(), np.concatenate((y_train[idx], y_test[y_test == 1]))) # free memory del X_train, X_test X_board = pre.load_dataset("../data/test_numeric.csv", batch=100000, no_label=True) X_board_cat = mmread('../data/test_categorical') X_board_date = pre.load_date_features("../data/test_date.csv", batch=100000) csvreader = csv.reader(open("../data/train_numeric.csv")) header = next(csvreader, None) X_board = scipy.sparse.hstack( (X_board, extract_missing(X_board, header[1:-1]), X_board_cat)).tocsr() X_board = scipy.sparse.hstack((X_board, X_board_date)).tocsr() del X_board_cat, X_board_date df = pd.read_csv("../data/sample_submission.csv")
import os import torch from torch.autograd import Variable from ray import tune from preprocessing import load_dataset from classification.preprocessing_for_classification import * from classification.classification import * from classification.model_classifier import * # Path to all data DATA_PATH = "../dataset_mri/lgg-mri-segmentation/kaggle_3m/" # Load dataset dataset = load_dataset(DATA_PATH) def hyperparam_optimizer(config): # Switch back to current dir instead of the custom one set by tune os.chdir(os.path.dirname(os.path.realpath(__file__))) # Initialize model, datasets and criterion train_dataloader, test_dataloader, val_dataloader = get_train_test_val_sets( dataset) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = build_model(4, 3, 10) criterion = CrossEntropyLoss() # Initialize the optimizer optimizer = SGD(model.parameters(),