def main():
    for filename in os.listdir(PATH_TRAIN):
        os.remove(os.path.join(PATH_TRAIN, filename))
    for filename in os.listdir(PATH_VAL):
        os.remove(os.path.join(PATH_VAL, filename))
    image_to_matrix.convert()
    split_data.split_data()
def estructure_data(work_dir):
    # os.path.dirname(os.path.realpath(__file__))
    root = os.path.join(work_dir, 'birds')
    if not os.path.exists(root):
        split_data()
    train_dir = os.path.join(root, 'train')
    valid_dir = os.path.join(root, 'valid')
    test_dir = os.path.join(root, 'test')
    return train_dir, valid_dir, test_dir
def split_and_write_data(data, mask, split_method, cell_dim, proportions, data_path, outfile_prefix):
    for method in split_method:
        if method == 'edge':
            sides = ['n', 's', 'e', 'w']
            for side in sides:
                data_split = split.split_data(data, mask, method, cell_dim, proportions, side)
                write_data(data_split, method, data_path, outfile_prefix, side)
        else:
            data_split = split.split_data(data, mask, method, cell_dim, proportions)
            write_data(data_split, method, data_path, outfile_prefix)
def __init__(self, dim, size):
    try:
        self.data = pickle.load(open("data_cluster.pickle", "rb"))
        self.clusters = pickle.load(open("clusters.pickle", "rb"))
    except (OSError, IOError) as e:
        # dim centroids for the generation of the synthetic dataset
        # centroids = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
        centroids = np.random.randn(4, dim)
        self.centroids = centroids
        data, clusters = sintetic_dataset(centroids, 20, size)
        data = add_nan(data, 0.001)
        train, validation, test, idx_train = split_data(data, 0.7, 0.2)
        pickle.dump(clusters[idx_train], open("clusters.pickle", "wb"))
        pickle.dump(train, open("data_cluster.pickle", "wb"))
        self.data = pickle.load(open("data_cluster.pickle", "rb"))
        self.clusters = pickle.load(open("clusters.pickle", "rb"))

    # pre-processing
    means, self.data = remove_mean_nan(self.data)
    print(self.data.shape[0])
    self.mean = np.mean(self.data, axis=0)
    self.std = np.std(self.data, axis=0)
    self.norm_2 = np.linalg.norm(self.data, axis=0)
    self.exp_col = exponencial_n_samples(0.4, self.data.shape[0])
    self.exp_col = np.reshape(self.exp_col, (self.data.shape[0], 1))
    self.data = np.append(self.data, self.exp_col, axis=1)
def build_regression(districts):
    figure = pl.figure(1)
    dl = len(districts)
    plot_idx = 1
    for district in districts:
        train_set, test_set = split_data(district=district)
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train, y_train = preprocess_data(train_set, min_max_scaler, True)
        X_test, y_test = preprocess_data(test_set, min_max_scaler, False)
        tuned_parameters = [
            {"kernel": ["rbf"], "C": [0.1, 1, 10, 100, 1000],
             "gamma": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
            {"kernel": ["linear"], "C": [0.1, 1, 10, 100, 1000]},
        ]
        # find optimal C and gamma
        grid = GridSearchCV(SVR(C=1, cache_size=400), tuned_parameters)
        grid.fit(X_train, y_train)
        district = district or "All"
        ax = plot_prediction(X_train, y_train, grid, figure, plot_idx, dl)
        plot_idx += 1
        ax.set_title("Support Vector Regression (train set, %s)" % district)
        ax = plot_prediction(X_test, y_test, grid, figure, plot_idx, dl)
        plot_idx += 1
        ax.set_title("Support Vector Regression (test set, %s)" % district)
    pl.show()
def main():
    train_set, test_set = split_data()
    X = np.matrix((
        np.ones(train_set.shape[0]),
        train_set['number_of_rooms'],
        train_set['living_space'])).T
    Y = np.matrix((train_set['price'])).T
    ne_theta = normal_equation.learn(X, Y)
    gd_theta = gradient_descent.learn(X, Y, 0.00015, 100)
    test_set = test_set[['number_of_rooms', 'living_space', 'price']].values
    for rooms, area, price in test_set:
        ne_price = int(round(predict(rooms, area, ne_theta)))
        gd_price = int(round(predict(rooms, area, gd_theta)))
        print('Number of rooms %s, area %s sqm:' % (rooms, area))
        print('actual price: %s EUR' % price)
        print('ne predict: %s EUR (%s%%)' % (ne_price, int(100. * ne_price / price)))
        print('gd predict: %s EUR (%s%%)' % (gd_price, int(100. * gd_price / price)))
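# A minimal sketch of the predict() helper assumed by the loop above (hypothetical;
# the real helper is defined elsewhere in that project). It evaluates the linear
# hypothesis theta_0 + theta_1 * rooms + theta_2 * area for a single flat, matching
# the column order used to build X in main().
import numpy as np


def predict(rooms, area, theta):
    x = np.matrix([1.0, rooms, area])
    return float((x * theta)[0, 0])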
def main(args):
    start_time = time.time()

    img_dir = args.data_dir + '/images'
    images = [os.path.join(img_dir, f) for f in os.listdir(img_dir)]
    images = sorted(images)
    mask_dir = args.data_dir + '/labels'
    masks = [os.path.join(mask_dir, f) for f in os.listdir(mask_dir)]
    masks = sorted(masks)

    loop = tqdm(range(len(images)))
    for idx in loop:
        img = cv2.imread(images[idx])
        mask = cv2.imread(masks[idx])
        h, w, _ = img.shape
        rows = h // args.patch_size
        cols = w // args.patch_size
        for i in range(0, rows):
            for j in range(0, cols):
                ymin = i * h // rows
                ymax = i * h // rows + h // rows
                xmin = j * w // cols
                xmax = j * w // cols + w // cols
                roi_img = img[ymin:ymax, xmin:xmax]
                roi_mask = mask[ymin:ymax, xmin:xmax]
                roi_img = cv2.resize(roi_img, (args.patch_size, args.patch_size),
                                     interpolation=cv2.INTER_CUBIC)
                roi_mask = cv2.resize(roi_mask, (args.patch_size, args.patch_size),
                                      interpolation=cv2.INTER_CUBIC)
                cv2.imwrite(f'{args.save_dir}/images/{idx:05d}.png', roi_img)
                cv2.imwrite(f'{args.save_dir}/labels/{idx:05d}.png', roi_mask)

    if args.split_train_val:
        split_data(f'{args.save_dir}/images', args.save_dir, True)

    end_time = time.time() - start_time
    print(f'Done! It took {end_time:.04f} seconds')
def main():
    train_set, test_set = split_data(district='Steglitz')
    #train_set, test_set = split_data()
    X_train = np.matrix([
        np.ones(train_set.shape[0]),
        train_set['number_of_rooms'],
        train_set['living_space']
    ]).T
    Y_train = np.matrix((train_set['price'])).T
    alpha = 0.00015
    n_iterations = 50
    thetas = gradient_descent.learn(X_train, Y_train, alpha, n_iterations, True)
    theta = thetas[-1]
    X_test = np.matrix([
        np.ones(test_set.shape[0]),
        test_set['number_of_rooms'],
        test_set['living_space']
    ]).T
    Y_test = np.matrix((test_set['price'])).T
    test_predictions = np.dot(X_test, theta)
    costs = calc_cost_functions(X_train, Y_train, thetas)
    print('Train error', calc_error(X_train, Y_train, theta))
    print('Test error', calc_error(X_test, Y_test, theta))
    print('Train R squared', r_squared(X_train, Y_train, theta))
    print('Test R squared', r_squared(X_test, Y_test, theta))
    figure = pl.figure(1)
    ax = figure.add_subplot(211)
    ax.scatter(test_set['price'], test_set['living_space'], label='test set')
    ax.scatter(train_set['price'], train_set['living_space'], color='g', label='train set')
    ax.plot(test_predictions, test_set['living_space'], color='red', label='regression')
    ax.set_xlabel('Living space, sqm')
    ax.set_ylabel('Price, EUR')
    ax.legend()
    ax = figure.add_subplot(212)
    ax.scatter(np.arange(len(costs)), costs)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Cost function')
    pl.show()
def main():
    x = []
    valid = []
    test = []
    for i in range(2, 68, 5):
        print("Current training set size: ", i)
        shutil.rmtree(TRAIN_DIR, ignore_errors=True)
        shutil.rmtree(TEST_DIR, ignore_errors=True)
        shutil.rmtree(VALIDATION_DIR, ignore_errors=True)
        split_data.split_data(i)
        theta = linear_regression.train()
        x.append(i)
        valid.append(linear_regression.validation(theta))
        test.append(linear_regression.eval(theta))
    plt.plot(x, valid)
    plt.plot(x, test)
    plt.ylabel('Accuracy')
    plt.xlabel('Training Set Size')
    plt.show()
def main():
    split_data.split_data(base_path, input_path, class_names)
    print("[INFO] completed data splitting")
    train_data, train_labels = image_preprocessing(train_dir)
    test_data, test_labels = image_preprocessing(test_dir, 'test')
    print("[INFO] completed data preprocessing")
    model = build_cnn_model()
    print("[INFO] completed model building")
    X_train, Y_train = bottleneck_feature_extractor(train_data, train_labels, model)
    X_test, Y_test = bottleneck_feature_extractor(test_data, test_labels, model)
    print("[INFO] completed bottleneck feature extraction")
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path, mode=0o777)
    save_features_labels('train', X_train, Y_train)
    save_features_labels('test', X_test, Y_test)
    print("[INFO] completed saving bottleneck features")
def split_orthography(trans, name):
    '''Split transcription into train and test data'''
    train, test = split_data(trans)
    trainfile = codecs.open("%s/%s_train.transcription" % (ETC, name), "w", ENC)
    testfile = codecs.open("%s/%s_test.transcription" % (ETC, name), "w", ENC)
    for entry in train:
        trainfile.write(entry)
    trainfile.close()
    for entry in test:
        testfile.write(entry)
    testfile.close()
    print("Data segmented into training and test data")
def train_model(spex, subject, date):
    gpus = tensorflow.config.experimental.list_physical_devices('GPU')
    tensorflow.config.experimental.set_memory_growth(gpus[0], True)
    k_folds = 10
    (define_model, epochs, L, Fs, nchan, modelName) = spex
    path = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/AD_crop/" + modelName + "/"
    names = glob.glob(path + '*/*', recursive=True)  #os.listdir(path)
    names = [x for x in names if "subj" + str(subject) not in x]
    #print(names)
    np.random.shuffle(names)
    vals = []
    #map = np.zeros([3,120])
    for i in range(0, 1):
        print("Fold number " + str(i + 1) + "!")
        (gen, genVal, trainlen, vallen) = split_data(["A", "B", "C"], k_folds, i, names,
                                                     path, spex, class_on_char=74)
        checkpoint_path_fold = checkpoint_path + date + "/fold" + str(i + 1) + "/cp-{epoch:04d}.ckpt"
        cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path_fold, save_weights_only=True, verbose=1)
        model = define_model(nchan, L, Fs)
        history = model.fit(gen,
                            validation_data=genVal,
                            steps_per_epoch=trainlen,
                            validation_steps=vallen,
                            epochs=30,
                            callbacks=[cp_callback],
                            verbose=2)
        #heatmap_mean = generate_gradCAM(model,spex,path,gen,trainlen)
        #plt.imshow(np.repeat(heatmap_mean,50,axis=0))
        #plt.show()
        #vals.append(history.history['accuracy'])
    return vals
print('Tokenized review: \n', reviews_ints[:1])

# removing outliers
from pre_process import remove_outliers
reviews_ints, encoded_labels = remove_outliers(reviews_ints, encoded_labels)

# padding the sequences to be of equal length
from pre_process import pad_features
seq_length = 200  # can be modified as needed
features = pad_features(reviews_ints, seq_length)

# creating data sets
from split_data import split_data
split_frac = 0.8
train_x, train_y, val_x, val_y, test_x, test_y = split_data(
    features, encoded_labels, split_frac)

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# create data loaders
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# data loaders
batch_size = 50
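# A minimal continuation sketch (assumed, not part of the original excerpt): wrap the
# Tensor datasets above in shuffled DataLoaders using the batch_size just defined,
# following the standard torch.utils.data API.
from torch.utils.data import DataLoader

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)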
    names.append(name)
    if name[0:13] == ('C' + subject):
        names.append(name)

k_folds = 10
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
val_accs = []
for i in range(0, k_folds):
    print("Fold number " + str(i + 1) + "!")
    (data_generator, data_generatorVal, l, lv) = split_data(['A', 'B', 'C'], k_folds, i,
                                                            names, path(), nchan, data_aug,
                                                            batch_size=batch_size)
    tensorboard_callback = load_tensorboard(who, date, i)
    #es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
    checkpoint_path_fold = checkpoint_path + date + "/fold" + str(i + 1) + "/cp-{epoch:04d}.ckpt"
    check_point_dir = os.path.dirname(checkpoint_path_fold)
    cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path_fold, save_weights_only=True, verbose=1)
    model = define_model(nchan, L, Fs, batch_size=batch_size)
    # Load weights:
    #model.load_weights("C:/Users/Oskar/Documents/GitHub/Exjobb/logs/model_check_points/20210126-143212/fold1/cp-0005.ckpt")
#     min_size=300, max_size=300
# )

# See the model architecture
print(model)

# use our dataset and defined transformations
dataset = Dataset(DATA_DIR, transforms=get_transform(train=True))
dataset_val = Dataset(DATA_DIR, transforms=get_transform(train=False))
dataset_test = Dataset(DATA_DIR, transforms=get_transform(train=False))

# split the dataset into train and validation sets
torch.manual_seed(1)

# get similarly distributed train, val and test set
sequences, sequenceStats = get_sequence_stats()
training_seq_indices, validation_seq_indices, testing_seq_indices = split_data(sequenceStats)
training_indices = seq_indices_to_frame_indices(training_seq_indices)  #dataset.ann = load_labels()
validation_indices = seq_indices_to_frame_indices(validation_seq_indices)  #dataset.ann = load_labels()
testing_indices = seq_indices_to_frame_indices(testing_seq_indices)  #dataset.ann = load_labels()

# not needed anymore
indices = torch.randperm(len(dataset)).tolist()

dataset_sub = torch.utils.data.Subset(dataset, training_indices)
dataset_val_sub = torch.utils.data.Subset(dataset_val, validation_indices)
dataset_test_sub = torch.utils.data.Subset(dataset_test, testing_indices)

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(dataset_sub,
#from crossval import *
from implementations import *
from helpers import *
from split_data import split_data
from classification_accuracy import *
from logreg import *
from create_data_with_jet import *
from build_polynomial import *

print("\n", '********************************************')

#%%Import
DATA_TRAIN_PATH = 'C:/Users/joeld/Desktop/EPFL/machine learning/AIAIaie/data/train.csv'
#DATA_TRAIN_PATH = '/Users/benoithohl/Desktop/epfl/master_epfl/Ma3/Machine_learning/AIAIaie/data/train.csv'  # TODO: download train data and supply path here
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
trainx, trainy, ids_train, validationx, validationy, ids_test = split_data(
    tX, y, ids, 0.8, seed=1)
print('Data loaded')

#%% Preprocessing
data_jetnum = np.array(
    create_data_with_jet(trainx, trainy, ids_train, validationx, validationy, ids_test))
print('Data matrix ready')

#%% parameters setting
#partition of the train set
lambda_ = 0.01
degree = 1
max_iters = 5000
#Open files for results
res1 = open('ref_res.txt', 'w')
res2 = open('alt_res.txt', 'w')

#Create lists with splicing events
ref = read_file(file1)
alt = read_file(file2)

#Create numpy arrays for lists with splicing arrays
ref = np.array(ref)
alt = np.array(alt)

#Scale arrays (for each value subtract mean and then divide by SD)
ref_scale, alt_scale = scale(ref, alt)

#Split data into train and test sets
X_train, X_test = split_data(ref_scale)

#Perform SVM with different parameters to choose the best one
y_pred_X_train, y_pred_X_test = svm_data(ref_scale, alt_scale)

#Write results
for i in y_pred_X_train:
    res1.write(str(i) + "\n")
for j in y_pred_X_test:
    res2.write(str(j) + "\n")
res1.close()
res2.close()
def main():
    # Locations for audio and midi source files (train and test are subfolders)
    audio_folder = 'audio_files'
    midi_folder = 'midi_files'

    # Subfolder in audio_folder to get wav files from, i.e. 'clean' or 'noise'
    audio_source = 'noise'

    # Locations for input data to tdnn
    tdnn_feat_train = 'data/tdnn/mfcc_feat_train.pkl'
    tdnn_feat_test = 'data/tdnn/mfcc_feat_test.pkl'
    tdnn_target_train = 'data/tdnn/target_train.pkl'
    tdnn_target_test = 'data/tdnn/target_test.pkl'
    target_corpus = 'data/hmm/target_corpus'

    # Locations for output data from tdnn
    tdnn_probs_train = 'data/hmm/train_probs.pkl'
    tdnn_probs_test = 'data/hmm/test_probs.pkl'
    notes_train = 'data/hmm/train_notes.pkl'
    notes_test = 'data/hmm/test_notes.pkl'

    # Path for saved tdnn model and hmm model
    tdnn_model_name = "models/tdnn.h5"
    hmm_model_name = "models/hmm.pkl"

    # Path for resulting midi files from tdnn and hmm
    output_midi_tdnn = 'output_midi/output_midi_tdnn.mid'
    output_midi_hmm = 'output_midi/output_midi_hmm.mid'

    stage = 1

    # Split data into train and test sets
    if stage <= 1:
        print("\nSplitting data...\n")
        wav_in = join(audio_folder, audio_source)
        wav_out = audio_folder
        mid_in = join(midi_folder, 'all')
        mid_out = midi_folder
        split_data(wav_in, wav_out, mid_in, mid_out)
        print("\nFinished splitting data...\n")

    # Create MFCC features for each test and train audio file
    if stage <= 2:
        print("\nGenerating MFCCs...\n")
        src_dir = 'audio_files/train'
        generate_features(src_dir, tdnn_feat_train)
        src_dir = 'audio_files/test'
        generate_features(src_dir, tdnn_feat_test)
        print("\nFinished generating MFCCs...\n")

    # Generate expected pitches for each MFCC from test, train, and corpus midi files
    if stage <= 3:
        print("\nGenerating target pitches...\n")
        src_dir = 'midi_files/train'
        src_audio_dir = 'audio_files/train'
        generate_outputs(src_dir, src_audio_dir, tdnn_target_train)
        src_dir = 'midi_files/test'
        src_audio_dir = 'audio_files/test'
        generate_outputs(src_dir, src_audio_dir, tdnn_target_test)
        # Generate expected pitches for corpus files (we don't need MFCCs for LM)
        src_dir = 'midi_files/corpus'
        src_audio_dir = 'audio_files/clean'
        generate_outputs(src_dir, src_audio_dir, target_corpus)
        print("\nFinished generating target pitches...\n")

    # Train TDNN on MFCCs and target pitch data
    if stage <= 4:
        print("\nTraining time-delay neural network...\n")
        tdnn_train(tdnn_feat_train, tdnn_target_train, tdnn_model_name)
        print("\nFinished training time-delay neural network...\n")

    # Make predictions based on TDNN
    if stage <= 5:
        print("\nPredicting note probabilities using TDNN...\n")
        tdnn_predict(tdnn_model_name, tdnn_feat_train, tdnn_target_train,
                     tdnn_feat_test, tdnn_target_test, output_midi_tdnn)
        print("\nFinished predicting note probabilities using TDNN...\n")

    # Train HMM on TDNN output probabilities
    if stage <= 6:
        print("\nTraining hidden markov model...\n")
        hmm_train(tdnn_probs_train, notes_train, target_corpus, tdnn_target_train, hmm_model_name)
        print("\nFinished training hidden markov model...\n")

    # Decode HMM using TDNN output probabilities
    if stage <= 7:
        print("\nDecoding hidden markov model...\n")
        hmm_predict(tdnn_probs_test, notes_test, hmm_model_name, output_midi_hmm)
        print("\nFinished decoding hidden markov model...\n")
from normalize_data import rescaleNormalize
from split_data import split_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# read data and normalize it.
dataDf = read_data("sat.csv", [1, 2, 4])
data = rescaleNormalize(dataDf)
data = data.values

# split data
X = np.ones((data.shape[0], data.shape[1]))
X[:, 1:3] = data[:, :2]
Y = data[:, 2]
X_train, X_test, Y_train, Y_test = split_data(X, Y, 0.66)

# final parameters:
ALPHA = 0.05
ITERATIONS = 200

# call GradientDescent function to train the model
theta = np.zeros(X.shape[1])
theta, costList = GradientDescent(X_train, Y_train, theta, ITERATIONS, ALPHA)

# visualize the convergence curve
plt.plot(range(0, len(costList)), costList)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.title('alpha = {} theta = {}'.format(ALPHA, theta))
plt.show()
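# A minimal evaluation sketch (added here, not part of the original script): reuse the
# fitted theta on the held-out split and report mean squared error. Assumes X_train and
# X_test still carry the bias column built above.
def mean_squared_error(X, Y, theta):
    residuals = X.dot(theta) - Y
    return float(np.mean(residuals ** 2))


print('train MSE: {:.4f}'.format(mean_squared_error(X_train, Y_train, theta)))
print('test MSE: {:.4f}'.format(mean_squared_error(X_test, Y_test, theta)))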
from SQL_connection import insert_val

# use these columns as features
# dropped amount_mean_lag7 to avoid errors
feat_merch = ['description', 'transaction_category_name', 'amount', 'state',
              'city', 'transaction_base_type', 'transaction_origin']

df = df_encoder(rng=14,
                spending_report=False,
                plots=False,
                include_lag_features=False)

X_train, X_train_scaled, X_train_minmax, X_test, X_test_scaled, \
    X_test_minmax, y_train, y_test = split_data(df=df,
                                                features=feat_merch,
                                                test_size=0.2,
                                                label='primary_merchant_name')

# convert train data to ndarray to avoid feature_names mismatch error
X_array = X_train.values
y_array = y_train.values
Xt_array = X_test.values
yt_array = y_test.values

# X_train and y_train used to train pipeline
xgb_clf_object = pipeline_xgb(x=X_array,
                              y=y_array,
                              test_features=Xt_array,
                              test_target=yt_array,
                              verb=False)
def main():
    if not RUN_DIRTY:
        clean_up()

    if not RUN_DIRTY and not os.path.exists(TRAINING_PATH) and not os.path.exists(TEST_PATH):
        print("Splicing raw data")
        split_data.split_data(RAW_PATH)

    training_labels_file = TRAINING_PATH + "/_label"
    test_labels_file = TEST_PATH + "/_label"

    print("Reading labels")
    training_labels = file_util.read_line_list(training_labels_file)
    test_labels = file_util.read_line_list(test_labels_file)

    training_tokens_path = tokenizer.get_token_path(TRAINING_PATH)
    test_tokens_path = tokenizer.get_token_path(TEST_PATH)

    print("Tokenizing...")
    if not os.path.exists(training_tokens_path):
        print("Tokenizing training set...")
        tokenizer.tokenize_path(TRAINING_PATH)
        print("Training set tokenization complete")
    if not os.path.exists(test_tokens_path):
        print("Tokenizing test set...")
        tokenizer.tokenize_path(TEST_PATH)
        print("Test set tokenization complete")

    print("Reading tokens")
    training_set_tokens = article_util.load_tokenized_articals(training_tokens_path)
    test_set_tokens = article_util.load_tokenized_articals(test_tokens_path)

    print("Training naive bayes")
    naive_bayes = NaiveBayes(training_set_tokens, training_labels)

    print("Validating with training set")
    training_true_positives = 0
    training_false_positives = 0
    training_false_negative = 0
    for i in range(len(training_set_tokens)):
        predictedClass = naive_bayes.classify(training_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + training_labels[i] + " as " + predictedClass)
        if predictedClass == training_labels[i]:
            training_true_positives += 1
        else:
            training_false_positives += 1
            training_false_negative += 1
    training_precisions = training_true_positives / (training_true_positives + training_false_positives)
    training__recall = training_true_positives / (training_true_positives + training_false_negative)
    training_class_f_score = (2 * training_precisions * training__recall) / (training_precisions + training__recall)
    print("Training Precision " + str(training_precisions))
    print("Training Recall " + str(training__recall))
    print("Training F-Score " + str(training_class_f_score))

    print("*" * 50)

    print("Validating with test set")
    test_true_positives = 0
    test_false_positives = 0
    test_false_negative = 0
    for i in range(len(test_set_tokens)):
        predictedClass = naive_bayes.classify(test_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + test_labels[i] + " as " + predictedClass)
        if predictedClass == test_labels[i]:
            test_true_positives += 1
        else:
            test_false_positives += 1
            test_false_negative += 1
    test_precisions = test_true_positives / (test_true_positives + test_false_positives)
    test__recall = test_true_positives / (test_true_positives + test_false_negative)
    test_class_f_score = (2 * test_precisions * test__recall) / (test_precisions + test__recall)
    print("Test Precision " + str(test_precisions))
    print("Test Recall " + str(test__recall))
    print("Test F-Score " + str(test_class_f_score))
parser.add_argument("--nnz_per_slot", type=int, help="the number of keys in each slot", required=True) parser.add_argument("--vocabulary_size", type=int, required=False, default=1024 * 8) parser.add_argument("--iter_num", type=int, help="the number of training iterations", required=True) parser.add_argument("--filename", type=str, help="the filename used to save the generated datas.", required=False, default=r"./datas.file") parser.add_argument("--split_num", type=int, required=True, help="the number of shards to be splited.") parser.add_argument("--save_prefix", type=str, required=True, help="the prefix used to save splits.") args = parser.parse_args() generate_datas(args) split_data(args.filename, args.split_num, args.save_prefix)
    parent = parent.split('_')
    lab = True if lab == '+' else False
    features = make_vector(parent=parent[1],
                           parent_pos=parent[2][0],
                           child=child[1],
                           child_pos=child[2][0],
                           custom={'Pid': parent[0], 'Cid': child[0]})
    annot_data.append({**features, **{'result': lab}})

# split annotated data on train/validation/holdout
divided = split_data(relations, train=0.65, validation=0.15, holdout=0.2, random_seed=24)
for item in annot_data:
    parent = item['Pid'] + '_' + item['parent'] + '_' + item['parentPos']
    child = item['Cid'] + '_' + item['child'] + '_' + item['childPos']
    item['data'] = divided[(parent, child)]

# feature analysis/selection
if par.fsmi or par.fsce:
    # basic preprocessing
    dfc = pd.DataFrame(annot_data)
    dfc = dfc.drop(columns=['child', 'parent', 'Pid', 'Cid'], axis=1)
    # calculating mutual information
    if par.fsmi:
cd.convert_discontinuous_variable(data_frame)
#print((data_frame.iloc[:,21:]).head())

# normalize otherwise it will overflow
data_frame = (data_frame - data_frame.mean()) / data_frame.std()
data_frame.rename(columns={"class": "MPG"}, inplace=True)
data_frame = data_frame.drop(["weight"], axis=1)

# extract dependent variable from the data
y = (data_frame["MPG"].values)
y = y.reshape(398, 1)
y_column = "MPG"
X = (data_frame.loc[:, data_frame.columns != "MPG"])
#X = (X.iloc[:,0:2]).values
X = X.values
train_x, train_y, test_x, test_y = sp.split_data(X, y)
X_column = (data_frame.loc[:, data_frame.columns != "MPG"])
x_column = X_column.columns.values

# add y columns
X = gd.add_y_intercept(X)
theta = np.matrix(np.zeros([train_x.shape[1], 1]))

# set hyper parameters
alpha = 0.001
iters = 10000
g, cost = gd.gradient_descent(train_x, train_y, theta, alpha, iters)
x_column = x_column.reshape(24, 1)
def train_model(spex, subject, date, pretrain=False): batch_size = 1 k_folds = 10 (define_model, epochs, L, Fs, nchan, modelName) = spex path = "C:/Users/Kioskar/Desktop/Testing exjobb/EmoDecode1/Study/" + modelName + "/subj" + str( subject) + "/" path2 = "C:/Users/Kioskar/Desktop/Testing exjobb/EmoDecode1/Retrieval/" + modelName + "/subj" + str( subject) + "/" names = os.listdir(path) np.random.shuffle(names) vals = [] confusion_matrix_F = np.zeros([k_folds, 3, 3]) confusion_matrix_S = np.zeros([k_folds, 3, 3]) for i in range(0, k_folds): gpus = tensorflow.config.experimental.list_physical_devices('GPU') print(gpus) print(tensorflow.__version__) tensorflow.config.experimental.set_memory_growth(gpus[0], True) print(tensorflow.config.experimental.get_memory_growth(gpus[0])) print("Fold number " + str(i + 1) + "!") (gen, genVal, trainlen, vallen, val_names) = split_data(["A", "B", "C"], k_folds, i, names, path, spex, batch_size=batch_size) checkpoint_path_fold = checkpoint_path + date + "/fold" + str( i + 1) + "/cp-{epoch:04d}.ckpt" #cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_fold,save_weights_only=True,verbose=1) model = define_model(nchan, L, Fs) #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210311-183357/fold1/cp-0042.ckpt") #Subj03 history = model.fit( gen, validation_data=genVal, steps_per_epoch=int(trainlen / batch_size) + 1, validation_steps=int(vallen / batch_size) + 1, epochs=epochs, #callbacks=[cp_callback], verbose=2) vals.append(history.history['val_accuracy']) #heatmap_mean = generate_gradCAM(model,spex,path) #plt.imshow(np.repeat(heatmap_mean,50,aixs=0)) #plt.show() labels = np.zeros([vallen, 3]) labels_FS = np.zeros(vallen) for j in range(0, vallen): (_, labels[j, :]) = next(genVal) if val_names[j][4] == "F": labels_FS[j] = 1 print(labels_FS) val_preds = np.argmax(model.predict(genVal, steps=vallen), axis=1) confusion_matrix_F[i, :, :] = tensorflow.math.confusion_matrix( np.argmax(labels, axis=1)[labels_FS == 1], val_preds[labels_FS == 1]) print(np.argmax(labels, axis=1)[labels_FS == 0]) print(val_preds[labels_FS == 0]) confusion_matrix_S[i, :, :] = tensorflow.math.confusion_matrix( np.argmax(labels, axis=1)[labels_FS == 0], val_preds[labels_FS == 0]) print(confusion_matrix_F[i, :, :]) print(confusion_matrix_S[i, :, :]) names2 = os.listdir(path2) (gen2, _, trainlen2, _, _) = split_data(["A", "B", "C"], 100, 0, names2, path2, spex, batch_size=batch_size) history2 = model.evaluate(gen2, steps=trainlen2) print(history2) del history del model tensorflow.keras.backend.clear_session() tensorflow.compat.v1.reset_default_graph() def limit_mem(): tensorflow.config.experimental.get_session().close() #limit_mem() print("final conf") print(np.mean(confusion_matrix_F, axis=0)) print(np.mean(confusion_matrix_S, axis=0)) return vals
    y_fn = directory + '/training_solutions_rev1.csv'
    y = np.genfromtxt(y_fn, delimiter=',')
    y = y[1:, :]
    print('Finished loading input after ' + str(time.time() - start) + 'sec')
    return y


if __name__ == '__main__':
    # define input arguments
    directory = '/Users/Karen_Loscocco/Desktop/galaxy-zoo-the-galaxy-challenge'
    size = 424
    trim = 100
    testsize = 0.33
    randnum = 42

    X = get_X_train(directory, size, trim)
    y = get_y_train(directory)

    X_train, X_test, y_train, y_test = split_data.split_data(X, y, testsize, randnum)

    np.save('X_train', X_train)
    np.save('X_test', X_test)
    np.save('y_train', y_train)
    np.save('y_test', y_test)
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)


# Use the SpamAssassin dataset
# replace with your path
path = r"C:\spam\*\*"
data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
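# A minimal follow-up sketch (assumed usage): score the held-out messages, treating
# classify() as returning the spam probability computed in the function body above.
# The 0.5 threshold and the Counter-based summary are illustrative choices only.
from collections import Counter

classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)
print(counts)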
def main(): # Parse the training argument parser = argparse.ArgumentParser() parser.add_argument( '--training', help= 'Decide whether to train the model or just run testing on previously saved model.' ) args = parser.parse_args() is_training = args.training if (is_training is None) or (is_training == 'True'): is_training = True else: is_training = False print('is_training mode = ', is_training) chamferDist = ChamferDistance() # Decide on GPU or CPU if torch.cuda.is_available(): gpu_or_cpu = torch.device('cuda') else: gpu_or_cpu = torch.device('cpu') # Training Configuration # image_root = "./../../../datasets/cs253-wi20-public/ShapeNetRendering/" # point_cloud_root = "./../../../datasets/cs253-wi20-public/ShapeNet_pointclouds/" image_root = "/datasets/cs253-wi20-public/ShapeNetRendering/" point_cloud_root = "/datasets/cs253-wi20-public/ShapeNet_pointclouds/" num_epochs = 1000 batch_size = 64 shuffle = True num_workers = 8 use_2048 = True img_size = 227 # I don't know why, but this has to be 227! learning_rate = 1e-4 num_points = 2048 transform = transforms.Compose([ transforms.Resize(img_size, interpolation=2), transforms.CenterCrop(img_size), transforms.ToTensor() ]) # Checkpoint use_checkpoint = False # Split and Get data. Override the saved files if you change the ratios. train_ratio = 0.8 val_ratio = 0.1 test_ratio = 0.1 split_data(train_ratio, val_ratio, test_ratio, overrideFiles=False) path_train = 'train_data.txt' path_val = 'val_data.txt' path_test = 'test_data.txt' train_data = read_from_file(path_train) val_data = read_from_file(path_val) test_data = read_from_file(path_test) # Data loader train_data_loader = get_loader(image_root, point_cloud_root, train_data, use_2048, transform, batch_size, shuffle, num_workers) val_data_loader = get_loader(image_root, point_cloud_root, val_data, use_2048, transform, batch_size, shuffle, num_workers) test_data_loader = get_loader(image_root, point_cloud_root, test_data, use_2048, transform, batch_size, shuffle, num_workers) print('Len of train loader = ', len(train_data_loader)) # create model print("model building...") model = pic2points(num_points=num_points) model.to(device=gpu_or_cpu) if is_training: # Train print('Starting training...') train_losses, val_loss, best_model = train( model, train_data_loader, val_data_loader, chamferDist, model_name="Baseline_DL_Vis", num_epochs=num_epochs, lr=learning_rate, use_checkpoint=use_checkpoint) else: best_model = torch.load('best-Baseline_DL_Vis.pt') print('Loaded previously saved model.') model = best_model.cuda() model.eval() # Compute chamfer distance on Pix3D dataset. img_path = "/datasets/cs253-wi20-public/pix3d/" pc_path = "/datasets/cs253-wi20-public/pix_pointclouds/" objects = ['table', 'sofa'] test_dataset = TestDataset(img_path, pc_path, objects) test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=1, shuffle=True, num_workers=8) print('Starting testing on Pix3D dataset...') total_test_loss = 0. # Get loss on training data. 
    with torch.no_grad():
        for i, (image, point_cloud) in enumerate(test_data_loader):
            image, point_cloud = Variable(image), Variable(point_cloud)
            # print(image.size())
            if (image.size(1) != 3):
                continue
            # print('reaching.')
            image, point_cloud = (image.float().to(device=gpu_or_cpu),
                                  point_cloud.float().to(device=gpu_or_cpu))
            pred = model(image)
            dist1, dist2 = chamferDist(pred, point_cloud)
            loss = (torch.mean(dist1)) + (torch.mean(dist2))
            # emd_cost = torch.sum(dist(pred.cuda().double(), points.cuda().double()))
            total_test_loss += loss.item()
            # print(total_test_loss)
            # break
            if i % 100 == 0:
                print('Batch ' + str(i) + ' finished.')

    print('Chamfer distance on Pix3D dataset = ', total_test_loss / len(test_data_loader))
from split_data import split_data

if __name__ == '__main__':
    ## test on split_data func
    data_name = 'ring'
    data_path = 'data/{}.csv'.format(data_name)
    num_folds = 5
    split_data(data_path, data_name, num_folds)
"""Creates the Ancient_Greek_ML dataset and then prepares the train, dev and test sets for the character-level BERT.""" from clean_data import clean_data from sentence_tokenization import sentence_tokenize_corpus from split_data import split_data import os os.chdir("../data") clean_data() sentence_tokenize_corpus() split_data()
def train_test( data, instance_testing_size, forecast_horizon, feature_or_covariate_set, history_length, model='knn', base_models=None, model_type='regression', model_parameters=None, feature_scaler='logarithmic', target_scaler='logarithmic', labels=None, performance_measures=['MAPE'], performance_mode='normal', performance_report=True, save_predictions=True, verbose=0): """ Parameters: data: Pandas DataFrame a preprocessed DataFrame to be used for training the model and making predictions on the test part instance_testing_size: int or float the size of testing instances forecast_horizon: int forecast horizon to gap consideration in data splitting process by the gap, we mean the number of temporal units which are excluded from data to simulate the situation of real prediction in which we do not have access to the information of forecast horizon-1 units before the time point of the target variable. feature_or_covariate_set: list<string> a list of covariates or features which feature selection process will be based on them if historical data is provided, the input will be considered as a feature list, otherwise as a covariate list history_length: int history length of the input "data", history length is just used for the reports in "train_test" model: string or callable or dict string: one of the pre-defined model names function: a user-defined function dict: pre-defined model names and corresponding hyper parameters pre-defined model names: 'knn', 'nn' , 'gbm', 'glm' model_type: string model_parameters: list<int> or None feature_scaler: string target_scaler: string labels: list<int> or None performance_measures: list<string> a list of performance measures that the user wants to calculate the errors on predictions of test dataset performance_mode: string performance_report: bool if True, some tables containing a report on models and their corresponding errors (based on performance_measurements) will be saved in the same directory save_predictions: bool if True, the prediction values of trained models for training data and validation data through train_and_evaluate process will be saved in the same directory as your program is running as in ‘.csv’ format verbose: int the level of produced detailed logging information available options: 0: no logging 1: only important information logging 2: all details logging Returns: model: string or callable or dict exactly same as the 'model' parameter model_parameters: list<int> """ warnings.filterwarnings("once") ################################ checking for TypeError and other possible mistakes in the inputs if not(isinstance(data, pd.DataFrame)): raise TypeError("Expected a pandas DataFrame for data.") if not(isinstance(instance_testing_size, int) or isinstance(instance_testing_size, float)): raise TypeError("Expected an integer or a float number for instance_testing_size.") if not(isinstance(forecast_horizon, int)): raise TypeError("Expected an integer for forecast_horizon.") if not(isinstance(feature_or_covariate_set, list)): raise TypeError("Expected a list of strings for feature_or_covariate_set.") if not(isinstance(history_length, int)): raise TypeError("Expected an integer for history_length.") if not(isinstance(model, str) or callable(model) or isinstance(model, dict)): raise TypeError("Expected a string or function or a dictionary of model parameters for model.") if not(isinstance(model_type, str)): raise TypeError("Expected a string for model_type.") if not(isinstance(model_parameters, dict) or model_parameters == None): raise 
TypeError("Expected a dictionary or None value for model_parameters.") if not(isinstance(feature_scaler, str) or feature_scaler == None): raise TypeError("Expected a string or None value for feature_scaler.") if not(isinstance(target_scaler, str) or target_scaler == None): raise TypeError("Expected a string or None value for target_scaler.") if not(isinstance(labels, list) or labels == None): raise TypeError("Expected a list or None value for labels.") if not(isinstance(performance_measures, list)): raise TypeError("Expected a list for performance_measures.") if not(isinstance(performance_mode, str)): raise TypeError("Expected a string for performance_mode.") if not(isinstance(performance_report, bool)): raise TypeError("Expected a bool variable for performance_report.") if not(isinstance(save_predictions, bool)): raise TypeError("Expected a bool variable for save_predictions.") if not(isinstance(verbose, int)): raise TypeError("Expected an integer (0 or 1 or 2) for verbose.") ################################ # classification checking if model_type == 'classification': if not set(performance_measures) <= set(configurations.CLASSIFICATION_PERFORMANCE_MEASURES): raise Exception("Error: The input 'performance_measures' is not valid according to 'model_type=classification'.") if performance_mode != 'normal': performance_mode = 'normal' print("Warning: The input 'performance_mode' is set to 'normal' according to model_type=classification'.") if target_scaler is not None: target_scaler = None print("Warning: The input 'target_scaler' is set to None according to model_type=classification'.") # get some information of the data target_mode, target_granularity, granularity, data = get_target_quantities(data=data.copy()) # get the target temporal id from temporal id # if target temporal id is already in the data, call is from inside the predict function # otherwise backup file must be removed if 'target temporal id' in data.columns: data = data.rename(columns={'target temporal id':'temporal id'}) else: data, _ = get_target_temporal_ids(temporal_data = data.copy(), forecast_horizon = forecast_horizon, granularity = granularity) if os.path.isfile('test_process_backup.csv'): os.remove('test_process_backup.csv') # check rows related to future prediction are removed and if not then remove them temp_data = data.sort_values(by = ['temporal id','spatial id']).copy() number_of_spatial_units = len(temp_data['spatial id'].unique()) if all(temp_data.tail(granularity*forecast_horizon*number_of_spatial_units)['Target'].isna()): data = temp_data.iloc[:-(granularity*forecast_horizon*number_of_spatial_units)] # check if model is a string or function model_name = '' if isinstance(model, str) == False: model_name = model.__name__ if model_name in ['nn', 'knn', 'glm', 'gbm']: raise TypeError("Name of the user defined model matches the name of one of our predefined models.") else: model_name = model # find labels for classification problem if labels == None: if model_type == 'regression': # just an empty list labels = [] elif model_type == 'classification': # unique values in 'Target' column of data labels = data.Target.unique() labels.sort() # select features processed_data = select_features( data=data.copy(), ordered_covariates_or_features=feature_or_covariate_set ) # splitting data in the way is set for train_test training_data, _, testing_data, gap_data = split_data( data=processed_data.copy(), splitting_type='instance', instance_testing_size=instance_testing_size, instance_validation_size=None, 
instance_random_partitioning=False, fold_total_number=0, fold_number=0, forecast_horizon=forecast_horizon, granularity=granularity, verbose=verbose ) # separate some data which are needed later base_data = training_data['Target'].values.tolist() training_target = training_data[['spatial id', 'temporal id', 'Target', 'Normal target']] test_target = testing_data[['spatial id', 'temporal id', 'Target', 'Normal target']] # scaling data training_data, testing_data = data_scaling( train_data=training_data.copy(), test_data=testing_data.copy(), feature_scaler=feature_scaler, target_scaler=target_scaler ) # training model with processed data training_predictions, testing_predictions, trained_model, number_of_parameters = inner_train_evaluate( training_data=training_data.copy(), validation_data=testing_data.copy(), model=model, model_type=model_type, model_parameters=model_parameters, labels=labels, base_models = base_models, verbose=verbose ) # target descale training_predictions = target_descale( scaled_data=list(training_predictions), base_data=base_data, scaler=target_scaler ) testing_predictions = target_descale( scaled_data=list(testing_predictions), base_data=base_data, scaler=target_scaler ) # checking for some files to exit which will be used in the next phases test_process_backup_file_name = 'test_process_backup.csv' if pathlib.Path(test_process_backup_file_name).is_file() == False: if model_type == 'regression': df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target', 'prediction']) elif model_type == 'classification': df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target']+\ ['prediction class '+str(class_num) for class_num in range(np.array(testing_predictions).shape[1])]) df.to_csv(test_process_backup_file_name, index=False) # getting back previous points (useful for one-by-one method, also works for one-as-whole method) previous_test_points = pd.read_csv(test_process_backup_file_name) # append current point to previous points test_target = test_target.append(previous_test_points[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True) if model_type == 'regression': previous_testing_predictions = previous_test_points['prediction'].tolist() testing_predictions = list(testing_predictions) + previous_testing_predictions elif model_type == 'classification': previous_testing_predictions = previous_test_points.filter(regex='^prediction class ',axis=1) testing_predictions = np.concatenate((np.array(testing_predictions),np.array(previous_testing_predictions))) testing_predictions_df = pd.DataFrame(testing_predictions) testing_predictions_df.columns = ['prediction class '+str(class_num) for class_num in testing_predictions_df.columns] # saving test_target+testing_predictions into a backup file to be used in the next point df_for_backup = test_target.copy() if model_type == 'regression': df_for_backup.insert(loc=len(df_for_backup.columns), column='prediction', value=testing_predictions) elif model_type == 'classification': df_for_backup = pd.concat([df_for_backup,testing_predictions_df],axis = 1) df_for_backup.to_csv(test_process_backup_file_name, index=False) # get normal data training_target, test_target, training_prediction, test_prediction = get_normal_target( training_target=training_target.append(gap_data[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True), test_target=test_target.copy(), training_prediction=list(training_predictions) + gap_data['Target'].tolist(), 
test_prediction=testing_predictions, target_mode=target_mode, target_granularity=target_granularity ) # make copy of some data to be stores later test_target_normal, test_prediction_normal = test_target.copy(), test_prediction.copy() # including performance_mode training_target, test_target, training_prediction, test_prediction = apply_performance_mode( training_target=training_target.copy(), test_target=test_target.copy(), training_prediction=list(training_prediction), test_prediction=test_prediction, performance_mode=performance_mode ) # computing trivial values for the test set (just when want to calculate MASE) if 'MASE' in performance_measures: _, _, _, testing_true_values, testing_predicted_values, testing_trivial_values = get_trivial_values( train_true_values_df=training_target.copy(), validation_true_values_df=test_target.copy(), train_prediction=list(training_prediction), validation_prediction=test_prediction, forecast_horizon=forecast_horizon, granularity=granularity ) # computing performnace on test dataset test_prediction_errors = performance( true_values=testing_true_values, predicted_values=testing_predicted_values, performance_measures=performance_measures, trivial_values=testing_trivial_values, model_type=model_type, num_params=number_of_parameters, labels=labels) else: # computing performnace on test dataset test_prediction_errors = performance( true_values=test_target['Normal target'], predicted_values=test_prediction, performance_measures=performance_measures, trivial_values=[], model_type=model_type, num_params=number_of_parameters, labels=labels) # checking for existance of some directories for logging purpose if pathlib.Path('prediction/test process').is_dir() == False: pathlib.Path('prediction/test process').mkdir(parents=True, exist_ok=True) if pathlib.Path('performance/test process').is_dir() == False: pathlib.Path('performance/test process').mkdir(parents=True, exist_ok=True) # saving predictions based on model_type pred_file_name = 'prediction/test process/test prediction forecast horizon = %s.csv' % (forecast_horizon) testing_predictions = np.array(testing_predictions) if save_predictions == True: if model_type == 'regression': df = pd.DataFrame() df['real'] = test_target_normal['Normal target'].values.tolist() df['prediction'] = list(test_prediction_normal) df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True) df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True) df.insert(0, 'model name', model_name, True) df.to_csv(pred_file_name, index=False) elif model_type == 'classification': df = pd.DataFrame() df['real'] = test_target_normal['Normal target'].values.tolist() for i in range(len(labels)): col_name = 'class ' + str(labels[i]) df[col_name] = testing_predictions[:, i] df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True) df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True) df.insert(0, 'model name', model_name, True) df.to_csv(pred_file_name, index=False) # saving performance (same approach for both regression and classification) performance_file_name = 'performance/test process/test performance report forecast horizon = %s.csv' % (forecast_horizon) # selecting temporal and futuristic features or covariates from the feature_or_covariate_set list check_list = [item for item in feature_or_covariate_set if item.count(' ') != 0] # type_flag for detecting feature type (False) or covariate type (True) # check if all elements in check_list meet 
the condition for being covariate type type_flag = all(re.search(' t$', element) or re.search(' t[+]$', element) for element in check_list) processed_feature_or_covariate_set = [] # a list to be saved in performance report file if type_flag == 1: for item in feature_or_covariate_set: if item.count(' ') != 0: processed_feature_or_covariate_set.append(item[:-2]) else: processed_feature_or_covariate_set.append(item) else: processed_feature_or_covariate_set = feature_or_covariate_set.copy() if performance_report == True: df_data = { 'model name': list([model_name]), 'history length': list([history_length]), 'feature or covariate set': ', '.join(processed_feature_or_covariate_set) } df = pd.DataFrame(df_data, columns=list(df_data.keys())) for i in range(len(performance_measures)): df[performance_measures[i]] = list([float(test_prediction_errors[i])]) df.to_csv(performance_file_name, index=False) return trained_model
#%%Import
print('Data is loading')
DATA_TRAIN_PATH = '/Users/benoithohl/Desktop/epfl/master_epfl/Ma3/Machine_learning/AIAIaie/data/train.csv'  # TODO: download train data and supply path here
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
print('Data loaded')

#%% Preprocessing
Data = remove_features_with_too_many_missing_values(tX, 0.66)
Data = replace_missing_values_with_global_mean(Data)
ZData = Z_score_of_each_feature(Data)
print('Data matrix ready')

#%% parameters setting
#partition of the train set
trainx, trainy, validationx, validationy = split_data(ZData, y, 0.75, seed=1)
initial_w = np.zeros(trainx.shape[1])
max_iters = 100
gamma = 0.1
batch_size = 10
lambdas_vector = np.logspace(-3, 0, num=15)
#lambdas_vector = np.linspace(0, 1, num=15)
print('parameters set', "\n")

#%% Ridge
performance_ridge = []
performance_training = []
for lambda_ in lambdas_vector:
    w = ridge_regression(trainy, trainx, lambda_)
    weights = np.asarray(w)
    y_pred = predict_labels(weights, validationx)
def on_created(self, event):
    print("[{}] noticed: [{}] on: [{}] ".format(time.asctime(), event.event_type, event.src_path))
    split_data(event.src_path)
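# A minimal usage sketch (assumed wiring, not shown in the original): handlers like the
# on_created() above are typically attached to a watchdog Observer so that split_data()
# runs whenever a new file lands in the watched directory. "NewFileHandler" and WATCH_DIR
# are placeholder names for the real handler class and path.
import time
from watchdog.observers import Observer

WATCH_DIR = "./incoming"

observer = Observer()
observer.schedule(NewFileHandler(), WATCH_DIR, recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()
observer.join()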
def train_model(spex, subject, date, pretrain=False): batch_size = 1 k_folds = 10 (define_model, epochs, L, Fs, nchan, modelName) = spex path = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/all_subj_crop/" + modelName + "/subj" + str( subject) + "/" #path2 = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/all_subj_crop/" + modelName + "/subj" + str(subject) + "/" names = os.listdir(path) # classes = ["A","B","C"] k = k_folds # list_names = [[],[],[]] # i = 0 # for c in classes: # the_names = [idx for idx in names if idx[0].lower() == c.lower()] # list_names[i] = np.array_split(the_names,k) # i = i + 1 # def methodToLoad(files,path,spex,batch_size=1): # (_,_,L,Fs,nchan,modelName) = spex # train_0 = np.zeros([batch_size,Fs,L,nchan]) # for i,imID in enumerate(files): # spec = np.loadtxt(path+imID,delimiter=',') # spec = np.reshape(spec,[nchan,L,Fs]) # spec = np.transpose(spec,[2,1,0]) # train_0[i,:,:,:] = spec # return train_0 # files = [[],[],[]] # for i in range(0,3): # for the_names in list_names[i]: # files[i].append(methodToLoad(the_names,path,spex,batch_size=len(the_names))) np.random.shuffle(names) vals = [] #tracker = SummaryTracker() #vals_pretrain = [] confusion_matrix = np.zeros([k_folds, 3, 3]) for i in range(0, k_folds): gpus = tensorflow.config.experimental.list_physical_devices('GPU') tensorflow.config.experimental.set_memory_growth(gpus[0], True) print("Fold number " + str(i + 1) + "!") #fold_files = [] #fold_files_val = [] #class_names = [] #class_names_val = [] #for j in range(0,3): # class_names.extend(np.hstack(np.delete(list_names[j], i, 0)).transpose()) # class_names_val.extend(list_names[j][i]) # fold_files.extend(np.delete(files[j], i, 0)) # fold_files_val.extend(files[j]) #fold_files = np.vstack(fold_files) #fold_files_val = np.vstack(fold_files_val) #genVal = signalLoader(class_names_val,fold_files_val,path,spex,batch_size=batch_size,class_on_char=0) #gen = signalLoader(class_names,fold_files,path,spex,batch_size=batch_size,class_on_char=0) #print(class_names) #print(class_names_val) #trainlen = len(class_names) #vallen = len(class_names_val) #print(vallen) (gen, genVal, trainlen, vallen) = split_data(["A", "B", "C"], k_folds, i, names, path, spex, batch_size=batch_size) checkpoint_path_fold = checkpoint_path + date + "/fold" + str( i + 1) + "/cp-{epoch:04d}.ckpt" #cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_fold,save_weights_only=True,verbose=1) model = define_model(nchan, L, Fs) #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210311-183357/fold1/cp-0042.ckpt") #Subj03 #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210312-095405/fold1/cp-0051.ckpt") #Subj01 #if pretrain : # model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/"+date+"/fold1/cp-0030.ckpt") #for transfer algorithm. 
#history_pretrain = model.evaluate(gen,steps=trainlen,verbose=2) #print(history_pretrain) #vals_pretrain.append(history_pretrain[1]) history = model.fit( gen, validation_data=genVal, steps_per_epoch=int(trainlen / batch_size) + 1, validation_steps=int(vallen / batch_size) + 1, epochs=epochs, #callbacks=[cp_callback], verbose=2) vals.append(history.history['val_accuracy']) del history del model tensorflow.keras.backend.clear_session() tensorflow.compat.v1.reset_default_graph() def limit_mem(): tensorflow.config.experimental.get_session().close() #limit_mem() #heatmap_mean = generate_gradCAM(model,spex,path) #plt.imshow(np.repeat(heatmap_mean,50,aixs=0)) #plt.show() #labels = np.zeros([vallen,3]) #for j in range(0,vallen): # (_,labels[j,:]) = next(genVal) #val_preds = np.argmax(model.predict(genVal,steps=vallen),axis=1) #confusion_matrix[i,:,:] = tensorflow.math.confusion_matrix(np.argmax(labels,axis=1),val_preds) #print(confusion_matrix[i,:,:]) #names2 = os.listdir(path2) #(gen2,_,trainlen2,_) = split_data(["A","B","C"],100,0,names2,path2,spex,batch_size=batch_size) #history2 = model.evaluate(gen2,steps=trainlen2) #print(history2) #pri nt(np.mean(confusion_matrix,axis=0)) #pri nt(vals_pretrain) #pri nt(np.mean(vals_pretrain)) #del fold_files #del fold_files_val #tracker.print_diff() return vals
def make_bottleneck_dump_subdir(src_dir, shape, ratio): """ Use names of subdirs as a id. And then calculate class_index from id. """ class_id_set = set() #bottleneck_data = dict() feature_vectors, labels, filenames = [], [], [] image_size = (shape[0], shape[1]) listdir = os.listdir(src_dir) # 1) findout number of classes for class_id in listdir: subdir = src_dir + '/' + class_id if not os.path.isdir(subdir): continue if len(os.listdir(subdir)) == 0: continue else: try: class_id_int = int(class_id) class_id_set.add(class_id_int) except: continue # 2) maps class_id to class_index id_list = list(class_id_set) id_list.sort() print('Number of classes in the sample: {0}'.format(len(id_list))) print('Min class id: {0}'.format(min(id_list))) print('Max class id: {0}'.format(max(id_list))) map_id_label = {class_id: index for index, class_id in enumerate(id_list)} map_label_id = {index: class_id for index, class_id in enumerate(id_list)} maps = {'id_label': map_id_label, 'label_id': map_label_id} num_classes = len(map_id_label) # 3) Calculate bottleneck in TF height, width, color = shape x = tf.placeholder(tf.float32, [None, height, width, 3], name='Placeholder-x') resized_input_tensor = tf.reshape(x, [-1, height, width, 3]) #module = hub.Module("https://tfhub.dev/google/imagenet/resnet_v2_152/classification/1") # num_features = 2048, height x width = 224 x 224 pixels assert height, width == hub.get_expected_image_size(module) bottleneck_tensor = module( resized_input_tensor) # Features with shape [batch_size, num_features] print('bottleneck_tensor:', bottleneck_tensor) with tf.Session() as sess: # Connect to the TF runtime. init = tf.global_variables_initializer() sess.run(init) # Randomly initialize weights. for class_id in class_id_set: subdir = src_dir + '/' + str(class_id) print(subdir) files = os.listdir(subdir) num_files = len(files) for index_file, filename in enumerate(files): base = os.path.splitext(filename)[0] ext = os.path.splitext(filename)[1] if not ext in {'.jpg', ".png"}: continue class_index = map_id_label[class_id] #print(class_index) label = [0] * num_classes label[class_index] = 1 #class_index_set.add(class_index) file_path = subdir + '/' + filename im = Image.open(file_path) im = im.resize(image_size, Image.ANTIALIAS) arr = np.array(im, dtype=np.float32) / 256 feature_vector = bottleneck_tensor.eval(feed_dict={x: [arr]}) feature_vectors.append(feature_vector) labels.append(label) filenames.append(filename) # or file_path im.close() print("dir={0}, class={1}: {2}/{3}: {4}".format( class_id, class_index, index_file, num_files, filename)) print('----') print('Number of classes: {0}'.format(num_classes)) print('Number of feature vectors: {0}'.format(len(feature_vectors))) data = { 'images': feature_vectors, 'labels': labels, 'filenames': filenames } # mix data if DO_MIX: print('start mix data') zip3 = list(zip(data['images'], data['labels'], data['filenames'])) random.shuffle(zip3) print('mix ok') data['images'] = [x[0] for x in zip3] data['labels'] = [x[1] for x in zip3] data['filenames'] = [x[2] for x in zip3] print('Split data') data = split_data.split_data(data, ratio=ratio) data['id_label'] = map_id_label data['label_id'] = map_label_id return data