def train_new_model(time_data, labels, output_folder, itrs=1, classifier_name='resnet'):
    # Split data into train and test portions
    X_train, X_test, y_train, y_test = train_test_split(time_data.T, labels)

    # ---------------------- train model ----------------------
    for itr in range(itrs):
        output_directory = f'{output_folder}{classifier_name}_{itr}/'

        logger.info(f'Method: {classifier_name}, iteration {itr}.')

        if os.path.exists(f'{output_directory}df_metrics.csv'):
            logger.info(f'{classifier_name} iteration {itr} already done.')
        else:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)
            fit_classifier(X_train, y_train, X_test, y_test, classifier_name, output_directory)
            logger.info('Training complete.')

        # evaluate the best model on the test data; keep the one-hot encoded labels in
        # separate variables so the raw labels can be reused on the next iteration
        x_train, y_train_oh, x_test, y_test_oh = prepare_data(X_train, y_train, X_test, y_test)
        model = keras.models.load_model(output_directory + 'best_model.hdf5')
        y_pred = model.predict(x_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_true = np.argmax(y_test_oh, axis=1)
        model_metrics = calculate_metrics(y_true, y_pred, 0.0)

        logger.info(f'Iteration {itr}: df_metrics')
        for measure, val in model_metrics.T.reset_index().values:
            logger.info(f'{measure}: {round(val, 2)}')
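
# A minimal usage sketch for train_new_model, assuming time_data is shaped
# (n_timesteps, n_samples) -- it is transposed before the split -- and that
# labels holds one class label per sample. The array shapes, output folder and
# classifier name below are illustrative assumptions, not values taken from the
# function itself.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(0)
    demo_time_data = rng.normal(size=(128, 60))    # 60 series, each of length 128
    demo_labels = rng.integers(0, 2, size=60)      # one binary label per series

    train_new_model(demo_time_data, demo_labels, output_folder='results/',
                    itrs=1, classifier_name='resnet')
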
def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=True):
    model_path = self.output_directory + 'best_model.hdf5'
    model = keras.models.load_model(model_path)
    y_pred = model.predict(x_test)
    if return_df_metrics:
        y_pred = np.argmax(y_pred, axis=1)
        df_metrics = calculate_metrics(y_true, y_pred, 0.0)
        return df_metrics
    else:
        return y_pred
def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=True):
    start_time = time.time()
    model_path = self.output_directory + 'best_model.hdf5'
    model = keras.models.load_model(model_path)
    y_pred = model.predict(x_test)
    if return_df_metrics:
        # convert the probabilities to integer labels and compute the metrics
        y_pred = np.argmax(y_pred, axis=1)
        df_metrics = calculate_metrics(y_true, y_pred, 0.0)
        return df_metrics
    else:
        # return the raw probabilities and record how long prediction took
        test_duration = time.time() - start_time
        save_test_duration(self.output_directory + 'test_duration.csv', test_duration)
        return y_pred
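
# A small self-contained sketch of the post-processing both predict variants
# share: the model's per-class probabilities are collapsed to integer labels
# with argmax and then scored. calculate_metrics is the project's own helper;
# scikit-learn's accuracy_score is used here only as an illustrative stand-in,
# and softmax_to_labels is a hypothetical name.
import numpy as np
from sklearn.metrics import accuracy_score

def softmax_to_labels(probabilities):
    """Collapse per-class probabilities of shape (n_samples, n_classes) to integer labels."""
    return np.argmax(probabilities, axis=1)

example_probs = np.array([[0.1, 0.9],
                          [0.8, 0.2],
                          [0.3, 0.7]])
example_true = np.array([1, 0, 0])
print(accuracy_score(example_true, softmax_to_labels(example_probs)))  # 2 of 3 correct -> 0.666...
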
def train(self, x_train, y_train, x_test, y_test, y_true, pool_factor=None, filter_size=None, do_train=True):
    window_size = 0.2
    n_train_batch = 10
    n_epochs = 200
    max_train_batch_size = 256

    # split train into a validation set with validation_size = 0.2 * train_size
    x_train, y_train, x_val, y_val = self.split_train(x_train, y_train)

    ori_len = x_train.shape[1]  # original length of the time series

    slice_ratio = 0.9

    if do_train:
        kernel_size = int(ori_len * filter_size)
    else:
        # reuse the architecture of the previously trained model
        model = keras.models.load_model(self.output_directory + 'best_model.hdf5')
        pool_size = model.get_layer('max_pooling1d_1').get_config()['pool_size'][0]
        conv_shape = model.get_layer('conv1d_1').output_shape[1]
        pool_factor = self.get_pool_factor(conv_shape, pool_size)

    # for long series, raise the slice ratio to at least 0.98 to limit the number of slices;
    # for very short series, lower it so that slicing still augments the data
    if ori_len > 500:
        slice_ratio = slice_ratio if slice_ratio > 0.98 else 0.98
    elif ori_len < 16:
        slice_ratio = 0.7

    increase_num = ori_len - int(ori_len * slice_ratio) + 1  # number of slices per series; also used as the batch size

    train_batch_size = int(x_train.shape[0] * increase_num / n_train_batch)
    if train_batch_size > max_train_batch_size:
        # limit the train_batch_size
        n_train_batch = int(x_train.shape[0] * increase_num / max_train_batch_size)

    # data augmentation by slicing the length of the series
    x_train, y_train = self.slice_data(x_train, y_train, slice_ratio)
    x_val, y_val = self.slice_data(x_val, y_val, slice_ratio)
    x_test, y_test = self.slice_data(x_test, y_test, slice_ratio)

    train_set_x, train_set_y = x_train, y_train
    valid_set_x, valid_set_y = x_val, y_val
    test_set_x, _ = x_test, y_test

    valid_num = valid_set_x.shape[0]
    valid_num_batch = int(valid_num / increase_num)

    test_num = test_set_x.shape[0]
    test_num_batch = int(test_num / increase_num)

    length_train = train_set_x.shape[1]  # length after slicing
    window_size = int(length_train * window_size) if window_size < 1 else int(window_size)

    # ******* set up the moving average (ma) and downsample (ds) factors ******* #
    ma_base, ma_step, ma_num = 5, 6, 1
    ds_base, ds_step, ds_num = 2, 1, 4
    ds_num_max = length_train / (pool_factor * window_size)
    ds_num = int(min(ds_num, ds_num_max))

    ma_train, ma_valid, ma_test, ma_lengths = self.batch_movingavrg(train_set_x, valid_set_x, test_set_x,
                                                                    ma_base, ma_step, ma_num)
    ds_train, ds_valid, ds_test, ds_lengths = self.batch_downsample(train_set_x, valid_set_x, test_set_x,
                                                                    ds_base, ds_step, ds_num)

    # concatenate the transformed branches directly to the original series
    data_lengths = [length_train]
    # downsample part
    if ds_lengths != []:
        data_lengths += ds_lengths
        train_set_x = np.concatenate([train_set_x, ds_train], axis=1)
        valid_set_x = np.concatenate([valid_set_x, ds_valid], axis=1)
        test_set_x = np.concatenate([test_set_x, ds_test], axis=1)
    # moving average part
    if ma_lengths != []:
        data_lengths += ma_lengths
        train_set_x = np.concatenate([train_set_x, ma_train], axis=1)
        valid_set_x = np.concatenate([valid_set_x, ma_valid], axis=1)
        test_set_x = np.concatenate([test_set_x, ma_test], axis=1)

    n_train_size = train_set_x.shape[0]
    n_valid_size = valid_set_x.shape[0]
    n_test_size = test_set_x.shape[0]
    batch_size = int(n_train_size / n_train_batch)
    n_train_batches = int(n_train_size / batch_size)
    data_dim = train_set_x.shape[1]
    num_dim = train_set_x.shape[2]  # for MTS
    nb_classes = train_set_y.shape[1]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    input_shapes, max_length = self.get_list_of_input_shapes(data_lengths, num_dim)

    start_time = time.time()

    best_validation_loss = np.inf

    if do_train:
        model = self.build_model(input_shapes, nb_classes, pool_factor, kernel_size)
        if self.verbose:
            model.summary()

        # early-stopping parameters
        patience = 10000  # look at this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is found
        improvement_threshold = 0.995  # a relative improvement of this much is considered significant
        validation_frequency = min(n_train_batches, patience / 2)  # go through this many minibatches before
                                                                   # checking the network on the validation
                                                                   # set; in this case we check every epoch
        max_before_stopping = 500

        best_iter = 0
        valid_loss = 0.
        epoch = 0
        done_looping = False
        num_no_update_epoch = 0
        epoch_avg_cost = float('inf')
        epoch_avg_err = float('inf')

        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            epoch_train_err = 0.
            epoch_cost = 0.
            num_no_update_epoch += 1
            if num_no_update_epoch == max_before_stopping:
                break

            for minibatch_index in range(n_train_batches):
                iteration = (epoch - 1) * n_train_batches + minibatch_index
                x = train_set_x[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                y = train_set_y[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                x = self.split_input_for_model(x, input_shapes)

                cost_ij, accuracy = model.train_on_batch(x, y)
                train_err = 1 - accuracy

                epoch_train_err = epoch_train_err + train_err
                epoch_cost = epoch_cost + cost_ij

                if (iteration + 1) % validation_frequency == 0:
                    valid_losses = []
                    for i in range(valid_num_batch):
                        x = valid_set_x[i * increase_num: (i + 1) * increase_num]
                        y_pred = model.predict_on_batch(self.split_input_for_model(x, input_shapes))
                        # convert the predictions from one-hot to integer labels
                        y_pred = np.argmax(y_pred, axis=1)
                        label = np.argmax(valid_set_y[i * increase_num])

                        unique_value, sub_ind, correspond_ind, count = np.unique(y_pred, True, True, True)
                        unique_value = unique_value.tolist()

                        curr_err = 1.
                        if label in unique_value:
                            target_ind = unique_value.index(label)
                            count = count.tolist()
                            sorted_count = sorted(count)
                            if count[target_ind] == sorted_count[-1]:
                                if len(sorted_count) > 1 and sorted_count[-1] == sorted_count[-2]:
                                    curr_err = 0.5  # tie
                                else:
                                    curr_err = 0.
                        valid_losses.append(curr_err)
                    valid_loss = sum(valid_losses) / float(len(valid_losses))

                    # if we got the best validation score until now
                    if valid_loss <= best_validation_loss:
                        num_no_update_epoch = 0

                        # improve patience if the loss improvement is good enough
                        if valid_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iteration * patience_increase)

                        # save the best validation score and iteration number
                        best_validation_loss = valid_loss
                        best_iter = iteration

                        # save the model in hdf5 format
                        model.save(self.output_directory + 'best_model.hdf5')

                model.save(self.output_directory + 'last_model.hdf5')

                if patience <= iteration:
                    done_looping = True
                    break

            epoch_avg_cost = epoch_cost / n_train_batches
            epoch_avg_err = epoch_train_err / n_train_batches

            if epoch_avg_cost == 0:
                break

    # test the model: load the best model and aggregate the per-slice predictions
    model = keras.models.load_model(self.output_directory + 'best_model.hdf5')

    y_predicted = []
    for i in range(test_num_batch):
        x = test_set_x[i * increase_num: (i + 1) * increase_num]
        y_pred = model.predict_on_batch(self.split_input_for_model(x, input_shapes))
        # convert the predictions from one-hot to integer labels
        y_pred = np.argmax(y_pred, axis=1)

        # majority vote over the slices of this series
        unique_value, sub_ind, correspond_ind, count = np.unique(y_pred, True, True, True)
        idx_max = np.argmax(count)
        predicted_label = unique_value[idx_max]

        y_predicted.append(predicted_label)

    y_pred = np.array(y_predicted)

    duration = time.time() - start_time

    df_metrics = calculate_metrics(y_true, y_pred, duration)
    df_metrics.to_csv(self.output_directory + 'df_metrics.csv', index=False)

    return df_metrics, model, best_validation_loss
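
# A minimal, self-contained sketch of the majority vote used above for both
# validation and testing: each original series contributes increase_num
# augmented slices, and the most frequent predicted label among those slices
# becomes the series-level prediction. The helper name and the toy array are
# illustrative only.
import numpy as np

def majority_vote(slice_predictions):
    """Return the most frequent label among the slice predictions of one series."""
    unique_value, count = np.unique(slice_predictions, return_counts=True)
    return unique_value[np.argmax(count)]

demo_slice_preds = np.array([2, 2, 1, 2, 0])  # predicted labels for 5 slices of one series
print(majority_vote(demo_slice_preds))        # 2
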
def train(self):
    start_time = time.time()

    ################
    ### Training ###
    ################
    # init the reservoir matrices
    self.init_matrices()

    # compute the state matrix, which is the new feature space
    state_matrix = self.compute_state_matrix(self.x_train)

    # concatenate the input with the states to form the new feature space that is fed to the classifier
    new_x_train = np.concatenate((self.x_train, state_matrix), axis=2).reshape(
        self.N * self.T, self.num_dim + self.N_x)

    # free memory
    state_matrix = None
    gc.collect()

    # transform the corresponding labels
    new_labels = np.repeat(self.y_train, self.T, axis=0)

    # new model
    ridge_classifier = Ridge(alpha=self.lamda)

    # fit the new feature space
    ridge_classifier.fit(new_x_train, new_labels)

    ################
    ## Validation ##
    ################
    # compute the state matrix for the validation set
    state_matrix = self.compute_state_matrix(self.x_val)

    # concatenate the input with the states to form the new feature space
    new_x_val = np.concatenate((self.x_val, state_matrix), axis=2).reshape(
        self.x_val.shape[0] * self.T, self.num_dim + self.N_x)

    # get the prediction on the validation set
    y_pred_val = ridge_classifier.predict(new_x_val)

    # reconstruct the validation predictions (one label per series)
    y_pred_val = self.reshape_prediction(y_pred_val, self.x_val.shape[0], self.T)

    # get the metrics for the validation set
    df_val_metrics = calculate_metrics(np.argmax(self.y_val, axis=1), y_pred_val, 0.0)

    # get the validation accuracy
    train_acc = df_val_metrics['accuracy'][0]

    ###############
    ### Testing ###
    ###############
    # transform the test set to the new feature space
    state_matrix = self.compute_state_matrix(self.x_test)

    new_x_test = np.concatenate((self.x_test, state_matrix), axis=2).reshape(
        self.x_test.shape[0] * self.T, self.num_dim + self.N_x)

    # free memory
    state_matrix = None
    gc.collect()

    # get the prediction on the test set
    y_pred = ridge_classifier.predict(new_x_test)

    # reconstruct the test predictions
    y_pred = self.reshape_prediction(y_pred, self.x_test.shape[0], self.T)

    duration = time.time() - start_time

    # get the metrics for the test predictions
    df_metrics = calculate_metrics(self.y_true, y_pred, duration)

    # get the output layer weights
    self.W_out = ridge_classifier.coef_
    ridge_classifier = None
    gc.collect()

    # save the model weights
    np.savetxt(self.output_directory + 'W_in.txt', self.W_in)
    np.savetxt(self.output_directory + 'W.txt', self.W)
    np.savetxt(self.output_directory + 'W_out.txt', self.W_out)

    # save the metrics
    df_metrics.to_csv(self.output_directory + 'df_metrics.csv', index=False)

    # return the validation accuracy and the prediction metrics on the test set
    return df_metrics, train_acc
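
# A hedged sketch of reusing the readout weights saved above. It presumes
# train() has already written W_out.txt to the working directory, and that a
# feature matrix has been built exactly as in train(): the raw input
# concatenated with the reservoir states, shape (n_samples * T, num_dim + N_x).
# Recomputing those states would also need W_in.txt and W.txt; new_features
# below is only a placeholder.
import numpy as np

W_out = np.loadtxt('W_out.txt')               # ridge_classifier.coef_, shape (n_classes, num_dim + N_x)
new_features = np.zeros((5, W_out.shape[1]))  # placeholder feature matrix

scores = new_features @ W_out.T               # per-timestep class scores (the Ridge intercept is not saved, so it is omitted)
y_pred = np.argmax(scores, axis=1)            # one label per timestep, to be aggregated per series
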
def predict(self, x_test, y_true, x_train, y_train, y_test):
    batch_size = 256

    # limit the number of augmented time series if the series are too long or too many
    if x_train.shape[1] > 500 or x_train.shape[0] > 2000 or x_test.shape[0] > 2000:
        self.warping_ratios = [1]
        self.slice_ratio = 0.9
    # increase the slice ratio if the series are too short
    if x_train.shape[1] * self.slice_ratio < 8:
        self.slice_ratio = 8 / x_train.shape[1]

    new_x_train, new_y_train, new_x_test, new_y_test, tot_increase_num = \
        self.pre_processing(x_train, y_train, x_test, y_test)

    model_path = self.output_directory + 'best_model.hdf5'
    model = keras.models.load_model(model_path)

    y_pred = model.predict(new_x_test, batch_size=batch_size)
    # convert the predictions from one-hot to integer labels
    y_pred = np.argmax(y_pred, axis=1)

    # get the true predictions of the test set by majority vote over the
    # augmented crops that belong to each original series
    y_predicted = []
    test_num_batch = int(new_x_test.shape[0] / tot_increase_num)
    for i in range(test_num_batch):
        unique_value, sub_ind, correspond_ind, count = np.unique(
            y_pred[i * tot_increase_num: (i + 1) * tot_increase_num], True, True, True)

        idx_max = np.argmax(count)
        predicted_label = unique_value[idx_max]

        y_predicted.append(predicted_label)

    y_pred = np.array(y_predicted)

    df_metrics = calculate_metrics(y_true, y_pred, 0.0)
    return df_metrics