Example #1
def train_new_model(time_data, labels, output_folder, itrs=1, classifier_name='resnet'):

    # Split data into train and test portions
    X_train, X_test, y_train, y_test = train_test_split(time_data.T, labels)

    # ----------------------train model----------------------

    for itr in range(itrs):
        output_directory = f'{output_folder}{classifier_name}_{itr}/'

        logger.info(f'Method: {classifier_name}, iteration {itr}.')

        if os.path.exists(f'{output_directory}df_metrics.csv'):
            logger.info(f'{classifier_name} iteration {itr} already done.')
        else:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            fit_classifier(X_train, y_train, X_test, y_test, classifier_name, output_directory)

            logger.info('Training complete.')

            # evaluate best model on test data
            x_train, y_train, x_test, y_test = prepare_data(X_train, y_train, X_test, y_test)
            model = keras.models.load_model(output_directory + 'best_model.hdf5')
            y_pred = model.predict(x_test)
            y_pred = np.argmax(y_pred, axis=1)
            y_true = np.argmax(y_test, axis=1)
            model_metrics = calculate_metrics(y_true, y_pred, 0.0)
            logger.info(f'Iteration {itr}: df metrics')
            for measure, val in model_metrics.T.reset_index().values:
                logger.info(f'{measure}: {round(val, 2)}')
Example #2
    def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=True):
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test)
        if return_df_metrics:
            y_pred = np.argmax(y_pred, axis=1)
            df_metrics = calculate_metrics(y_true, y_pred, 0.0)
            return df_metrics
        else:
            return y_pred
Example #3
    def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=True):
        start_time = time.time()
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test)
        if return_df_metrics:
            y_pred = np.argmax(y_pred, axis=1)
            df_metrics = calculate_metrics(y_true, y_pred, 0.0)
            return df_metrics
        else:
            test_duration = time.time() - start_time
            save_test_duration(self.output_directory + 'test_duration.csv', test_duration)
            return y_pred
Example #4
    def train(self, x_train, y_train, x_test, y_test, y_true, pool_factor=None, filter_size=None, do_train=True):
        window_size = 0.2
        n_train_batch = 10
        n_epochs = 200
        max_train_batch_size = 256

        # print('Original train shape: ', x_train.shape)
        # print('Original test shape: ', x_test.shape)

        # split the training set into train and validation (validation size = 0.2 * train size)
        x_train, y_train, x_val, y_val = self.split_train(x_train, y_train)

        ori_len = x_train.shape[1]  # original length of the time series
        slice_ratio = 0.9

        if do_train:
            kernel_size = int(ori_len * filter_size)
        else:
            model = keras.models.load_model(self.output_directory + 'best_model.hdf5')

            # model.summary()

            pool_size = model.get_layer('max_pooling1d_1').get_config()['pool_size'][0]

            conv_shape = model.get_layer('conv1d_1').output_shape[1]

            pool_factor = self.get_pool_factor(conv_shape, pool_size)

        # restrict the slice ratio when the series length is too large
        if ori_len > 500:
            slice_ratio = slice_ratio if slice_ratio > 0.98 else 0.98
        elif ori_len < 16:
            slice_ratio = 0.7

        increase_num = ori_len - int(ori_len * slice_ratio) + 1  # this can be used as the batch size

        train_batch_size = int(x_train.shape[0] * increase_num / n_train_batch)
        if train_batch_size > max_train_batch_size:
            # limit the train_batch_size
            n_train_batch = int(x_train.shape[0] * increase_num / max_train_batch_size)

        # data augmentation by slicing the length of the series
        x_train, y_train = self.slice_data(x_train, y_train, slice_ratio)
        x_val, y_val = self.slice_data(x_val, y_val, slice_ratio)
        x_test, y_test = self.slice_data(x_test, y_test, slice_ratio)

        train_set_x, train_set_y = x_train, y_train
        valid_set_x, valid_set_y = x_val, y_val
        test_set_x, _ = x_test, y_test

        valid_num = valid_set_x.shape[0]
        
        # print("increase factor is ", increase_num, ', ori len', ori_len)
        valid_num_batch = int(valid_num / increase_num)

        test_num = test_set_x.shape[0]
        test_num_batch = int(test_num / increase_num)

        length_train = train_set_x.shape[1]  # length after slicing
        
        window_size = int(length_train * window_size) if window_size < 1 else int(window_size)

        # ******* set up the moving average (ma) and downsampling (ds) branches ******* #
        ma_base, ma_step, ma_num = 5, 6, 1
        ds_base, ds_step, ds_num = 2, 1, 4

        ds_num_max = length_train / (pool_factor * window_size)
        ds_num = int(min(ds_num, ds_num_max))
        # ******* end of ma and ds set-up ******* #

        (ma_train, ma_valid, ma_test, ma_lengths) = self.batch_movingavrg(train_set_x,
                                                                          valid_set_x, test_set_x,
                                                                          ma_base, ma_step, ma_num)
        (ds_train, ds_valid, ds_test, ds_lengths) = self.batch_downsample(train_set_x,
                                                                          valid_set_x, test_set_x,
                                                                          ds_base, ds_step, ds_num)

        # concatenate the branches directly along the length axis
        data_lengths = [length_train]
        # downsampled part
        if ds_lengths != []:
            data_lengths += ds_lengths
            train_set_x = np.concatenate([train_set_x, ds_train], axis=1)
            valid_set_x = np.concatenate([valid_set_x, ds_valid], axis=1)
            test_set_x = np.concatenate([test_set_x, ds_test], axis=1)

        # moving average part
        if ma_lengths != []:
            data_lengths += ma_lengths
            train_set_x = np.concatenate([train_set_x, ma_train], axis=1)
            valid_set_x = np.concatenate([valid_set_x, ma_valid], axis=1)
            test_set_x = np.concatenate([test_set_x, ma_test], axis=1)
        # print("Data length:", data_lengths)

        n_train_size = train_set_x.shape[0]
        n_valid_size = valid_set_x.shape[0]
        n_test_size = test_set_x.shape[0]
        batch_size = int(n_train_size / n_train_batch)
        n_train_batches = int(n_train_size / batch_size)
        data_dim = train_set_x.shape[1]  
        num_dim = train_set_x.shape[2] # For MTS 
        nb_classes = train_set_y.shape[1] 

        # print('train size', n_train_size, ',valid size', n_valid_size, ' test size', n_test_size)
        # print('batch size ', batch_size)
        # print('n_train_batches is ', n_train_batches)
        # print('data dim is ', data_dim)
        # print('---------------------------')

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        # print('building the model...')

        input_shapes, max_length = self.get_list_of_input_shapes(data_lengths,num_dim)

        start_time = time.time()

        best_validation_loss = np.inf

        if do_train:

            model = self.build_model(input_shapes, nb_classes, pool_factor, kernel_size)

            if self.verbose:
                model.summary()

            # print('Training')


            # early-stopping parameters
            patience = 10000  # look at this many examples regardless
            patience_increase = 2  # wait this much longer when a new best is found
            improvement_threshold = 0.995  # a relative improvement of this much is considered significant
            # go through this many minibatches before checking the network on the
            # validation set; in this case we check every epoch
            validation_frequency = min(n_train_batches, patience // 2)
            max_before_stopping = 500

            best_iter = 0
            valid_loss = 0.

            epoch = 0
            done_looping = False
            num_no_update_epoch = 0
            epoch_avg_cost = float('inf')
            epoch_avg_err = float('inf')

            while (epoch < n_epochs) and (not done_looping):
                epoch = epoch + 1
                epoch_train_err = 0.
                epoch_cost = 0.

                num_no_update_epoch += 1
                if num_no_update_epoch == max_before_stopping:
                    break


                for minibatch_index in range(n_train_batches):

                    iteration = (epoch - 1) * n_train_batches + minibatch_index

                    x = train_set_x[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                    y = train_set_y[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]

                    x = self.split_input_for_model(x, input_shapes)

                    cost_ij, accuracy = model.train_on_batch(x, y)

                    train_err = 1 - accuracy

                    epoch_train_err = epoch_train_err + train_err
                    epoch_cost = epoch_cost + cost_ij


                    if (iteration + 1) % validation_frequency == 0:

                        valid_losses = []
                        for i in range(valid_num_batch):
                            x = valid_set_x[i * increase_num: (i + 1) * increase_num]
                            y_pred = model.predict_on_batch(self.split_input_for_model(x, input_shapes))

                            # convert the predictions from one-hot vectors to integer labels
                            y_pred = np.argmax(y_pred, axis=1)
                            label = np.argmax(valid_set_y[i * increase_num])

                            unique_value, sub_ind, correspond_ind, count = np.unique(
                                y_pred, return_index=True, return_inverse=True, return_counts=True)
                            unique_value = unique_value.tolist()

                            curr_err = 1.
                            if label in unique_value:
                                target_ind = unique_value.index(label)
                                count = count.tolist()
                                sorted_count = sorted(count)
                                if count[target_ind] == sorted_count[-1]:
                                    if len(sorted_count) > 1 and sorted_count[-1] == sorted_count[-2]:
                                        curr_err = 0.5 #tie
                                    else:
                                        curr_err = 0
                            valid_losses.append(curr_err)
                        valid_loss = sum(valid_losses) / float(len(valid_losses))

                        # print('...epoch%i,valid err: %.5f |' % (epoch,valid_loss))

                        # if we got the best validation score until now
                        if valid_loss <= best_validation_loss:
                            num_no_update_epoch = 0

                            # improve patience if the loss improvement is good enough
                            if valid_loss < best_validation_loss * improvement_threshold:
                                patience = max(patience, iteration * patience_increase)

                            # save best validation score and iteration number
                            best_validation_loss = valid_loss
                            best_iter = iteration

                            # save model in h5 format
                            model.save(self.output_directory + 'best_model.hdf5')

                        model.save(self.output_directory + 'last_model.hdf5')
                    if patience <= iteration:
                        done_looping = True
                        break
                epoch_avg_cost = epoch_cost / n_train_batches
                epoch_avg_err = epoch_train_err / n_train_batches

                # print ('train err %.5f, cost %.4f' %(epoch_avg_err,epoch_avg_cost))
                if epoch_avg_cost == 0:
                    break

            # print('Optimization complete.')

        # test the model
        # print('Testing')
        # load best model
        model = keras.models.load_model(self.output_directory + 'best_model.hdf5')

        # get the final predictions of the test set (majority vote over the augmented copies)
        y_predicted = []
        for i in range(test_num_batch):
            x = test_set_x[i * increase_num: (i + 1) * increase_num]
            y_pred = model.predict_on_batch(self.split_input_for_model(x, input_shapes))

            # convert the predictions from one-hot vectors to integer labels
            y_pred = np.argmax(y_pred, axis=1)

            unique_value, sub_ind, correspond_ind, count = np.unique(
                y_pred, return_index=True, return_inverse=True, return_counts=True)

            idx_max = np.argmax(count)
            predicted_label = unique_value[idx_max]

            y_predicted.append(predicted_label)

        y_pred = np.array(y_predicted)

        duration = time.time() - start_time

        df_metrics = calculate_metrics(y_true, y_pred, duration)

        # print(y_true.shape)
        # print(y_pred.shape)

        df_metrics.to_csv(self.output_directory + 'df_metrics.csv', index=False)

        return df_metrics, model, best_validation_loss
Example #5
    def train(self):
        start_time = time.time()

        ################
        ### Training ###
        ################

        # init the matrices
        self.init_matrices()
        # compute the state matrix, which is the new feature space
        state_matrix = self.compute_state_matrix(self.x_train)
        # concatenate the input with the reservoir states to form the new
        # feature space that is fed to the classifier
        new_x_train = np.concatenate((self.x_train, state_matrix),
                                     axis=2).reshape(self.N * self.T,
                                                     self.num_dim + self.N_x)
        # memory free
        state_matrix = None
        gc.collect()
        # transform the corresponding labels
        new_labels = np.repeat(self.y_train, self.T, axis=0)
        # new model
        ridge_classifier = Ridge(alpha=self.lamda)
        # fit the new feature space
        ridge_classifier.fit(new_x_train, new_labels)

        ################
        ## Validation ##
        ################
        # compute the state matrix for the validation set
        state_matrix = self.compute_state_matrix(self.x_val)
        # concatenate the input with the reservoir states to form the new
        # feature space that is fed to the classifier
        new_x_val = np.concatenate(
            (self.x_val, state_matrix),
            axis=2).reshape(self.x_val.shape[0] * self.T,
                            self.num_dim + self.N_x)
        # get the predictions on the validation set
        y_pred_val = ridge_classifier.predict(new_x_val)
        # reconstruct the validation predictions
        y_pred_val = self.reshape_prediction(y_pred_val, self.x_val.shape[0],
                                             self.T)
        # get the metrics for the validation set
        df_val_metrics = calculate_metrics(np.argmax(self.y_val, axis=1),
                                           y_pred_val, 0.0)
        # note: this "train" accuracy is computed on the validation split
        train_acc = df_val_metrics['accuracy'][0]

        ###############
        ### Testing ###
        ###############

        # get the predictions on the test set
        # transform the test set to the new feature space
        state_matrix = self.compute_state_matrix(self.x_test)
        # concatenate the input with the reservoir states to form the new feature space fed to the classifier
        new_x_test = np.concatenate(
            (self.x_test, state_matrix),
            axis=2).reshape(self.x_test.shape[0] * self.T,
                            self.num_dim + self.N_x)
        # memory free
        state_matrix = None
        gc.collect()
        # get the prediction on the test set
        y_pred = ridge_classifier.predict(new_x_test)
        # reconstruct the test predictions
        y_pred = self.reshape_prediction(y_pred, self.x_test.shape[0], self.T)

        duration = time.time() - start_time
        # get the metrics for the test predictions
        df_metrics = calculate_metrics(self.y_true, y_pred, duration)

        # get the output layer weights
        self.W_out = ridge_classifier.coef_
        ridge_classifier = None
        gc.collect()
        # save the model
        np.savetxt(self.output_directory + 'W_in.txt', self.W_in)
        np.savetxt(self.output_directory + 'W.txt', self.W)
        np.savetxt(self.output_directory + 'W_out.txt', self.W_out)

        # save the metrics
        df_metrics.to_csv(self.output_directory + 'df_metrics.csv',
                          index=False)

        # return the training accuracy and the prediction metrics on the test set
        return df_metrics, train_acc
Example #6
    def predict(self, x_test, y_true, x_train, y_train, y_test):
        batch_size = 256

        # limit the number of augmented time series if series too long or too many
        if x_train.shape[1] > 500 or x_train.shape[0] > 2000 or x_test.shape[0] > 2000:
            self.warping_ratios = [1]
            self.slice_ratio = 0.9
        # increase the slice if series too short
        if x_train.shape[1] * self.slice_ratio < 8:
            self.slice_ratio = 8 / x_train.shape[1]

        new_x_train, new_y_train, new_x_test, new_y_test, tot_increase_num = \
            self.pre_processing(x_train, y_train, x_test, y_test)

        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)

        y_pred = model.predict(new_x_test, batch_size=batch_size)
        # convert the predictions from one-hot vectors to integer labels
        y_pred = np.argmax(y_pred, axis=1)

        # get the final predictions of the test set (majority vote over the augmented copies of each series)
        y_predicted = []
        test_num_batch = int(new_x_test.shape[0] / tot_increase_num)
        for i in range(test_num_batch):
            unique_value, sub_ind, correspond_ind, count = np.unique(
                y_pred[i * tot_increase_num: (i + 1) * tot_increase_num],
                return_index=True, return_inverse=True, return_counts=True)

            idx_max = np.argmax(count)
            predicted_label = unique_value[idx_max]

            y_predicted.append(predicted_label)

        y_pred = np.array(y_predicted)

        df_metrics = calculate_metrics(y_true, y_pred, 0.0)
        return df_metrics