def train_test_split_matrix(self, split=0.2, use_saved=True):
    """
    Populate ``self.train`` and ``self.test``.

    If use_saved is true, the pickled train and test objects are loaded
    from the locations reported by ``self.config``. Otherwise the columns
    of ``self.matrix`` are shuffled and two same-shaped copies are built:
    the test copy keeps the first ``split`` fraction of the shuffled
    columns (all other columns set to 0) and the train copy keeps the
    remaining columns (the test columns set to 0). Both copies are then
    pickled to the configured train/test locations.

    :param split: the test proportion of the data; has to be a float
        between 0 and 1
    :param use_saved: if true, we use the local pickled copies of the data
    :raises ValueError: if a fresh split is requested and ``split`` is not
        within [0, 1]
    """
    if use_saved:
        self.train = Util.load_pickle_object(
            self.config.get_train_data_loc())
        self.test = Util.load_pickle_object(
            self.config.get_test_data_loc())
        return

    # ``split`` is only consumed on this path, so it is only validated here.
    if not 0 <= split <= 1:
        raise ValueError(
            "split must be between 0 and 1, got {!r}".format(split))

    # Shuffle the column order so the split does not depend on the
    # original column ordering of the matrix.
    shuffled_columns = self.matrix.columns.tolist()
    shuffle(shuffled_columns)
    # Selecting by a list of labels builds a new frame; self.matrix itself
    # is left untouched.
    shuffled_matrix = self.matrix[shuffled_columns]

    # Number of (shuffled) columns reserved for the test set.
    split_point = int(shuffled_matrix.shape[1] * split)

    # Train keeps everything except the test columns ...
    train_matrix = shuffled_matrix.copy(deep=True)
    train_matrix.iloc[:, :split_point] = 0

    # ... and test keeps only the test columns, so that ``split`` really
    # is the test proportion.
    test_matrix = shuffled_matrix.copy(deep=True)
    test_matrix.iloc[:, split_point:] = 0

    Util.pickle_object(self.config.get_train_data_loc(), train_matrix)
    Util.pickle_object(self.config.get_test_data_loc(), test_matrix)

    self.train = train_matrix
    self.test = test_matrix
def save_matrix(self):
    """
    Hand ``self.matrix`` to ``Util.pickle_object`` together with the
    matrix location reported by ``self.config``.
    """
    matrix_location = self.config.get_matrix_loc()
    Util.pickle_object(matrix_location, self.matrix)
def save_data_dict(self, data_dict):
    """
    Hand ``data_dict`` to ``Util.pickle_object`` together with the data
    dict location reported by ``self.config``.

    :param data_dict: the object to be stored
    """
    data_dict_location = self.config.get_data_dict_loc()
    Util.pickle_object(data_dict_location, data_dict)