def validation_sparse_matrix(filename='seq_valid_data', ext='bz2', max_items=None):
    """
    Load the test data frame and build vectors grouped by user.

    :param filename: path to check whether the sequence data was already loaded
    :param ext: extension indicating the compression type
    :param max_items: number of test items to retrieve
    :return: original dataframe and a sparse matrix of shape
             (number of groups of the 1st column, dimension of the 2nd column)
    """
    score_dataframe = pd.read_csv(scoreSet, nrows=max_items)
    fp = filename + '.' + ext
    data_file = os.path.join(data_dir, fp)
    try:
        # Reuse the cached matrix if it was already built and dumped.
        matrix = load(data_file)
    except Exception:
        # Otherwise build it from scratch: one row per user, one column per item.
        user_grp = score_dataframe.groupby('new_user', sort=False)
        rows, cols, data = [], [], []
        for i, (user, item_lst) in enumerate(user_grp):
            n = len(item_lst)
            data.extend([preprocess(0)] * n)
            cols.extend(item_lst.new_item.values)
            rows.extend([i] * n)
        matrix = csr_matrix((np.array(data), (np.array(rows), np.array(cols))),
                            shape=(len(user_grp), R_SHAPE[1]), dtype=np.float32)
        dump(matrix, data_file)
    return score_dataframe, matrix
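# A minimal standalone sketch (toy values only) of how the (data, (rows, cols))
# triplets built above turn into a CSR matrix; the row/column indices and scores
# below are made up for illustration, not taken from the real score data.
import numpy as np
from scipy.sparse import csr_matrix

toy_rows = [0, 0, 1, 2]          # one entry per (user, item) interaction
toy_cols = [3, 1, 0, 2]          # item indices inside each user's group
toy_data = [1.0, 1.0, 1.0, 1.0]  # placeholder scores (preprocess(0) above)
toy_matrix = csr_matrix((np.array(toy_data), (np.array(toy_rows), np.array(toy_cols))),
                        shape=(3, 5), dtype=np.float32)
print(toy_matrix.toarray())      # 3 users x 5 items, zeros everywhere else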
def data_augmentation(x, y, T):
    """Split the training set into T smaller time windows.

    Arguments: (x, y) total data set (3D arrays), T number of windows per sequence
    Return: training set (batches with time-step length T times shorter than the
            validation and test sets),
            test set (full time-step length),
            validation set (full time-step length)
    """
    x_tr, x_te, y_tr, y_te = train_test_split(preprocess(x), y, test_size=0.2, random_state=200)
    x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=200)
    # Build indices: n time steps per window, bs sequences per window.
    n = x_tr.shape[1] // T
    bs = x_tr.shape[0]
    batch_x = np.zeros((bs * T, n, x_tr.shape[2]))
    batch_y = np.zeros((bs * T, n, y_tr.shape[2]))
    # Build batches: window i of every sequence is stacked along the batch axis.
    for i in range(T):
        batch_x[i * bs:bs * (i + 1), :, :] = x_tr[:, i * n:n * (i + 1), :]
        batch_y[i * bs:bs * (i + 1), :, :] = y_tr[:, i * n:n * (i + 1), :]
    return [batch_x, batch_y], [x_te, y_te], [x_val, y_val]
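# A minimal standalone sketch of the windowing step in data_augmentation, on a
# toy array; the shapes (4 sequences, 12 time steps, 2 features) are illustrative
# assumptions, not the project's real data.
import numpy as np

toy_x = np.arange(4 * 12 * 2, dtype=np.float32).reshape(4, 12, 2)
T = 3                        # number of windows per sequence
n = toy_x.shape[1] // T      # window length: 12 // 3 = 4 time steps
bs = toy_x.shape[0]          # original batch size
toy_batches = np.zeros((bs * T, n, toy_x.shape[2]), dtype=toy_x.dtype)
for i in range(T):
    # window i of every sequence fills rows [i*bs, (i+1)*bs)
    toy_batches[i * bs:(i + 1) * bs] = toy_x[:, i * n:(i + 1) * n, :]
print(toy_batches.shape)     # (12, 4, 2): 3x more batches, 3x shorter sequences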
def data_augmentation_2(x, y):
    """Create a larger training set by randomly perturbing positions with Gaussian noise."""
    train, test, val = train_te_val_split(preprocess(x), y)
    x_tr_1 = train[0]
    offset(x_tr_1)  # perturb the training positions in place with Gaussian noise
    # Re-split to recover an unperturbed copy of the training set.
    train, test, val = train_te_val_split(x, y)
    X_tr = np.vstack((train[0], x_tr_1))    # original + noisy samples
    Y_tr = np.vstack((train[1], train[1]))  # labels are duplicated accordingly
    return [X_tr, Y_tr], test, val
def run():
    print('The default values are \n {}'.format(args))
    # Initialize the model
    weights = args.weights
    model = BRAF(args.S, args.p, args.k, weights, args.model_name)
    # Read the data
    data = preprocess(file_name=args.csv_file_dir)
    # Build Tmaj/Tmin and the critical area Tc as described in the paper
    Tmaj, Tmin = Split_Maj_Min(data=data)
    Tc = critical_area(Tmaj, Tmin, args.k)
    # Run the cross-validation and save the results
    Kfold_cross_validation(args.k_fold, [data, Tc], model, name=args.model_name)
def load_test_data(folder_name):
    """Load test data for the final prediction.

    Arguments: folder_name, a directory inside the data_fly folder containing the
               pose tracking file and the annotation files
    Returns: 3D preprocessed X and 3D Y for prediction"""
    X = load_file_x(folder_name, scorer='DLC_resnet50_FlyMar16shuffle0_500000')
    Y = np.empty((0, 8))
    # Stack the three annotation files along the time axis.
    for ann in ['ann1.csv', 'ann2.csv', 'ann3.csv']:
        y = load_file_y(folder_name, ann)
        Y = np.vstack((Y, y))
    print(Y.shape, X.shape)
    # Add a batch dimension so both arrays are 3D.
    X = np.expand_dims(X, axis=0)
    Y = np.expand_dims(Y, axis=0)
    X = preprocess(X)
    return X, Y