Example #1
def validation_sparse_matrix(filename='seq_valid_data',
                             ext='bz2',
                             max_items=None):
    """
    Load the test data frame and make vectors grouped by users.
    :param filename: path to look whether the sequences data was already loaded
    :param ext: extension indicates compression type
    :param max_items: number of test items to retrieve
    :return: original dataframe and sparse matrix w/ shape (n_groups of 1st column, dimension of 2nd column)
    """
    score_dataframe = pd.read_csv(scoreSet, nrows=max_items)

    fp = filename + '.' + ext
    data_file = os.path.join(data_dir, fp)

    try:
        matrix = load(data_file)  # reuse the cached matrix if present
    except Exception:  # cache miss: rebuild the matrix from the dataframe
        user_grp = score_dataframe.groupby('new_user', sort=False)
        rows, cols, data = [], [], []
        for i, (user, item_lst) in enumerate(user_grp):
            n = len(item_lst)
            data.extend([preprocess(0)] * n)
            cols.extend(item_lst.new_item.values)
            rows.extend([i] * n)  # one matrix row per user group

        matrix = csr_matrix((np.array(data), (np.array(rows), np.array(cols))),
                            shape=(len(user_grp), R_SHAPE[1]),
                            dtype=np.float32)
        dump(matrix, data_file)
    return score_dataframe, matrix
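
The core pattern here is building a CSR matrix from COO-style (data, rows, cols) triplets collected in one pass over a groupby. A minimal self-contained sketch of the same pattern, with a toy dataframe standing in for the project's scoreSet, preprocess, and R_SHAPE globals (all names below the comment line are assumptions, not the project's data):

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# toy stand-in for the score dataframe (hypothetical values)
df = pd.DataFrame({'new_user': [10, 10, 42, 42, 42],
                   'new_item': [0, 3, 1, 2, 3]})

rows, cols, data = [], [], []
for i, (user, item_lst) in enumerate(df.groupby('new_user', sort=False)):
    n = len(item_lst)
    data.extend([1.0] * n)             # stand-in for preprocess(0)
    cols.extend(item_lst.new_item.values)
    rows.extend([i] * n)               # one matrix row per user group

n_items = 4                            # stand-in for R_SHAPE[1]
matrix = csr_matrix((np.array(data), (np.array(rows), np.array(cols))),
                    shape=(df['new_user'].nunique(), n_items),
                    dtype=np.float32)
print(matrix.toarray())                # [[1 0 0 1], [0 1 1 1]]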
Example #2
def data_augmentation(x, y, T):
    """Split training set into smaller time windows
    Arguments: (x,y) total data set (3D arrays)
    Return: training set (batches of time steps size 2 times less than validation and test set)
            validation set (full time steps length)
            test set (full time steps length) """

    x_tr, x_te, y_tr, y_te = train_test_split(preprocess(x),
                                              y,
                                              test_size=0.2,
                                              random_state=200)
    x_tr, x_val, y_tr, y_val = train_test_split(x_tr,
                                                y_tr,
                                                test_size=0.2,
                                                random_state=200)
    # window length (time steps per window) and number of training sequences
    n = x_tr.shape[1] // T
    bs = x_tr.shape[0]
    batch_x = np.zeros((bs * T, n, x_tr.shape[2]))
    batch_y = np.zeros((bs * T, n, y_tr.shape[2]))

    # build batches: window i of every sequence fills batch slice i
    for i in range(T):
        batch_x[i * bs:bs * (i + 1), :, :] = x_tr[:, i * n:n * (i + 1), :]
        batch_y[i * bs:bs * (i + 1), :, :] = y_tr[:, i * n:n * (i + 1), :]
    """batch_x[bs:bs*2,:,:] = x_tr[:,n:n*2,:]
    batch_y[bs:bs*2,:,:] = y_tr[:,n:n*2,:]
    batch_x[bs*2:,:,:] = x_tr[:,n*2:,:]
    batch_y[bs*2:,:,:] = y_tr[:,n*2:,:]"""

    return [batch_x, batch_y], [x_te, y_te], [x_val, y_val]
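
The reshaping is easiest to see on synthetic arrays. A toy run of the same windowing logic, with made-up shapes in place of the project's data (no project dependencies assumed):

import numpy as np

# synthetic stand-ins: 6 sequences of 100 time steps, 4 features / 2 targets
x_tr = np.random.rand(6, 100, 4)
y_tr = np.random.rand(6, 100, 2)
T = 2                                  # number of time windows

n = x_tr.shape[1] // T                 # steps per window (50)
bs = x_tr.shape[0]                     # sequences per window (6)
batch_x = np.zeros((bs * T, n, x_tr.shape[2]))
batch_y = np.zeros((bs * T, n, y_tr.shape[2]))

# window i of every sequence becomes batch slice i
for i in range(T):
    batch_x[i * bs:bs * (i + 1)] = x_tr[:, i * n:n * (i + 1), :]
    batch_y[i * bs:bs * (i + 1)] = y_tr[:, i * n:n * (i + 1), :]

print(batch_x.shape, batch_y.shape)    # (12, 50, 4) (12, 50, 2)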
Example #3
def data_augmentation_2(x, y):
    """Create a larger data set by randomly perturbing positions with Gaussian noise."""
    train, test, val = train_te_val_split(preprocess(x), y)
    x_tr_1 = train[0]
    offset(x_tr_1)  # perturb the preprocessed training positions in place
    train, test, val = train_te_val_split(x, y)
    # stack the noisy copy under the clean training set; the split must
    # shuffle identically both times so the duplicated labels line up
    X_tr = np.vstack((train[0], x_tr_1))
    Y_tr = np.vstack((train[1], train[1]))
    return [X_tr, Y_tr], test, val
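
offset is not shown in this listing; it is only called for its in-place side effect. A plausible implementation, purely an assumption about what the helper does (name, signature, and sigma default are all hypothetical):

import numpy as np

def offset(x, sigma=0.01):
    # HYPOTHETICAL: the project's offset() is not shown; this sketch just
    # adds zero-mean Gaussian noise to the positions in place
    x += np.random.normal(loc=0.0, scale=sigma, size=x.shape)

x = np.ones((2, 3, 4))
offset(x)
print(x.mean())  # still close to 1.0, now with small jitter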
Example #4
def run():
    print('The default argument values are:\n {}'.format(args))
    # Initialize the model
    model = BRAF(args.S, args.p, args.k, args.weights, args.model_name)
    # Read the data
    data = preprocess(file_name=args.csv_file_dir)
    # Split the data into majority/minority sets and build the critical
    # area Tc, following the paper
    Tmaj, Tmin = Split_Maj_Min(data=data)
    Tc = critical_area(Tmaj, Tmin, args.k)
    # Run the cross-validation and save the results
    Kfold_cross_validation(args.k_fold, [data, Tc],
                           model,
                           name=args.model_name)
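
run() assumes a module-level args object. A hypothetical argparse setup consistent with the attributes used above; only the attribute names come from the function, every default value here is an assumption:

import argparse

# HYPOTHETICAL defaults: attribute names taken from run() above, values guessed
parser = argparse.ArgumentParser(description='BRAF training')
parser.add_argument('--S', type=int, default=100, help='forest size')
parser.add_argument('--p', type=float, default=0.5, help='sub-forest size ratio')
parser.add_argument('--k', type=int, default=10, help='k for the critical area')
parser.add_argument('--weights', type=str, default=None)
parser.add_argument('--model_name', type=str, default='braf')
parser.add_argument('--csv_file_dir', type=str, default='data.csv')
parser.add_argument('--k_fold', type=int, default=10)
args = parser.parse_args()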
Example #5
File: Load.py Project: PhiCtl/Fireflies
def load_test_data(folder_name):
    """Load the test data for the final prediction.
     Arguments: folder_name -- folder containing the pose-tracking file and
                the annotation files; must be located in the data_fly folder
     Returns: preprocessed 3D X and 3D Y for prediction"""

    X = load_file_x(folder_name, scorer='DLC_resnet50_FlyMar16shuffle0_500000')
    # stack the three annotation files into a single label array
    Y = np.empty((0, 8))
    for ann in ['ann1.csv', 'ann2.csv', 'ann3.csv']:
        y = load_file_y(folder_name, ann)
        Y = np.vstack((Y, y))
    print(Y.shape, X.shape)

    # add a leading batch dimension so X and Y are 3D
    X = np.expand_dims(X, axis=0)
    Y = np.expand_dims(Y, axis=0)
    X = preprocess(X)
    return X, Y
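
The expand_dims calls are what turn the stacked 2D arrays into the 3D shapes the model expects. A quick shape check with dummy arrays (sizes are illustrative, not the project's real dimensions):

import numpy as np

X = np.random.rand(500, 24)            # dummy pose-tracking features
Y = np.zeros((500, 8))                 # dummy stacked annotations
X = np.expand_dims(X, axis=0)          # -> (1, 500, 24)
Y = np.expand_dims(Y, axis=0)          # -> (1, 500, 8)
print(X.shape, Y.shape)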