Example #1
def generate_fold(data, NUM_FOLDS=3, TEST_SAMPLE_SIZE=50):
    all_folds = []
    for fold in range(0, NUM_FOLDS):
        class_folds = {"train": [], "test": []}
        for _, group in data.groupby("Class_ID"):
            num_samples = group.shape[0]
            test_mask = np.zeros(num_samples, dtype=bool)
            if TEST_SAMPLE_SIZE * NUM_FOLDS > num_samples:
                start = fold * TEST_SAMPLE_SIZE
                end = start + TEST_SAMPLE_SIZE
                ix = [i % num_samples for i in range(start, end)]
            else:
                class_fold_size = num_samples // NUM_FOLDS
                start = fold * class_fold_size
                end = start + class_fold_size
                ix = range(start, end)

            test_mask[ix] = True
            try:
                class_folds["test"].append(group[test_mask].sample(
                    n=TEST_SAMPLE_SIZE, random_state=0))
            except ValueError:
                # Not enough rows left in this class to sample a full test fold.
                logging.warning('fold %d: could not sample %d test rows',
                                fold, TEST_SAMPLE_SIZE)
            class_folds["train"].append(group[~test_mask])

        class_folds["test"] = pd.concat(class_folds["test"])
        class_folds["train"] = pd.concat(class_folds["train"])
        all_folds.append(class_folds)
    return all_folds
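A minimal usage sketch for generate_fold (toy DataFrame and column values assumed purely for illustration):

# Illustrative only: three balanced classes of 120 samples each.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "Class_ID": np.repeat([0, 1, 2], 120),
    "feature": np.random.rand(360),
})
folds = generate_fold(toy, NUM_FOLDS=3, TEST_SAMPLE_SIZE=50)
for k, fold in enumerate(folds):
    print(k, fold["train"].shape, fold["test"].shape)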
Example #2
def get_sequences(data):
    grouped = data.groupby('userId')
    res = []
    users = []
    for user_id, group in grouped:
        seq = group.sort_values('timestamp')['movieId'].values
        res.append(list(seq.astype(str)))
        users.append(user_id)
    return np.array(users), res
Example #3
def calc_mean_ndcg_wrmf(model, data, k):
    grouped = data.groupby('userId')
    res = []
    for user_id, group in grouped:
        ranked = WRMFEmbedded.rank_items(model, user_id,
                                         group['movieId'].values)
        y_pred = np.array([x[1] for x in ranked])
        y_true = group['rating'].values
        ndcg = calc_ndcg(y_true, y_pred, k)
        res.append(ndcg)
    return np.mean(res)
Example #4
def calc_mean_ndcg_als(model, csr_train, data, k):
    grouped = data.groupby('userId')
    res = []
    for user_id, group in grouped:
        ranked = model.rank_items(user_id, csr_train, group['movieId'].values)
        ranked.sort(key=lambda x: x[0])
        y_pred = np.array([x[1] for x in ranked])
        y_true = group.sort_values('movieId')['rating'].values
        ndcg = calc_ndcg(y_true, y_pred, k)
        res.append(ndcg)
    return np.mean(res)
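Examples #3 and #4 call a calc_ndcg helper that is not shown above; a hypothetical sketch of such a helper, using the standard DCG/IDCG formulation (an assumption, not the original implementation):

# Hypothetical helper (assumption, not the original): NDCG@k with the usual
# log2 discount; y_pred are scores aligned element-wise with y_true.
import numpy as np

def calc_ndcg(y_true, y_pred, k):
    order = np.argsort(y_pred)[::-1][:k]   # rank items by descending predicted score
    gains = y_true[order]
    dcg = np.sum(gains / np.log2(np.arange(2, len(gains) + 2)))
    ideal = np.sort(y_true)[::-1][:k]       # best possible ordering of the true relevances
    idcg = np.sum(ideal / np.log2(np.arange(2, len(ideal) + 2)))
    return dcg / idcg if idcg > 0 else 0.0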
Example #5
def transform_data_to_file_folder_structure(path_to_csv, path_to_data_dir):
    data = pd.read_csv(path_to_csv)
    data.Date = pd.to_datetime(data.Date)
    data['day'] = (data.Date - pd.Timestamp(year=2017, month=1, day=1)).dt.days

    data = data.groupby(['DummyUserId', 'day']).agg('sum').reset_index()
    user_ids = data.DummyUserId.unique()
    size_data = len(user_ids)
    # print(int(np.floor(0.6*size_data)))

    np.random.seed(11)

    train = np.random.choice(user_ids,
                             size=int(np.floor(0.6 * size_data)),
                             replace=False)
    user_ids = user_ids[~np.isin(user_ids, train)]
    validate = np.random.choice(user_ids,
                                size=int(np.floor(0.2 * size_data)),
                                replace=False)
    user_ids = user_ids[~np.isin(user_ids, validate)]
    test = np.random.choice(user_ids,
                            size=int(np.floor(0.2 * size_data)),
                            replace=False)
    # print(len(user_ids))

    badge_achievements = {}

    for dset in [train, validate, test]:
        for user, trajectory in data[data.DummyUserId.isin(dset)].groupby(
                'DummyUserId'):
            trajectory = trajectory.sort_values('day')
            badge = {}
            for b in BADGES:
                idxs = np.where(trajectory[b] == 1)[0]
                if len(idxs) > 0:
                    badge[b] = [int(i) for i in idxs]
            badge_achievements[user] = badge

            action_trajectory = torch.tensor(trajectory[ACTIONS].values,
                                             dtype=torch.long)
            torch.save(action_trajectory,
                       '{}/user_{}.pt'.format(path_to_data_dir, user))

    with open('{}/badge_achievements.json'.format(path_to_data_dir), 'w') as f:
        json.dump(badge_achievements, f)

    with open('{}/data_indexes.json'.format(path_to_data_dir), 'w') as f:
        obj = {}
        obj['train'] = [int(u) for u in train]
        obj['test'] = [int(u) for u in test]
        obj['validate'] = [int(u) for u in validate]
        json.dump(obj, f)
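This function, like process_data in Example #14 below, assumes module-level BADGES and ACTIONS lists naming the badge and action columns of the CSV; the values below are purely illustrative placeholders:

# Purely illustrative placeholders; the real lists come from the so_badges.csv schema.
BADGES = ['Electorate', 'CivicDuty']
ACTIONS = ['questions', 'answers', 'comments', 'edits']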
Example #6
def split_data(data, test_part=0.2, min_test=10):
    grouped = data.groupby('userId')
    train = []
    test = []
    for name, group in grouped:
        entries = group.sort_values('timestamp')
        test_cnt = max(min_test, int(len(entries) * test_part))
        train.append(entries[:-test_cnt])
        test.append(entries[-test_cnt:])

    data_train = pd.concat(train)
    data_test = pd.concat(test)
    return data_train, data_test
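A short usage sketch for split_data, assuming a MovieLens-style ratings frame (toy data, for illustration only):

# Illustrative only: per-user temporal split, last ~20% (min 10) of each user's rows as test.
import pandas as pd

ratings = pd.DataFrame({
    'userId':    [1] * 60 + [2] * 60,
    'movieId':   list(range(60)) * 2,
    'rating':    [4.0] * 120,
    'timestamp': list(range(120)),
})
train_df, test_df = split_data(ratings, test_part=0.2, min_test=10)
print(len(train_df), len(test_df))  # 96 24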
Example #7
    def __init__(self, data, samples_in_a_row, shuffle=False):
        super().__init__(data)

        data = data[['sirna']].copy()
        data.index = np.arange(len(data))
        buckets = [bucket.index.values for _, bucket in data.groupby('sirna')]
        max_len = min(len(bucket) for bucket in buckets)
        print('max_len: {}, {}'.format(max_len, max_len // samples_in_a_row * samples_in_a_row))
        max_len = max_len // samples_in_a_row * samples_in_a_row

        self.max_len = max_len
        self.buckets = buckets
        self.samples_in_a_row = samples_in_a_row
        self.shuffle = shuffle
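Example #7 shows only the sampler's __init__; a hypothetical sketch of the __iter__/__len__ methods such a class-balanced sampler could pair with (an assumption, not the original code; np is the numpy module already imported for __init__):

    # Hypothetical companion methods: emit max_len indices per sirna bucket,
    # in runs of samples_in_a_row, so every class contributes equally.
    def __iter__(self):
        chunks = []
        for bucket in self.buckets:
            b = np.array(bucket)
            if self.shuffle:
                b = np.random.permutation(b)
            chunks.append(b[:self.max_len].reshape(-1, self.samples_in_a_row))
        chunks = np.concatenate(chunks, 0)
        if self.shuffle:
            # Shuffle whole runs while keeping each run of samples_in_a_row intact.
            chunks = np.random.permutation(chunks)
        return iter(chunks.reshape(-1).tolist())

    def __len__(self):
        return self.max_len * len(self.buckets)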
Example #8
    def artists_pca(self, data):
        data['artists'] = data['artists'].apply(get_first_artist)
        data['artists'] = data['artists'].astype('category')
        grouped_by_artist = data.groupby('artists').mean(numeric_only=True)
        pca = PCA(n_components=self.pca_components)
        pca_components = pd.DataFrame(
            pca.fit_transform(grouped_by_artist),
            columns=['PCA%i' % i for i in range(self.pca_components)],
            index=grouped_by_artist.index)
        merged_data = pd.merge(data,
                               pca_components,
                               left_on='artists',
                               right_index=True,
                               how='inner')
        return merged_data.drop('artists', axis=1)
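artists_pca relies on a get_first_artist helper that is not shown; a hypothetical sketch, assuming the 'artists' column stores a stringified list such as "['Artist A', 'Artist B']":

# Hypothetical helper (the column format is an assumption, not confirmed by the source).
import ast

def get_first_artist(artists):
    try:
        parsed = ast.literal_eval(artists)
        return parsed[0] if parsed else artists
    except (ValueError, SyntaxError):
        # Not a Python-literal list; fall back to the raw value.
        return artists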
Example #9
    def __init__(self, data, batch_size, shuffle=False, drop_last=False):
        super().__init__(data)

        data = pd.DataFrame({
            "i": range(len(data)),
            "size": data,
        }).sort_values("size")

        batches = [
            group["i"]
            for _, group in data.groupby(np.arange(len(data)) // batch_size)
        ]
        batches = [b for b in batches if len(b) > 0]
        if drop_last:
            batches = [b for b in batches if len(b) == batch_size]

        self.batches = batches
        self.shuffle = shuffle
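As in Example #7, only __init__ is shown; a hypothetical sketch of the __iter__/__len__ methods a size-bucketed batch sampler like this could use (an assumption, not the original code; `random` would be imported at module level):

    # Hypothetical companion methods: yield each size-sorted batch of indices,
    # optionally shuffling the order of the batches themselves.
    def __iter__(self):
        batches = list(self.batches)
        if self.shuffle:
            random.shuffle(batches)
        for batch in batches:
            yield batch.tolist()

    def __len__(self):
        return len(self.batches)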
Example #10
    def __init__(self, path, csv_file, train, transform=[]):
        data = pd.read_csv(os.path.join(path, csv_file))
        image_path = '/home/we/zsw/train/'
        data['img_file'] = image_path + data['_id'].astype(str) + '_' \
                            + data['cum_sum'].astype(str) + '.jpg'

        gb = data.groupby('level1')
        keys = gb.groups.keys()

        self.transform = transform
        self.num_file = gb.size().max()  # max number of images in any level1 group
        self.train = train
        self.img_file = []
        self.level1 = []
        self.subcategory = []
        self.sublength = []
        for item in keys:
            group = gb.get_group(item)
            self.img_file.append(list(group['img_file']))
            self.level1.append(item)
            self.subcategory.append(list(group['level1_sub_class']))
            self.sublength.append(len(group))
Example #11
def gen_batch_data(data):
    cols = list(data.columns)
    del_cols_index = [cols.index(col) for col in ['sid', 'pid', 'click_mode']]
    # Popping positional indexes shifts the remaining ones, so build the kept list directly.
    sel_cols_index = [i for i in range(len(cols)) if i not in del_cols_index]

    grouped = data.groupby('sid')
    batch_feas_list = []
    batch_click_mode_list = []
    for _, group in tqdm(grouped):
        grouped_values = group.values
        batch_click_mode = grouped_values[:, del_cols_index[-1]][0]
        batch_feas = torch.tensor(grouped_values[:, sel_cols_index]).type(
            torch.FloatTensor)
        batch_feas_list.append(batch_feas)
        batch_click_mode_list.append(batch_click_mode)
    batch_click_mode_list = torch.tensor(batch_click_mode_list).type(
        torch.LongTensor)
    # first padding step
    # TODO: check why the total size does not add up to 10000
    print('list_len:', len(batch_click_mode_list))
    return batch_feas_list, batch_click_mode_list
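gen_batch_data returns a list of variable-length per-sid tensors; one possible way to pad them into a single batch tensor with torch.nn.utils.rnn.pad_sequence (illustrative, not part of the original pipeline):

# Illustrative only: zero-pad the per-sid tensors into one (num_sids, max_rows, num_feas) batch.
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_batch(batch_feas_list):
    lengths = torch.tensor([t.shape[0] for t in batch_feas_list])
    padded = pad_sequence(batch_feas_list, batch_first=True, padding_value=0.0)
    return padded, lengths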
Example #12
def main(dataset_path, workers):
    transform = T.Compose([
        ApplyTo(
            ['image'],
            T.Compose([
                SplitInSites(),
                T.Lambda(
                    lambda xs: torch.stack([ToTensor()(x) for x in xs], 0)),
            ])),
        Extract(['image']),
    ])

    train_data = pd.read_csv(os.path.join(dataset_path, 'train.csv'))
    train_data['root'] = os.path.join(dataset_path, 'train')
    test_data = pd.read_csv(os.path.join(dataset_path, 'test.csv'))
    test_data['root'] = os.path.join(dataset_path, 'test')
    data = pd.concat([train_data, test_data])

    stats = {}
    for (exp, plate), group in tqdm(data.groupby(['experiment', 'plate'])):
        dataset = TestDataset(group, transform=transform)
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_size=32,
                                                  num_workers=workers)

        with torch.no_grad():
            images = [images for images, in data_loader]
            images = torch.cat(images, 0)
            mean = images.mean((0, 1, 3, 4))
            std = images.std((0, 1, 3, 4))
            stats[(exp, plate)] = mean, std

            del images, mean, std
            gc.collect()

    torch.save(stats, 'plate_stats.pth')
Example #13
for col in data.columns:
    if (data[col].dtype == 'object') and (col != 'UID'):
        data = encode_count(data, col)

train = data.drop(['merchant', 'UID'], axis=1).fillna(-1)
label = data['merchant'].values

if os.path.exists('./feature/merchant_np.npy'):

    merchant_weight = np.load('./feature/merchant_np.npy')
    for item in ['merchant']:

        result = (data.groupby('UID')[item]
                  .apply(max_list)
                  .reset_index()
                  .rename(columns={item: 'arr_%s' % item})
                  .fillna(0))
        y = y.merge(result[['UID', 'arr_%s' % item]], on=['UID'],
                    how='left').fillna(0)
        sub = sub.merge(result[['UID', 'arr_%s' % item]],
                        on=['UID'],
                        how='left').fillna(0)

    # NOTE: astype(int) is required before indexing into merchant_weight.
    for dat in [y, sub]:
        dat['new_merchant'] = dat['arr_merchant'].astype(int).apply(
            lambda x: merchant_weight[x])
    for i in range(100):
        y['new_merchant_%d' % i] = y['new_merchant'].apply(lambda x: x[i])
        sub['new_merchant_%d' % i] = sub['new_merchant'].apply(lambda x: x[i])
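Example #13 calls encode_count and max_list helpers that are not shown; a common shape for encode_count is sketched below as an assumption (max_list is presumably a per-UID aggregator and is left undefined here):

# Hypothetical sketch of encode_count (assumption, not the original code):
# integer-encode one categorical column in place.
from sklearn.preprocessing import LabelEncoder

def encode_count(df, col):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    return df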
Example #14
def process_data(base_path):
    import pandas as pd

    # processed_dataset = {}
    # validation == 1000 samples
    # train === 5000 samples
    # test === 1000 samples
    # convert to number of actions per week
    # edit out the badge outcome variables

    print("Processing raw data")

    output_fname = os.path.join(base_path, 'so_data.pkl')

    labels = ['train', 'valid', 'test']

    input_fname = os.path.join(csv_path, 'so_badges.csv')
    data = pd.read_csv(input_fname)
    data.Date = pd.to_datetime(data.Date)
    data['week'] = (data.Date - pd.Timestamp(year=2017, month=1, day=1)).dt.days

    data = data.groupby(['DummyUserId', 'week']).agg('sum').reset_index()
    badge_ixs = data[data.Electorate > 0]
    max_week = data.week.max()
    badge_ixs = badge_ixs[badge_ixs.week > 45]
    badge_ixs = badge_ixs[badge_ixs.week < max_week - 46]
    badge_ixs = badge_ixs.DummyUserId

    print(len(badge_ixs.unique()))

    indexes = badge_ixs.unique()
    train = np.random.choice(indexes, size=4000, replace=False)
    indexes = indexes[~np.isin(indexes, train)]
    validate = np.random.choice(indexes, size=1000, replace=False)
    indexes = indexes[~np.isin(indexes, validate)]
    test = np.random.choice(indexes, size=1000, replace=False)

    # data.set_index('DummyUserId', inplace=True)
    processed_dataset = {}

    for s, dset in enumerate([train, validate, test]):

        split = labels[s]
        processed_dataset[split] = {}

        sub_data = data[data.DummyUserId.isin(dset)]
        n_seqs = len(dset)

        processed_dataset[split]['sequence_lengths'] = torch.zeros(
            n_seqs, dtype=torch.long)
        processed_dataset[split]['sequences'] = []
        processed_dataset[split]['outcomes'] = []
        idx = 0

        for u_id, seqs in sub_data.groupby('DummyUserId'):
            seqs = seqs.sort_values('week')

            out = {}
            for b in BADGES:
                idxs = np.where(seqs[b] == 1)[0]
                if len(idxs) > 0:
                    out[b] = torch.tensor(idxs, dtype=torch.long)

            # The Electorate badge is awarded at most once, so take its single week index.
            civic_duty = int(out['Electorate'][0])
            days = 90

            action_vec = seqs[ACTIONS].values[
                civic_duty - days // 2:civic_duty + days // 2, :]
            out['Electorate'] = torch.tensor([days // 2], dtype=torch.long)

            processed_dataset[split]['sequence_lengths'][idx] = days
            processed_sequence = torch.tensor(action_vec, dtype=torch.long)
            processed_dataset[split]['sequences'].append(processed_sequence)

            processed_dataset[split]['outcomes'].append(out)
            idx += 1

    pickle.dump(processed_dataset, open(output_fname, "wb"),
                pickle.HIGHEST_PROTOCOL)
    print("dumped processed data to %s" % output_fname)
Example #15
def get_scaleing(data, scaler, scaler_range, train):
    sensors = [
        's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
        's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21'
    ]
    operating_condition = ["oc_0", "oc_1", "oc_2", "oc_3", "oc_4", "oc_5"]

    global scalingparams

    if "operating_condition" in data.columns:
        print("\n\t Scaling by Clusters")
        groupby_oc = data.groupby("operating_condition", sort=False)
        scaled_sensors = []

        if train:
            scalingparams = {}
            scalingparams['scaler'] = scaler
            scalingparams['scaler_range'] = scaler_range

            for operating_condition, oc_data in groupby_oc:

                if scalingparams['scaler'] == 'mm':
                    min_ = np.min(oc_data[sensors])
                    max_ = np.max(oc_data[sensors])

                    scaled_data = (((oc_data[sensors] - min_) / (max_ - min_)) *
                                   (scalingparams['scaler_range'][1] -
                                    scalingparams['scaler_range'][0])
                                   ) + scalingparams['scaler_range'][0]

                    scalingparams['min_oc' + str(operating_condition)] = min_
                    scalingparams['max_oc' + str(operating_condition)] = max_
                elif scalingparams['scaler'] == 'ss':
                    mean_ = np.mean(oc_data[sensors])
                    std_ = np.std(oc_data[sensors])

                    scaled_data = (oc_data[sensors] - mean_) / std_

                    scalingparams['mean_oc' + str(operating_condition)] = mean_
                    scalingparams['std_oc' + str(operating_condition)] = std_
                scaled_sensors.append(scaled_data)
        else:
            for operating_condition, oc_data in groupby_oc:

                if scalingparams['scaler'] == 'mm':
                    min_ = scalingparams['min_oc' + str(operating_condition)]
                    max_ = scalingparams['max_oc' + str(operating_condition)]
                    scalingparams['scaler_range'] = scaler_range

                    scaled_data = (((oc_data[sensors] - min_) / (max_ - min_)) *
                                   (scalingparams['scaler_range'][1] -
                                    scalingparams['scaler_range'][0])
                                   ) + scalingparams['scaler_range'][0]

                elif scalingparams['scaler'] == 'ss':
                    mean_ = scalingparams['mean_oc' + str(operating_condition)]
                    std_ = scalingparams['std_oc' + str(operating_condition)]

                    scaled_data = (oc_data[sensors] - mean_) / std_

                scaled_sensors.append(scaled_data)

        scaled_df = pd.concat(scaled_sensors, sort=False)
        scaled_df = scaled_df.sort_index(axis=0, ascending=True)

    else:
        print("\n\t Scaling Without Clusters")
        if train:
            scalingparams = {}
            scalingparams['scaler'] = scaler
            scalingparams['scaler_range'] = scaler_range

            if scalingparams['scaler'] == 'mm':
                min_ = np.min(data[sensors])
                max_ = np.max(data[sensors])

                scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                               (scalingparams['scaler_range'][1] -
                                scalingparams['scaler_range'][0])
                               ) + scalingparams['scaler_range'][0]

                scalingparams['min_'] = min_
                scalingparams['max_'] = max_
            elif scalingparams['scaler'] == 'ss':
                mean_ = np.mean(data[sensors])
                std_ = np.std(data[sensors])

                scaled_data = (data[sensors] - mean_) / std_

                scalingparams['mean_'] = mean_
                scalingparams['std_'] = std_

        else:
            if scalingparams['scaler'] == 'mm':
                min_ = scalingparams['min_']
                max_ = scalingparams['max_']
                scalingparams['scaler_range'] = scaler_range
                scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                               (scalingparams['scaler_range'][1] -
                                scalingparams['scaler_range'][0])
                               ) + scalingparams['scaler_range'][0]

            elif scalingparams['scaler'] == 'ss':
                mean_ = scalingparams['mean_']
                std_ = scalingparams['std_']

                scaled_data = (data[sensors] - mean_) / std_

        scaled_df = scaled_data

    scaled_df = scaled_df.dropna(axis=1)
    cols_wo_na = scaled_df.columns
    print("\n\t scaled_df after dropNA {} \n column names {}".format(
        scaled_df.shape, cols_wo_na))

    return scaled_df, cols_wo_na
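A short usage sketch for get_scaleing, assuming train and test frames that contain the s1..s21 sensor columns (toy data, for illustration only):

# Illustrative only: fit the scaling parameters on train, reuse them on test.
import numpy as np
import pandas as pd

sensor_cols = ['s{}'.format(i) for i in range(1, 22)]
train_df = pd.DataFrame(np.random.rand(100, 21), columns=sensor_cols)
test_df = pd.DataFrame(np.random.rand(40, 21), columns=sensor_cols)

scaled_train, kept_cols = get_scaleing(train_df, scaler='mm',
                                       scaler_range=(-1, 1), train=True)
scaled_test, _ = get_scaleing(test_df, scaler='mm',
                              scaler_range=(-1, 1), train=False)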