def __init__(self, data_dir):
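        # Excerpted from a data-reader class: assumes `import os`, `import numpy as np`,
        # and a project-specific DataFrame helper (providing shapes(), mask(),
        # train_test_split() and batch_generator()) rather than pandas.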
        data_cols = [
            'order_id',
            'product_id',
            'features',
            'label'
        ]
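        # memory-map one .npy file per column so large arrays are read lazily
        # instead of being loaded fully into RAM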
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        df = DataFrame(columns=data_cols, data=data)
        self.data_dim = df['features'].shape[1]

        print(df.shapes())
        print('loaded data')

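        # label == -1 marks rows without a known label (the test set); mask() here
        # evidently selects rows by boolean mask (custom helper, not pandas semantics)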
        self.test_df = df.mask(df['label'] == -1)
        self.train_df = df.mask(df['label'] != -1)
        self.train_df, self.val_df = self.train_df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

        self.feature_means = np.load(os.path.join(data_dir, 'feature_means.npy'))
        self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
        self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))
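
# The per-feature statistics loaded above are presumably used to normalise the
# feature matrix before training. A minimal min-max scaling sketch under that
# assumption (the helper name and epsilon are illustrative, not from the source):
def normalize_features(features, feature_mins, feature_maxs, eps=1e-8):
    """Scale each feature column into [0, 1] using precomputed per-feature stats."""
    return (features - feature_mins) / (feature_maxs - feature_mins + eps)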
Example #2
    def __init__(self, data_dir):
        data_cols = ['order_id', 'product_id', 'features', 'label']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        df = DataFrame(columns=data_cols, data=data)
        self.data_dim = df['features'].shape[1]

        print(df.shapes())
        print('loaded data')

        self.test_df = df.mask(df['label'] == -1)
        self.train_df = df.mask(df['label'] != -1)
        self.train_df, self.val_df = self.train_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

        self.feature_means = np.load(
            os.path.join(data_dir, 'feature_means.npy'))
        self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
        self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
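        # the full dataset is kept as test_df; train_df/val_df below are a 90/10
        # split of the same rows, so the printed 'test size' equals the total size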
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(batch_size, shuffle=shuffle, num_epochs=num_epochs, allow_smaller_final_batch=is_test)
        for batch in batch_gen:
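            # roll the order-level context one step left along the time axis so
            # position t carries order t+1's metadata; next_reorder_size at t is
            # then the size of the following order (evidently the prediction
            # target), and history_length is shortened by one for train/val since
            # the final position has no successor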
            batch['order_dow_history'] = np.roll(batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(batch['order_number_history'], -1, axis=1)
            batch['next_reorder_size'] = np.roll(batch['reorder_size_history'], -1, axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
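
# Usage sketch (hypothetical data_dir and batch size; assumes the .npy files
# named in data_cols exist under data_dir):
if __name__ == '__main__':
    reader = DataReader('data/processed')
    for i, batch in enumerate(reader.train_batch_generator(batch_size=128)):
        print(batch['order_size_history'].shape, batch['history_length'].shape)
        if i >= 2:  # peek at a few batches only
            break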
Example #4
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        is_test=False):
        batch_gen = df.batch_generator(batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(batch['order_dow_history'],
                                                 -1,
                                                 axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'],
                                                  -1,
                                                  axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            batch['next_reorder_size'] = np.roll(batch['reorder_size_history'],
                                                 -1,
                                                 axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
Example #5
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'user_id', 'product_id', 'aisle_id', 'department_id',
            'is_ordered_history', 'index_in_order_history',
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_size_history',
            'reorder_size_history', 'order_is_weekend_history',
            'order_part_of_day_history', 'order_number_history',
            'history_length', 'product_name', 'product_name_length',
            'eval_set', 'label'
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        is_test=False):
        batch_gen = df.batch_generator(batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        for batch in batch_gen:
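            # as in the reader above, shift the order-level context one step left
            # so position t describes order t+1; next_is_ordered then records
            # whether this product appears in the following order (presumably the
            # training label)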
            batch['order_dow_history'] = np.roll(batch['order_dow_history'],
                                                 -1,
                                                 axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'],
                                                  -1,
                                                  axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_is_weekend_history'] = np.roll(
                batch['order_is_weekend_history'], -1, axis=1)
            batch['order_part_of_day_history'] = np.roll(
                batch['order_part_of_day_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'],
                                               -1,
                                               axis=1)
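            # product_id == 0 appears to be a sentinel row; the flag name is_none
            # suggests it stands for the "no products reordered" case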
            batch['is_none'] = batch['product_id'] == 0
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
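
# Usage sketch for inference (hypothetical data_dir; the test generator makes a
# single unshuffled pass and allows a smaller final batch):
if __name__ == '__main__':
    reader = DataReader('data/processed')
    for batch in reader.test_batch_generator(batch_size=256):
        print(batch['user_id'].shape, batch['next_is_ordered'].shape)
        break  # inspect the first batch only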