import os

import numpy as np

from data_frame import DataFrame  # the project's custom DataFrame helper (module path assumed)


class DataReader(object):

    def __init__(self, data_dir):
        data_cols = ['x', 'x_len', 'c', 'c_len']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train'
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val'
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            mode='test'
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, mode='train'):
        gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test')
        )
        for batch in gen:
            # next-step prediction: inputs are x[:, :-1], targets are x[:, 1:]
            batch['x_len'] = batch['x_len'] - 1
            max_x_len = np.max(batch['x_len'])
            max_c_len = np.max(batch['c_len'])
            batch['y'] = batch['x'][:, 1:max_x_len + 1, :]
            batch['x'] = batch['x'][:, :max_x_len, :]
            batch['c'] = batch['c'][:, :max_c_len]
            yield batch
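# The readers in this section all depend on a small custom DataFrame utility
# (not pandas). The class below is only a minimal sketch of the interface that
# the calls in this section imply (columns/data construction, len, column
# indexing, shapes, train_test_split, batch_generator yielding DataFrame-like
# batches). The project's real implementation may differ in details.
class _DataFrameSketch(object):

    def __init__(self, columns, data):
        # parallel lists: column names and equal-length numpy arrays
        self.columns = list(columns)
        self.data = dict(zip(self.columns, data))

    def __len__(self):
        # number of rows, not number of columns
        return len(self.data[self.columns[0]])

    def __getitem__(self, col):
        return self.data[col]

    def __setitem__(self, col, arr):
        # the readers assign new columns (e.g. batch['y'] = ...) onto batches
        if col not in self.data:
            self.columns.append(col)
        self.data[col] = arr

    def shapes(self):
        return {col: arr.shape for col, arr in self.data.items()}

    def train_test_split(self, train_size, random_state=None):
        rng = np.random.RandomState(random_state)
        idx = rng.permutation(len(self))
        n_train = int(train_size * len(self))
        subset = lambda ids: _DataFrameSketch(self.columns, [self.data[c][ids] for c in self.columns])
        return subset(idx[:n_train]), subset(idx[n_train:])

    def batch_generator(self, batch_size, shuffle=True, num_epochs=10000, allow_smaller_final_batch=False):
        for _ in range(num_epochs):
            idx = np.random.permutation(len(self)) if shuffle else np.arange(len(self))
            for start in range(0, len(self), batch_size):
                batch_idx = idx[start:start + batch_size]
                if len(batch_idx) < batch_size and not allow_smaller_final_batch:
                    continue
                yield _DataFrameSketch(self.columns, [self.data[c][batch_idx] for c in self.columns])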
class DataReader(object):
    # Only the __init__ was given for this reader; the enclosing class is assumed
    # to follow the same DataReader pattern as the others in this section.

    def __init__(self, data_dir):
        data_cols = ['x', 'y']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))

        # number of distinct product ids and their per-product counts in the training split
        self.num_products = df['x'].max() + 1
        self.product_dist = np.bincount(self.train_df['x']).tolist()
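# A tiny worked example of the two statistics computed in the __init__ above.
# np.bincount gives one count per product id; normalizing it would yield an
# empirical frequency distribution over products (how it is used downstream is
# not shown in this section).
x = np.array([3, 1, 3, 0, 3, 1])
num_products = x.max() + 1               # 4
product_dist = np.bincount(x).tolist()   # [1, 2, 0, 3]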
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test
        )
        for batch in batch_gen:
            # shift the per-order features left by one so position t describes order t + 1
            batch['order_dow_history'] = np.roll(batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(batch['order_number_history'], -1, axis=1)
            # the target is the reorder size of the next order
            batch['next_reorder_size'] = np.roll(batch['reorder_size_history'], -1, axis=1)
            if not is_test:
                # drop the final position, whose rolled values wrapped around
                batch['history_length'] = batch['history_length'] - 1
            yield batch
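# How the np.roll(..., -1, axis=1) calls above line up inputs and targets:
# every history column is shifted one order to the left, so position t of a
# rolled array holds the value belonging to order t + 1 while the unrolled
# columns still describe order t. Shortening history_length by one discards the
# final position, whose rolled value wrapped around from the start. A small
# illustration with made-up numbers:
reorder_size_history = np.array([[2, 0, 5, 1]])
next_reorder_size = np.roll(reorder_size_history, -1, axis=1)
# next_reorder_size -> [[0, 5, 1, 2]]; with history_length reduced from 4 to 3,
# only the first three positions are valid next-order targets, the trailing 2
# is wrap-around and is ignored.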
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'data',
            'isNAN',
            'page_id',
            'project',
            'access',
            'agent',
            'test_data',
            'test_isNAN'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.testDataframe = DataFrame(columns=data_cols, data=data)
        self.trainDataframe, self.valDataframe = self.testDataframe.train_test_split(train_size=0.95)

        print('train size', len(self.trainDataframe))
        print('val size', len(self.valDataframe))
        print('test size', len(self.testDataframe))

    def trainbatchGeneration(self, batchSize):
        return self.batch_generator(
            batchSize=batchSize,
            df=self.trainDataframe,
            shuffle=True,
            epochsCount=10000,
            isTestData=False
        )

    def val_batch_generator(self, batchSize):
        return self.batch_generator(
            batchSize=batchSize,
            df=self.valDataframe,
            shuffle=True,
            epochsCount=10000,
            isTestData=False
        )

    def test_batch_generator(self, batchSize):
        return self.batch_generator(
            batchSize=batchSize,
            df=self.testDataframe,
            shuffle=True,
            epochsCount=1,
            isTestData=True
        )

    def batch_generator(self, batchSize, df, shuffle=True, epochsCount=10000, isTestData=False):
        # keyword names passed through to DataFrame.batch_generator are matched to the
        # signature used by the other readers in this section (batch_size / num_epochs)
        batch_gen = df.batch_generator(
            batch_size=batchSize,
            shuffle=shuffle,
            num_epochs=epochsCount,
            allow_smaller_final_batch=isTestData
        )
        data_col = 'test_data' if isTestData else 'data'
        isNAN_col = 'test_isNAN' if isTestData else 'isNAN'
        for batch in batch_gen:
            decodeCount = 64
            fulllength = batch[data_col].shape[1]
            maxLengthOfencode = fulllength - decodeCount if not isTestData else fulllength

            xEncode = np.zeros([len(batch), maxLengthOfencode])
            ydecode = np.zeros([len(batch), decodeCount])
            isNANEncode = np.zeros([len(batch), maxLengthOfencode])
            isNANdecode = np.zeros([len(batch), decodeCount])
            lengthOfencode = np.zeros([len(batch)])
            lengthOfDecode = np.zeros([len(batch)])

            for i, (seq, nan_seq) in enumerate(zip(batch[data_col], batch[isNAN_col])):
                # at train time, cut the encoder off at a random point within the last year
                rand_len = np.random.randint(maxLengthOfencode - 365 + 1, maxLengthOfencode + 1)
                x_lengthOfencode = maxLengthOfencode if isTestData else rand_len

                xEncode[i, :x_lengthOfencode] = seq[:x_lengthOfencode]
                isNANEncode[i, :x_lengthOfencode] = nan_seq[:x_lengthOfencode]
                lengthOfencode[i] = x_lengthOfencode
                lengthOfDecode[i] = decodeCount

                if not isTestData:
                    # the 64 steps following the encoder window are the decoder targets
                    ydecode[i, :] = seq[x_lengthOfencode: x_lengthOfencode + decodeCount]
                    isNANdecode[i, :] = nan_seq[x_lengthOfencode: x_lengthOfencode + decodeCount]

            batch['xEncode'] = xEncode
            batch['lengthOfencode'] = lengthOfencode
            batch['ydecode'] = ydecode
            batch['lengthOfDecode'] = lengthOfDecode
            batch['isNANEncode'] = isNANEncode
            batch['isNANdecode'] = isNANdecode
            yield batch
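# Minimal usage sketch for the reader above; the data_dir path and the way the
# arrays would be fed to a model are assumptions, not part of the original code.
reader = DataReader(data_dir='data/processed')        # hypothetical path
for batch in reader.trainbatchGeneration(batchSize=128):
    x = batch['xEncode']             # [128, max_encode_len] encoder inputs (zero-padded)
    x_len = batch['lengthOfencode']  # [128] true encoder lengths
    y = batch['ydecode']             # [128, 64] next 64 values to predict
    y_nan = batch['isNANdecode']     # [128, 64] indicator of imputed target values
    break                            # one batch is enough for the sketch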
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'product_id',
            'aisle_id',
            'department_id',
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'reorder_size_history',
            'order_is_weekend_history',
            'order_part_of_day_history',
            'order_number_history',
            'history_length',
            'product_name',
            'product_name_length',
            'eval_set',
            'label'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test
        )
        for batch in batch_gen:
            # shift per-order features left by one so position t describes order t + 1
            batch['order_dow_history'] = np.roll(batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_is_weekend_history'] = np.roll(batch['order_is_weekend_history'], -1, axis=1)
            batch['order_part_of_day_history'] = np.roll(batch['order_part_of_day_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(batch['order_number_history'], -1, axis=1)
            # per-timestep binary target: was this product ordered in the next order?
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'], -1, axis=1)
            # rows with product_id 0 are treated as the "None" product
            batch['is_none'] = batch['product_id'] == 0
            if not is_test:
                # drop the final position, whose rolled values wrapped around
                batch['history_length'] = batch['history_length'] - 1
            yield batch
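# Only the first history_length steps of each row are real orders; the rest is
# padding. A minimal numpy sketch of the mask a per-timestep loss over
# batch['next_is_ordered'] would need (the model code itself is not part of
# this section):
history_length = np.array([3, 5])   # valid steps for two example rows
max_len = 5                         # padded sequence length in the batch
mask = np.arange(max_len)[None, :] < history_length[:, None]
# mask -> [[ True,  True,  True, False, False],
#          [ True,  True,  True,  True,  True]]
# multiplying the per-step loss by this mask ignores padded positions.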
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'x_raw',
            'onpromotion',
            'id',
            'x',
            'store_nbr',
            'item_nbr',
            'city',
            'state',
            'type',
            'cluster',
            'family',
            'class',
            'perishable',
            'is_discrete',
            'start_date',
            'x_lags',
            'xy_lags',
            'ts',
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95)

        # vocabulary sizes for the categorical embedding columns
        self.num_city = self.test_df['city'].max() + 1
        self.num_state = self.test_df['state'].max() + 1
        self.num_type = self.test_df['type'].max() + 1
        self.num_cluster = self.test_df['cluster'].max() + 1
        self.num_family = self.test_df['family'].max() + 1
        self.num_item_class = self.test_df['class'].max() + 1
        self.num_perishable = self.test_df['perishable'].max() + 1
        self.num_store_nbr = self.test_df['store_nbr'].max() + 1
        self.num_item_nbr = self.test_df['item_nbr'].max() + 1

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train'
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val'
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=True,
            num_epochs=1,
            mode='test'
        )

    def batch_generator(self, batch_size, df, mode, shuffle=True, num_epochs=10000):
        batch_gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test')
        )
        for batch in batch_gen:
            num_decode_steps = 16
            full_seq_len = batch['x'].shape[1] - num_decode_steps
            max_encode_length = full_seq_len

            x = np.zeros([len(batch), max_encode_length])
            y = np.zeros([len(batch), num_decode_steps])
            x_raw = np.zeros([len(batch), max_encode_length])
            x_lags = np.zeros([len(batch), max_encode_length, batch['x_lags'].shape[2] + batch['xy_lags'].shape[2]])
            y_lags = np.zeros([len(batch), num_decode_steps, batch['xy_lags'].shape[2]])
            x_op = np.zeros([len(batch), max_encode_length])
            y_op = np.zeros([len(batch), num_decode_steps])
            x_len = np.zeros([len(batch)])
            y_len = np.zeros([len(batch)])
            x_idx = np.zeros([len(batch), max_encode_length])
            y_idx = np.zeros([len(batch), num_decode_steps])
            y_id = np.zeros([len(batch), num_decode_steps])
            x_ts = np.zeros([len(batch), max_encode_length, batch['ts'].shape[2]])

            # perishable items get a larger sample weight (1.25 vs 1.0)
            weights = np.zeros([len(batch)])
            weights[batch['perishable'] == 1] = 1.25
            weights[batch['perishable'] == 0] = 1.0

            for i, (data, data_raw, start_idx, x_lag, xy_lag, op, uid, ts) in enumerate(zip(
                    batch['x'], batch['x_raw'], batch['start_date'], batch['x_lags'],
                    batch['xy_lags'], batch['onpromotion'], batch['id'], batch['ts'])):
                seq_len = full_seq_len - start_idx
                val_window = 365
                train_window = 365

                # choose where the encoder window ends; the following
                # num_decode_steps positions become the decoder targets
                if mode == 'train':
                    if seq_len == 0:
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= train_window:
                        rand_encode_len = np.random.randint(0, seq_len)
                    else:
                        rand_encode_len = np.random.randint(seq_len - train_window, seq_len)
                    rand_decode_len = min(seq_len - rand_encode_len, num_decode_steps)

                elif mode == 'val':
                    if seq_len <= num_decode_steps:
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= val_window + num_decode_steps:
                        rand_encode_len = np.random.randint(0, seq_len - num_decode_steps + 1)
                    else:
                        rand_encode_len = np.random.randint(
                            seq_len - (val_window + num_decode_steps),
                            seq_len - num_decode_steps + 1
                        )
                    rand_decode_len = min(seq_len - rand_encode_len, num_decode_steps)

                elif mode == 'test':
                    rand_encode_len = seq_len
                    rand_decode_len = num_decode_steps

                end_idx = start_idx + rand_encode_len
                x[i, :rand_encode_len] = data[start_idx:end_idx]
                y[i, :rand_decode_len] = data[end_idx:end_idx + rand_decode_len]
                x_raw[i, :rand_encode_len] = data_raw[start_idx:end_idx]
                x_lags[i, :rand_encode_len, :x_lag.shape[1]] = x_lag[start_idx:end_idx, :]
                x_lags[i, :rand_encode_len, x_lag.shape[1]:] = xy_lag[start_idx:end_idx, :]
                y_lags[i, :rand_decode_len, :] = xy_lag[end_idx:end_idx + rand_decode_len, :]
                x_op[i, :rand_encode_len] = op[start_idx:end_idx]
                y_op[i, :rand_decode_len] = op[end_idx:end_idx + rand_decode_len]
                x_ts[i, :rand_encode_len, :] = ts[start_idx:end_idx, :]
                # coarse log-scale position features for encoder and decoder steps
                x_idx[i, :rand_encode_len] = np.floor(np.log(np.arange(rand_encode_len) + 1))
                y_idx[i, :rand_decode_len] = np.floor(np.log(np.arange(rand_encode_len, rand_encode_len + rand_decode_len) + 1))
                y_id[i, :rand_decode_len] = uid[end_idx:end_idx + rand_decode_len]
                x_len[i] = end_idx - start_idx
                y_len[i] = rand_decode_len

            batch['x_'] = batch['x']
            batch['x'] = x
            batch['y'] = y
            batch['x_raw'] = x_raw
            batch['x_lags'] = x_lags
            batch['y_lags'] = y_lags
            batch['x_op'] = x_op
            batch['y_op'] = y_op
            batch['x_ts'] = x_ts
            batch['x_idx'] = x_idx
            batch['y_idx'] = y_idx
            batch['y_id'] = y_id
            batch['x_len'] = x_len
            batch['y_len'] = y_len
            batch['item_class'] = batch['class']
            batch['weights'] = weights
            yield batch
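# Minimal sketch of how a validation batch from the reader above could be
# scored. batch['weights'] carries the 1.25 / 1.0 perishable weighting set in
# batch_generator, batch['y_len'] masks decode steps beyond the sampled window,
# and the predictions here are just a placeholder array; the data_dir path and
# the loss form are assumptions, not part of the original code.
reader = DataReader(data_dir='data/processed')      # hypothetical path
for batch in reader.val_batch_generator(batch_size=64):
    y = batch['y']                                  # [batch, 16] decoder targets
    preds = np.zeros_like(y)                        # placeholder model predictions
    step_mask = np.arange(y.shape[1])[None, :] < batch['y_len'][:, None]
    sq_err = (preds - y) ** 2 * step_mask
    per_series = sq_err.sum(axis=1) / np.maximum(batch['y_len'], 1)
    loss = (batch['weights'] * per_series).sum() / np.maximum(batch['weights'].sum(), 1e-8)
    break                                           # one batch is enough for the sketch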