Example #1
    def __init__(self, data_dir, output_dir, **kwargs):
        _data_cols = ['x', 'x_len', 'c', 'c_len', 'text']
        data_cols = []

        # Keep only the columns whose .npy files exist on disk
        for i in _data_cols:
            if os.path.exists(os.path.join(data_dir, '{}.npy'.format(i))):
                data_cols.append(i)

        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data, **kwargs)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018)
        date_str = datetime.now().strftime('%Y-%m-%d_%H-%M')

        np.save(Path(output_dir) / date_str, self.val_df.dict)
        # Load the strokes
        # np.load("../checkpoints/original/2020-03-13_15-03.npy", allow_pickle=True).item()["x"]
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
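A note on the API before the next examples: the DataFrame in most of these snippets is not pandas.DataFrame. Judging from the calls made on it (a columns/data constructor over NumPy arrays, shapes(), mask(), train_test_split(), batch_generator(), and the .dict attribute saved above), it is a thin wrapper around a list of aligned NumPy arrays; a few examples use unrelated classes that merely share the name (the Spark-backed epidata DataFrame in Examples #15, #22, #26, and #30, and a Tkinter frame in Example #14). The sketch below reconstructs the assumed interface from usage alone; names and semantics are inferred, not taken from any of the original repositories.

import numpy as np

class DataFrame(object):
    """Minimal sketch: a dict of aligned NumPy arrays keyed by column name."""

    def __init__(self, columns, data, **kwargs):
        # **kwargs accepted only for compatibility with the call sites; ignored.
        assert len(columns) == len(data), 'one array per column'
        lengths = {len(d) for d in data}
        assert len(lengths) == 1, 'all columns must share the first dimension'
        self.columns = list(columns)
        self.data = dict(zip(columns, data))
        self.length = lengths.pop()

    def __len__(self):
        return self.length

    def __getitem__(self, column):
        return self.data[column]

    def __setitem__(self, column, values):
        if column not in self.data:
            self.columns.append(column)
        self.data[column] = values

    @property
    def dict(self):
        return self.data

    def shapes(self):
        return {c: d.shape for c, d in self.data.items()}

    def mask(self, condition):
        # Boolean row selection (unlike pandas.DataFrame.mask, which
        # replaces values where the condition holds).
        return DataFrame(self.columns,
                         [self.data[c][condition] for c in self.columns])

    def train_test_split(self, train_size, random_state=None):
        rng = np.random.RandomState(random_state)
        idx = rng.permutation(self.length)
        cut = int(train_size * self.length)

        def subset(sel):
            return DataFrame(self.columns,
                             [np.asarray(self.data[c])[sel] for c in self.columns])

        return subset(idx[:cut]), subset(idx[cut:])

    def batch_generator(self, batch_size, shuffle=True, num_epochs=1,
                        allow_smaller_final_batch=False):
        for _ in range(num_epochs):
            idx = (np.random.permutation(self.length) if shuffle
                   else np.arange(self.length))
            for start in range(0, self.length, batch_size):
                sel = idx[start:start + batch_size]
                if len(sel) < batch_size and not allow_smaller_final_batch:
                    break
                # Yield a copy so callers can freely overwrite batch columns.
                yield DataFrame(self.columns,
                                [np.array(self.data[c][sel]) for c in self.columns])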
Example #2
    def __init__(self, data_dir):
        data_cols = ['order_id', 'product_id', 'features', 'label']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        df = DataFrame(columns=data_cols, data=data)
        self.data_dim = df['features'].shape[1]

        print(df.shapes())
        print('loaded data')

        self.test_df = df.mask(df['label'] == -1)
        self.train_df = df.mask(df['label'] != -1)
        self.train_df, self.val_df = self.train_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

        self.feature_means = np.load(
            os.path.join(data_dir, 'feature_means.npy'))
        self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
        self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))
Example #3
    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #4
    def __init__(self, data_dir):
        data_cols = [
            'user_id', 'product_id', 'aisle_id', 'department_id',
            'is_ordered_history', 'index_in_order_history',
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_size_history',
            'reorder_size_history', 'order_is_weekend_history',
            'order_part_of_day_history', 'order_number_history',
            'history_length', 'product_name', 'product_name_length',
            'eval_set', 'label'
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #5
    def __init__(self, data_dir):
        data_cols = [
            'order_id',
            'product_id',
            'features',
            'label'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        df = DataFrame(columns=data_cols, data=data)
        self.data_dim = df['features'].shape[1]

        print(df.shapes())
        print('loaded data')

        self.test_df = df.mask(df['label'] == -1)
        self.train_df = df.mask(df['label'] != -1)
        self.train_df, self.val_df = self.train_df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

        self.feature_means = np.load(os.path.join(data_dir, 'feature_means.npy'))
        self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
        self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))
Example #6
    def __init__(self, data_dir):
        data_cols = [
            'x_raw',
            'holidayinfo',
            'air_store_id2',
            'id',
            'x',
            'air_area_name',
            'air_genre_name',
            'latitude',
            'longitude',
            'start_date',
            'x_lags',
            'xy_lags',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        self.num_area_name = self.test_df['air_area_name'].max() + 1
        self.num_genre_name = self.test_df['air_genre_name'].max() + 1
        self.num_store_id = self.test_df['air_store_id2'].max() + 1

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #7
    def __init__(self, data_dir, seed):

        data_cols = ['all_df', 'all_id']
        train_data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)))
            for i in data_cols
        ]

        self.full_train = DataFrame(columns=data_cols, data=train_data)
        self.test_df = self.full_train
        # seed=np.random.randint(0, 1000000)
        # seed = 99 + seed
        self.train_df, self.val_df = self.full_train.train_test_split(
            train_size=0.9, random_state=seed)
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

        self.max_frames = 72
        self.GLOBAL_IS_VAL = False
        self.GLOBAL_IS_TEST = False
Example #8
    def __init__(self, data):
        columns = ['x', 'y']
        df = DataFrame(columns=columns, data=data)
        self.train_df, self.val_df = df.train_test_split(
            train_size=0.9, random_state=config.random_seed)

        self.num_products = df['y'].max() + 1
        self.product_dist = np.bincount(df['y']).tolist()
Example #9
    def __init__(self, data):
        columns = ['i', 'j', 'V_ij']
        df = DataFrame(columns=columns, data=data)
        self.train_df, self.val_df = df.train_test_split(
            train_size=0.9, random_state=config.random_seed)
        self.test_df = df

        self.num_users = df['i'].max() + 1
        self.num_products = df['j'].max() + 1
Example #10
    def __init__(self, data_dir):
        data_cols = ['i', 'j', 'V_ij']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))

        self.num_users = df['i'].max() + 1
        self.num_products = df['j'].max() + 1
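The max() + 1 sizing used in the last few examples assumes the ids are zero-based and contiguous, which makes it the right embedding-table size; np.bincount over the same column gives the empirical distribution stored as product_dist. A toy illustration (values made up):

import numpy as np

ids = np.array([0, 3, 1, 3, 2])
num_users = ids.max() + 1      # 4, since valid ids are 0..3
dist = np.bincount(ids)        # array([1, 1, 1, 2]): one count per id
print(num_users, dist.tolist())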
Example #11
    def __init__(self, data_dir):
        data_cols = ['x', 'y']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        df = DataFrame(columns=data_cols, data=data)

        self.train_df, self.val_df = df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))

        self.num_products = df['x'].max() + 1
        self.product_dist = np.bincount(self.train_df['x']).tolist()
Example #12
    def __init__(self, data_dir):
        data_cols = ['data', 'is_nan', 'page_id', 'project', 'access', 'agent']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)))
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #13
    def __init__(self, data_dir):
        data_cols = ['x', 'x_len', 'c', 'c_len']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)))
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #14
    def __init__(self):
        super().__init__()

        self.winfo_toplevel().title("Registro")
        self.resizable(0, 0)

        self.dataFrame = DataFrame(self, lambda: self.showFrame('register'))
        self.dataFrame.grid(row=0, column=0, sticky='NSEW')

        self.mainFrame = MainFrame(self, lambda: self.showFrame('data'))
        self.mainFrame.grid(row=0, column=0, sticky='NSEW')

        self.frames = dict()
        self.frames['register'] = self.mainFrame
        self.frames['data'] = self.dataFrame
Example #15
    def query_measurements_original(self, field_query, begin_time, end_time):
        """
        Query for epidata measurements.

        Parameters
        ----------
        field_query : dictionary containing either strings or lists of strings
            A dictionary containing field names and the values those fields must
            contain in matching measurements. Some system configurations require
            that values of specific fields be specified. A string field value
            represents an equality match, while a list value represents set
            membership (all values within the set are matched).
        begin_time : datetime
            Beginning of the time interval to query, inclusive.
        end_time : datetime
            End of the time interval to query, exclusive.

        Returns
        -------
        result : epidata DataFrame
            A DataFrame containing measurements matching the query.
        """
        self._check_cluster_memory()

        java_field_query, java_begin_time, java_end_time = self._to_java_params(
            field_query, begin_time, end_time)

        java_data_frame = self._jec.query(java_field_query, java_begin_time,
                                          java_end_time)
        return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx)
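A call shaped like this docstring might look as follows; the field names, values, and the ec context object are hypothetical, shown only to illustrate the string-equality versus list-membership convention:

from datetime import datetime, timedelta

begin_time = datetime(2018, 1, 1)
end_time = begin_time + timedelta(days=1)

# ec: an EpidataContext instance, assumed to already exist.
# 'company' must equal 'ACME'; 'site' may match any value in the list.
df = ec.query_measurements_original(
    {'company': 'ACME', 'site': ['site-1', 'site-2']},
    begin_time,
    end_time,
)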
Example #16
    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'aisle_id',
            'department_id',
            'eval_set',
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'order_number_history',
            'num_products_from_aisle_history',
            'history_length',
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #17
class DataReader(object):
    def __init__(self, data_dir):

        data_cols = ['x', 'x_len', 'c', 'c_len']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)))
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):

        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    mode='train')

    def val_batch_generator(self, batch_size):

        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    mode='val')

    def test_batch_generator(self, batch_size):

        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    mode='test')

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        mode='train'):

        gen = df.batch_generator(batch_size=batch_size,
                                 shuffle=shuffle,
                                 num_epochs=num_epochs,
                                 allow_smaller_final_batch=(mode == 'test'))
        for batch in gen:
            batch['x_len'] = batch['x_len'] - 1
            max_x_len = np.max(batch['x_len'])
            max_c_len = np.max(batch['c_len'])
            batch['y'] = batch['x'][:, 1:max_x_len + 1, :]
            batch['x'] = batch['x'][:, :max_x_len, :]
            batch['c'] = batch['c'][:, :max_c_len]
            yield batch
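The tail of this generator builds next-step (teacher-forcing) targets: y at step t is x at step t + 1, so x_len shrinks by one and both tensors are truncated to the longest remaining sequence. A toy check of the slicing, with made-up shapes:

import numpy as np

x = np.arange(10, dtype=float).reshape(1, 5, 2)  # one sequence: 5 steps, 2 features
x_len = np.array([5]) - 1                        # 4 usable input steps
max_x_len = np.max(x_len)                        # 4
y = x[:, 1:max_x_len + 1, :]                     # steps 1..4 become targets
x_in = x[:, :max_x_len, :]                       # steps 0..3 remain inputs
print(x_in.shape, y.shape)                       # (1, 4, 2) (1, 4, 2)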
Example #18
    def __init__(self, data_dir):
        data_cols = ['x', 'y']
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]

        df = DataFrame(columns=data_cols, data=data)

        self.train_df, self.val_df = df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))

        self.num_products = df['x'].max() + 1
        self.product_dist = np.bincount(self.train_df['x']).tolist()
Example #19
def read_dataframe_from_csv(path):
    # Read inside a context manager so the file descriptor is closed.
    with open(path) as dataset_handle:
        reader = csv.reader(dataset_handle)
        headers = next(reader)
        rows = list(reader)  # materialize rows before the file closes
    return DataFrame(headers, rows)
Example #20
    def __init__(self, data_dir):
        data_cols = ['x', 'x_len', 'c', 'c_len']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #21
    def __init__(self, data_dir):
        data_cols = [
            'data',
            'isNAN',
            'page_id',
            'project',
            'access',
            'agent',
            'test_data',
            'test_isNAN'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.testDataframe = DataFrame(columns=data_cols, data=data)
        self.trainDataframe, self.valDataframe = self.testDataframe.train_test_split(train_size=0.95)

        print('train size', len(self.trainDataframe))
        print('val size', len(self.valDataframe))
        print('test size', len(self.testDataframe))
Example #22
    def query_measurements_cleansed(self, field_query, begin_time, end_time):

        self._check_cluster_memory()

        java_field_query, java_begin_time, java_end_time = self._to_java_params(
            field_query, begin_time, end_time)

        java_data_frame = self._jec.queryMeasurementCleansed(
            java_field_query, java_begin_time, java_end_time)
        return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx)
Example #23
    def __init__(self, data_dir):
        data_cols = [
            'x_raw',
            'onpromotion',
            'id',
            'x',
            'store_nbr',
            'item_nbr',
            'city',
            'state',
            'type',
            'cluster',
            'family',
            'class',
            'perishable',
            'is_discrete',
            'start_date',
            'x_lags',
            'xy_lags',
            'ts',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95)

        self.num_city = self.test_df['city'].max() + 1
        self.num_state = self.test_df['state'].max() + 1
        self.num_type = self.test_df['type'].max() + 1
        self.num_cluster = self.test_df['cluster'].max() + 1
        self.num_family = self.test_df['family'].max() + 1
        self.num_item_class = self.test_df['class'].max() + 1
        self.num_perishable = self.test_df['perishable'].max() + 1
        self.num_store_nbr = self.test_df['store_nbr'].max() + 1
        self.num_item_nbr = self.test_df['item_nbr'].max() + 1

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #24
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = ['x', 'x_len', 'c', 'c_len']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train'
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val'
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            mode='test'
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, mode='train'):
        gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test')
        )
        for batch in gen:
            batch['x_len'] = batch['x_len'] - 1
            max_x_len = np.max(batch['x_len'])
            max_c_len = np.max(batch['c_len'])
            batch['y'] = batch['x'][:, 1:max_x_len + 1, :]
            batch['x'] = batch['x'][:, :max_x_len, :]
            batch['c'] = batch['c'][:, :max_c_len]
            yield batch
Example #25
    def __init__(self, data_dir):
        data_cols = [
            'data',
            'is_nan',
            'page_id',
            'project',
            'access',
            'agent',
            'test_data',
            'test_is_nan'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]
        # Build the raw data into a DataFrame (145063 rows)
        self.test_df = DataFrame(columns=data_cols, data=data)
        # Row-wise split: 137809 train / 7254 val
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #26
    def list_keys(self):
        """
        List the epidata measurement keys.

        Returns
        -------
        result : epidata DataFrame
            A DataFrame containing values of the principal fields used for
            classifying measurements.
        """
        self._check_cluster_memory()
        return DataFrame(jdf=self._jec.listKeys(), sql_ctx=self._sql_ctx)
Example #27
def main(args):

    # set memory growth to true to fix potential memory issues
    physical_devices = tf.config.list_physical_devices('GPU') 
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # get data object
    data = DataFrame(
            path=args.dir, 
            n_symbols_in_captcha=args.captcha_size, 
            use_lowercase=args.use_lowercase, 
            use_uppercase=args.use_uppercase, 
            use_numbers=args.use_numbers
        )

    # get data split
    (X_train, t_train), (X_test, t_test) = data.get_data(args.test_size)

    model = None
    if os.path.exists(args.model_save):
        # load previously trained model if path exists
        model = tf.keras.models.load_model(args.model_save)
    else:
        # otherwise, initialize a new network and train it from scratch
        model = ResNet18(n_classes=(data.get_num_symbols() * args.captcha_size), data_format='channels_last')

        # compile network with the given optimizer
        model.compile(loss='binary_crossentropy', optimizer=args.optm, metrics=["accuracy"])

        # train network
        model.fit(X_train, t_train, batch_size=args.batch_size, epochs=args.epochs, verbose=1)

        # dump trained model to file system
        model.save(args.model_save)

    assert model is not None

    # evaluate performance
    score = model.evaluate(X_test, t_test, verbose=1)
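One caveat with the snippet above: set_memory_growth must run before the GPU is initialized, and physical_devices[0] raises IndexError on a CPU-only machine. A guarded variant using the same TensorFlow APIs:

import tensorflow as tf

# Enable memory growth only for GPUs that are actually present.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)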
Example #28
    def __init__(self, data_dir):
        data_cols = [
            'x',
            'store_id',
            'item_id',
            'state_id',
            'dept_id',
            'cat_id',
            'wday',
            'month',
            'event_name_1',
            'event_type_1',
            'snap',
            'x_lags',
            'xy_lags',
            'ts',
            'sell_price',
            'sell_price_first_digit',  # to try
            'sell_price_last_digit',
            'start_date',
            'weights',
            'hierarchy_data',
            'all_id'
        ]

        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        #         self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95)

        self.num_store = self.test_df['store_id'].max() + 1
        self.num_item = self.test_df['item_id'].max() + 1
        self.num_state = self.test_df['state_id'].max() + 1
        self.num_dept = self.test_df['dept_id'].max() + 1
        self.num_cat = self.test_df['cat_id'].max() + 1
        self.num_wday = self.test_df['wday'].max() + 1
        self.num_month = self.test_df['month'].max() + 1
        self.num_event_name_1 = self.test_df['event_name_1'].max() + 1
        self.num_event_type_1 = self.test_df['event_type_1'].max() + 1

        #         print('train size', len(self.train_df))
        #         print('val size', len(self.val_df))
        print('test size', len(self.test_df))
Example #29
def main():

    parser = argparse.ArgumentParser(
        description='Converts npz files to actual training libraries')
    parser.add_argument('dataset', help='The dataset folder.')
    args = parser.parse_args()
    print(args)

    data_cols = ['x', 'x_len', 'c', 'c_len']
    data = [
        np.load(os.path.join(args.dataset, '{}.npy'.format(i)))
        for i in data_cols
    ]

    dataFrame = DataFrame(columns=data_cols, data=data)
    dataDrawer = DataDrawer(dataFrame)

    dataDrawer.run()
Example #30
def IMR(measurements, meas_names=None):
    """
    Perform IMR analysis on a DataFrame of measurements. The measurements are
    grouped by the 'meas_name' field and IMR is performed on each group.

    Parameters
    ----------
    measurements : epidata DataFrame
        A DataFrame containing measurements, as returned by EpidataContext.query.
    meas_names : list of strings, or string or None, default None
        A list of measurement names on which to perform IMR, or a single
        measurement name on which to perform IMR. If None, all measurements will
        be analyzed.

    Returns
    -------
    result : epidata DataFrame
        A copy of the measurements DataFrame, with the IMR results appended as
        additional columns.
    """

    from context import ec

    local = not isinstance(measurements, DataFrame)
    if local:
        raise ValueError('Unsupported local measurements argument to IMR.')

    if isinstance(meas_names, str):  # str replaces Python 2's basestring
        # Filter a single string measurement name.
        measurements = measurements.filter(
            measurements.meas_name == meas_names)
    elif meas_names:
        # Build a composite filter for a list of measurement names.
        condition = (measurements.meas_name == meas_names[0])
        for name in meas_names[1:]:
            condition = (condition | (measurements.meas_name == name))
        measurements = measurements.filter(condition)

    java_IMR = ec._sc._jvm.com.epidata.spark.analytics.IMR.get()
    jdf = java_IMR.applyToDataFrame(measurements._pdf._jdf)
    return DataFrame(jdf=jdf, sql_ctx=measurements._pdf.sql_ctx)
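The loop above that ORs together one equality filter per measurement name is a textbook fold; the same composite condition can be built with functools.reduce (a sketch with identical semantics, assuming measurements and meas_names are the objects from the function above):

import operator
from functools import reduce

# meas_name == names[0] | meas_name == names[1] | ...
condition = reduce(operator.or_,
                   (measurements.meas_name == name for name in meas_names))
measurements = measurements.filter(condition)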
Example #31
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(batch_size, shuffle=shuffle, num_epochs=num_epochs, allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(batch['order_number_history'], -1, axis=1)
            batch['next_reorder_size'] = np.roll(batch['reorder_size_history'], -1, axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
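The np.roll(..., -1, axis=1) calls shift each user's history one order to the left, so position t lines up with the attributes of order t + 1; the wrapped-around final column is then excluded by shortening history_length. A toy check (values made up):

import numpy as np

reorder_size_history = np.array([[3, 1, 4, 2]])
next_reorder_size = np.roll(reorder_size_history, -1, axis=1)
print(next_reorder_size)  # [[1 4 2 3]] -- last column wrapped, hence length - 1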
Example #32
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        is_test=False):
        batch_gen = df.batch_generator(batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(batch['order_dow_history'],
                                                 -1,
                                                 axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'],
                                                  -1,
                                                  axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            batch['next_reorder_size'] = np.roll(batch['reorder_size_history'],
                                                 -1,
                                                 axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
Example #33
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'data', 'is_nan', 'page_id', 'project', 'access', 'agent',
            'test_data', 'test_is_nan'
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)))
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=True,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        is_test=False):
        batch_gen = df.batch_generator(batch_size=batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        data_col = 'test_data' if is_test else 'data'
        is_nan_col = 'test_is_nan' if is_test else 'is_nan'
        for batch in batch_gen:
            num_decode_steps = 64
            full_seq_len = batch[data_col].shape[1]
            max_encode_length = full_seq_len - num_decode_steps if not is_test else full_seq_len

            x_encode = np.zeros([len(batch), max_encode_length])
            y_decode = np.zeros([len(batch), num_decode_steps])
            is_nan_encode = np.zeros([len(batch), max_encode_length])
            is_nan_decode = np.zeros([len(batch), num_decode_steps])
            encode_len = np.zeros([len(batch)])
            decode_len = np.zeros([len(batch)])

            for i, (seq, nan_seq) in enumerate(
                    zip(batch[data_col], batch[is_nan_col])):
                rand_len = np.random.randint(max_encode_length - 365 + 1,
                                             max_encode_length + 1)
                x_encode_len = max_encode_length if is_test else rand_len
                x_encode[i, :x_encode_len] = seq[:x_encode_len]
                is_nan_encode[i, :x_encode_len] = nan_seq[:x_encode_len]
                encode_len[i] = x_encode_len
                decode_len[i] = num_decode_steps
                if not is_test:
                    y_decode[i, :] = seq[x_encode_len:x_encode_len +
                                         num_decode_steps]
                    is_nan_decode[i, :] = nan_seq[x_encode_len:x_encode_len +
                                                  num_decode_steps]

            batch['x_encode'] = x_encode
            batch['encode_len'] = encode_len
            batch['y_decode'] = y_decode
            batch['decode_len'] = decode_len
            batch['is_nan_encode'] = is_nan_encode
            batch['is_nan_decode'] = is_nan_decode

            yield batch
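Each training row here gets a random encoder cut drawn from the last 365 valid positions, so the 64-step decoder window slides over roughly the final year of history; at test time the full series is encoded and the targets are left empty. The index arithmetic on made-up numbers:

import numpy as np

full_seq_len = 550                                   # hypothetical series length
num_decode_steps = 64
max_encode_length = full_seq_len - num_decode_steps  # 486

rng = np.random.RandomState(0)
x_encode_len = rng.randint(max_encode_length - 365 + 1, max_encode_length + 1)
# Encoder input: seq[:x_encode_len]; decoder target: the following 64 values.
print(x_encode_len, x_encode_len + num_decode_steps)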
Example #34
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'x_raw',
            'holidayinfo',
            'air_store_id2',
            'id',
            'x',
            'air_area_name',
            'air_genre_name',
            'latitude',
            'longitude',
            'start_date',
            'x_lags',
            'xy_lags',
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        self.num_area_name = self.test_df['air_area_name'].max() + 1
        self.num_genre_name = self.test_df['air_genre_name'].max() + 1
        self.num_store_id = self.test_df['air_store_id2'].max() + 1

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=1000,
                                    mode='train')

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=1000,
                                    mode='val')

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=True,
                                    num_epochs=1,
                                    mode='test')

    def batch_generator(self,
                        batch_size,
                        df,
                        mode,
                        shuffle=True,
                        num_epochs=1000):
        batch_gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test'))
        for batch in batch_gen:
            num_decode_steps = 39
            full_seq_len = batch['x'].shape[1] - num_decode_steps
            max_encode_length = full_seq_len

            x = np.zeros([len(batch), max_encode_length])
            y = np.zeros([len(batch), num_decode_steps])
            x_raw = np.zeros([len(batch), max_encode_length])
            x_lags = np.zeros([
                len(batch), max_encode_length,
                batch['x_lags'].shape[2] + batch['xy_lags'].shape[2]
            ])
            y_lags = np.zeros(
                [len(batch), num_decode_steps, batch['xy_lags'].shape[2]])
            x_holi = np.zeros([len(batch), max_encode_length])
            y_holi = np.zeros([len(batch), num_decode_steps])
            x_len = np.zeros([len(batch)])
            y_len = np.zeros([len(batch)])
            x_idx = np.zeros([len(batch), max_encode_length])
            y_idx = np.zeros([len(batch), num_decode_steps])
            y_id = np.zeros([len(batch), num_decode_steps])
            #x_ts = np.zeros([len(batch), max_encode_length, batch['ts'].shape[2]])
            weights = np.zeros([len(batch)])
            weights[:] = 1.0

            for i, (data, data_raw, start_idx, x_lag, xy_lag, holi,
                    uid) in enumerate(
                        zip(batch['x'], batch['x_raw'], batch['start_date'],
                            batch['x_lags'], batch['xy_lags'],
                            batch['holidayinfo'], batch['id'])):
                seq_len = full_seq_len - start_idx
                val_window = 180
                train_window = 180

                if mode == 'train':
                    if seq_len == 0:
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= train_window:
                        rand_encode_len = np.random.randint(0, seq_len)
                    else:
                        rand_encode_len = np.random.randint(
                            seq_len - train_window, seq_len)
                    rand_decode_len = min(seq_len - rand_encode_len,
                                          num_decode_steps)

                elif mode == 'val':
                    if seq_len <= num_decode_steps:
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= val_window + num_decode_steps:
                        rand_encode_len = np.random.randint(
                            0, seq_len - num_decode_steps + 1)
                    else:
                        rand_encode_len = np.random.randint(
                            seq_len - (val_window + num_decode_steps),
                            seq_len - num_decode_steps + 1)
                    rand_decode_len = min(seq_len - rand_encode_len,
                                          num_decode_steps)

                elif mode == 'test':
                    rand_encode_len = seq_len
                    rand_decode_len = num_decode_steps

                end_idx = start_idx + rand_encode_len

                x[i, :rand_encode_len] = data[start_idx:end_idx]
                y[i, :rand_decode_len] = data[end_idx:end_idx +
                                              rand_decode_len]
                x_raw[i, :rand_encode_len] = data_raw[start_idx:end_idx]

                x_lags[i, :rand_encode_len, :x_lag.shape[1]] = x_lag[
                    start_idx:end_idx, :]
                x_lags[i, :rand_encode_len,
                       x_lag.shape[1]:] = xy_lag[start_idx:end_idx, :]
                y_lags[i, :rand_decode_len, :] = xy_lag[end_idx:end_idx +
                                                        rand_decode_len, :]

                x_holi[i, :rand_encode_len] = holi[start_idx:end_idx]
                y_holi[i, :rand_decode_len] = holi[end_idx:end_idx +
                                                   rand_decode_len]
                x_idx[i, :rand_encode_len] = np.floor(
                    np.log(np.arange(rand_encode_len) + 1))
                y_idx[i, :rand_decode_len] = np.floor(
                    np.log(
                        np.arange(rand_encode_len, rand_encode_len +
                                  rand_decode_len) + 1))
                y_id[i, :rand_decode_len] = uid[end_idx:end_idx +
                                                rand_decode_len]
                x_len[i] = end_idx - start_idx
                y_len[i] = rand_decode_len

            batch['x_'] = batch['x']
            batch['x'] = x
            batch['y'] = y
            batch['x_raw'] = x_raw
            batch['x_lags'] = x_lags
            batch['y_lags'] = y_lags
            batch['x_holi'] = x_holi
            batch['y_holi'] = y_holi
            # batch['x_ts'] = x_ts
            batch['x_idx'] = x_idx
            batch['y_idx'] = y_idx
            batch['y_id'] = y_id
            batch['x_len'] = x_len

            batch['y_len'] = y_len
            # batch['item_class'] = batch['class']
            batch['weights'] = weights

            yield batch
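The x_idx and y_idx features above bucket sequence positions on a log scale, so the first few steps get distinct buckets while later positions share increasingly wide ones. A quick check of the bucketing for positions 0..9:

import numpy as np

pos = np.arange(10)
print(np.floor(np.log(pos + 1)))  # [0. 0. 1. 1. 1. 1. 1. 2. 2. 2.]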
Example #35
class DataReader(object):
    def __init__(self, data_dir):
        data_cols = [
            'user_id', 'product_id', 'aisle_id', 'department_id',
            'is_ordered_history', 'index_in_order_history',
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_size_history',
            'reorder_size_history', 'order_is_weekend_history',
            'order_part_of_day_history', 'order_number_history',
            'history_length', 'product_name', 'product_name_length',
            'eval_set', 'label'
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r')
            for i in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self,
                        batch_size,
                        df,
                        shuffle=True,
                        num_epochs=10000,
                        is_test=False):
        batch_gen = df.batch_generator(batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(batch['order_dow_history'],
                                                 -1,
                                                 axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'],
                                                  -1,
                                                  axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_is_weekend_history'] = np.roll(
                batch['order_is_weekend_history'], -1, axis=1)
            batch['order_part_of_day_history'] = np.roll(
                batch['order_part_of_day_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'],
                                               -1,
                                               axis=1)
            batch['is_none'] = batch['product_id'] == 0
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
Example #36
class DataReader(object):

    def __init__(self, data_dir, output_dir, **kwargs):
        _data_cols = ['x', 'x_len', 'c', 'c_len', 'text']
        data_cols = []

        # Keep only the columns whose .npy files exist on disk
        for i in _data_cols:
            if os.path.exists(os.path.join(data_dir, '{}.npy'.format(i))):
                data_cols.append(i)

        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]

        self.test_df = DataFrame(columns=data_cols, data=data, **kwargs)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018)
        date_str = datetime.now().strftime('%Y-%m-%d_%H-%M')

        np.save(Path(output_dir) / date_str, self.val_df.dict)
        # Load the strokes
        # np.load("../checkpoints/original/2020-03-13_15-03.npy", allow_pickle=True).item()["x"]
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))


    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train'
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val'
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            mode='test'
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, mode='train'):
        gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test')
        )

        for batch in gen:
            batch['x_len'] = batch['x_len'] - 1
            max_x_len = np.max(batch['x_len'])
            max_c_len = np.max(batch['c_len'])
            batch['y'] = batch['x'][:, 1:max_x_len + 1, :]
            batch['x'] = batch['x'][:, :max_x_len, :]
            batch['c'] = batch['c'][:, :max_c_len]
            yield batch