Example #1
    def read_and_save_features(
        self,
        train_table_name: str,
        test_table_name: str,
        train_output_path: str,
        test_output_path: str,
    ) -> None:
        df_train_input = self._read_from_bigquery(train_table_name)
        df_test_input = self._read_from_bigquery(test_table_name)
        df_train_features, df_test_features = self.make_features(
            df_train_input, df_test_input)
        assert (df_train_input.shape[0] == df_train_features.shape[0]
                ), "generated train features do not match the input table row count"
        assert (df_test_input.shape[0] == df_test_features.shape[0]
                ), "generated test features do not match the input table row count"
        df_train_features.columns = f"{self.name}_" + df_train_features.columns
        df_test_features.columns = f"{self.name}_" + df_test_features.columns

        if self.save_memory:
            self._logger.info("Reduce memory size - train data")
            df_train_features = reduce_mem_usage(df_train_features)
            self._logger.info("Reduce memory size - test data")
            df_test_features = reduce_mem_usage(df_test_features)

        self._logger.info(f"Saving features to {train_output_path}")
        df_train_features.to_feather(train_output_path)
        self._logger.info(f"Saving features to {test_output_path}")
        df_test_features.to_feather(test_output_path)
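Note: none of the examples on this page define reduce_mem_usage itself. Below is a minimal sketch of the numeric-downcasting helper these Kaggle kernels typically share; the name matches the calls above, but the dtype ladder and print format are assumptions, and some variants (see Examples #8, #14, and #29, which unpack two return values) also return a second value such as the list of affected columns.

import numpy as np
import pandas as pd

def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    # downcast each numeric column to the smallest dtype that can hold its value range
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if pd.api.types.is_integer_dtype(df[col]):
            for t in (np.int8, np.int16, np.int32, np.int64):
                if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max:
                    df[col] = df[col].astype(t)
                    break
        else:
            # some variants also try float16, at the cost of precision
            df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Mem. usage decreased from {:.2f} to {:.2f} MB'.format(start_mem, end_mem))
    return df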
Example #2
def read_data():
    print('\n\nRunning read_data')
    calendar_df = pd.read_csv('./input/calendar.csv') #date, wm_yr_wk, weekday, wday, month, year, d, event_name_1, event_type_1, snap_CA, snap_TX, snap_WI 
    calendar_df = reduce_mem_usage(calendar_df)
    print('Calendar has {} rows and {} columns'.format(calendar_df.shape[0], calendar_df.shape[1]))

    # 'id' is the only unique key; ~30k rows
    # columns: 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1' ... 'd_1941'
    sales_train_evaluation_df = pd.read_csv('./input/sales_train_evaluation.csv')
    print('Sales train evaluation has {} rows and {} columns'.format(sales_train_evaluation_df.shape[0], sales_train_evaluation_df.shape[1]))

    # no unique key; ~6M rows
    sell_prices_df = pd.read_csv('./input/sell_prices.csv')  #store_id, item_id, wm_yr_wk, sell_price
    sell_prices_df = reduce_mem_usage(sell_prices_df)
    print('Sell prices has {} rows and {} columns'.format(sell_prices_df.shape[0], sell_prices_df.shape[1]))

    submission_df = pd.read_csv('./input/sample_submission.csv')

    calendar_df = encode_categorical(calendar_df, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]).pipe(reduce_mem_usage)
    sales_train_evaluation_df = encode_categorical(sales_train_evaluation_df, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]).pipe(reduce_mem_usage)
    sell_prices_df = encode_categorical(sell_prices_df, ["item_id", "store_id"]).pipe(reduce_mem_usage)

    # PICKLES
    calendar_df.to_pickle('./data/calendar_df.pkl.compress', compression="gzip")
    sales_train_evaluation_df.to_pickle('./data/sales_train_evaluation_df.pkl.compress', compression="gzip")
    sell_prices_df.to_pickle('./data/sell_prices_df.pkl.compress', compression="gzip")
    return calendar_df, sell_prices_df, sales_train_evaluation_df, submission_df
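encode_categorical is another shared helper that is not shown on this page. A plausible sketch, assuming the common M5 variant that label-encodes only the non-null values (the exact utility used above may differ):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_categorical(df, cols):
    for col in cols:
        # LabelEncoder cannot handle NaN, so fit/transform only the non-null entries
        le = LabelEncoder()
        not_null = df[col][df[col].notnull()]
        df[col] = pd.Series(le.fit_transform(not_null), index=not_null.index)
    return df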
Example #3
def extract_features(train_df):

    identity_df = pd.read_csv(config.DATA_PATH + 'train_identity.csv')

    target = train_df['isFraud']

    train_df.drop(['isFraud'], axis=1, inplace=True)

    train_df = train_df.merge(identity_df, on='TransactionID', how='left')

    # the helpers below appear to mutate train_df in place (results are not reassigned)
    handl_P_emaildomain(train_df)

    handle_NaN(train_df)

    transfer_cat_2_int(train_df)

    drop_corr_column(train_df)

    reduce_mem_usage(train_df)

    #  del train, df
    #  gc.collect()
    #  return X_train, y_train, test

    X = train_df  #.to_numpy()
    y = target  #.to_numpy()
    return X, y
Example #4
def fe(df, path):

    # time delta (note: .dt.seconds keeps only the seconds-of-day component;
    # .dt.total_seconds() would also include the day part of the delta)
    df['AvSigVersion-m-Census_OSVersion'] = (df['AvSigVersion'] -
                                             df['Census_OSVersion']).dt.seconds
    df['Census_OSVersion-m-OsBuildLab'] = (df['Census_OSVersion'] -
                                           df['OsBuildLab']).dt.seconds

    # min max
    col = ['AvSigVersion', 'Census_OSVersion', 'OsBuildLab']
    df['date_min'] = df[col].min(1)
    df['date_max'] = df[col].max(1)
    df['date_max-m-min'] = (df['date_max'] - df['date_min']).dt.seconds

    # from max
    d_max = df[col].max().max()
    df['max-m-AvSigVersion'] = (d_max - df['AvSigVersion']).dt.seconds
    df['max-m-Census_OSVersion'] = (d_max - df['Census_OSVersion']).dt.seconds
    df['max-m-OsBuildLab'] = (d_max - df['OsBuildLab']).dt.seconds

    # save dt
    df[['AvSigVersion', 'Census_OSVersion', 'OsBuildLab']].to_feather(path)

    # into int64 and rank
    for c in [
            'AvSigVersion', 'Census_OSVersion', 'OsBuildLab', 'date_min',
            'date_max'
    ]:
        df[c] = df[c].astype(np.int64) // 10**9
        df.loc[df[c] < 0, c] = np.nan
        df[c] = df[c].rank(pct=True)

    utils.reduce_mem_usage(df)

    return
Example #5
def multi(args):

    c, outpath_tr, outpath_te = args

    tr_f = pd.DataFrame(index=tr.index)
    te_f = pd.DataFrame(index=te.index)

    # counts (train-only map applied to train, train+test map applied to test)
    di_tr = frequency_encoding(c, False, True)
    di_trte = frequency_encoding(c, True, True)
    tr_f[c + '_ts'] = tr[c].map(lambda x: di_tr.get(x, np.nan))
    te_f[c + '_ts'] = te[c].map(lambda x: di_trte.get(x, np.nan))

    # same, but counting NaN as well
    di_tr = frequency_encoding(c, False, False)
    di_trte = frequency_encoding(c, True, False)
    tr_f[c + '_ts_na'] = tr[c].map(lambda x: di_tr.get(x, np.nan))
    te_f[c + '_ts_na'] = te[c].map(lambda x: di_trte.get(x, np.nan))

    utils.reduce_mem_usage(tr_f)
    utils.reduce_mem_usage(te_f)

    # output
    tr_f.add_prefix(PREF + '_').to_feather(outpath_tr)
    te_f.add_prefix(PREF + '_').to_feather(outpath_te)

    return
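frequency_encoding is not defined here either; from the call sites it returns a dict mapping each value of column c to its occurrence count. A sketch under assumed flag semantics (second flag: also count the test rows; third flag: drop NaN before counting) — both flag meanings are guesses from the comments above:

import pandas as pd

# tr and te are the module-level train/test DataFrames used by multi()
def frequency_encoding(col, use_test, dropna):
    s = tr[col]
    if use_test:
        s = pd.concat([s, te[col]], ignore_index=True)
    return s.value_counts(dropna=dropna).to_dict()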
Example #6
    def transform(self, df):
        with Timer('features.FeatureGenerator.transform',
                   verbose=self.verbose):
            # Hand Written Features
            simple_feature_generator = SimpleFeatureGenerator(
                numeric=self.numeric, verbose=self.verbose)
            df_features = pd.concat(
                [df, simple_feature_generator.fit_transform(df)], axis=1)
            df_features = reduce_mem_usage(df_features)

            # 1-st level
            features = self.numeric + simple_feature_generator.get_feature_names()
            group_feature_generator = GroupAggregatedFeatureGenerator(
                features, verbose=self.verbose)
            df_features = pd.concat(
                [df_features, group_feature_generator.fit_transform(df_features)],
                axis=1)
            df_features = reduce_mem_usage(df_features)

            if self.created_features is None:
                # keep only the newly created columns (those not already in df)
                self.created_features = [
                    col for col in df_features.columns if col not in df.columns
                ]
            else:
                # TODO: test
                pass
            return df_features
Example #7
def multi(args):

    c, outpath_tr, outpath_te = args

    tr_f = pd.DataFrame(index=tr.index)
    te_f = pd.DataFrame(index=te.index)

    di = frequency_encoding(c, False, False)
    tr_f[c + '_trte_00'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_00'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, False, True)
    tr_f[c + '_trte_01'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_01'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, True, False)
    tr_f[c + '_trte_10'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_10'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, True, True)
    tr_f[c + '_trte_11'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_11'] = te[c].map(lambda x: di.get(x, np.nan))

    utils.reduce_mem_usage(tr_f)
    utils.reduce_mem_usage(te_f)

    # output
    tr_f.add_prefix(PREF + '_').to_feather(outpath_tr)
    te_f.add_prefix(PREF + '_').to_feather(outpath_te)

    return
Example #8
def gen_level_aggs(col, updata=False):
    feat_path = os.path.join(feats_root,'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        
        level_agg = None
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                if da_agg is None:
                    da_agg = agg
                else:
                    da_agg = da_agg.merge(agg, how='outer')
            if level_agg is None:
                level_agg = da_agg
            else: 
                level_agg = pd.concat([level_agg, da_agg], axis=0)
                level_agg.fillna(0, inplace=True)
                level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        level_agg, _ = reduce_mem_usage(level_agg)
        dump_pickle(level_agg, feat_path)
Example #9
def data_loader():
    train = pd.read_csv("data/")
    test = pd.read_csv("data/")
    train2 = utils.reduce_mem_usage(train)
    train2.to_csv("data/train2.csv")
    test2 = utils.reduce_mem_usage(test)
    test2.to_csv("data/test2.csv")
    exit()  # NOTE: the return below is unreachable; this run only writes the reduced CSVs
    return train, test
Example #10
def fe(df):
    df['AvSigVersion'] = (df['AvSigVersion'] - date_min).dt.days
    df['AvSigVersion'] = df['AvSigVersion'] // 7
    df.rename(columns={'AvSigVersion': 'key'}, inplace=True)

    df = pd.merge(df[['key']], report, on='key', how='left')
    del df['key']
    df = df.rank(pct=True)
    utils.reduce_mem_usage(df)

    return df
Example #11
def multi_te(args):

    cat, outpath = args

    tbl = te.groupby(cat).agg(num_agg)
    tbl.columns = [f'{"-".join(cat)}_{i}_{j}' for i, j in tbl.columns]
    tbl.reset_index(inplace=True)

    # cat is a list of columns, so select with te[cat] and drop the key columns after the merge
    te_f = pd.merge(te[cat], tbl, on=cat, how='left')
    te_f.drop(columns=cat, inplace=True)

    utils.reduce_mem_usage(te_f)
    te_f.add_prefix(PREF + '_').to_feather(outpath)

    return
Example #12
def load_data(file_name,
              directory='../input/',
              sample_size=None,
              normilize_names=True):
    """
    Load data from a .csv file.
    Transform column names from CamelCase to _underscore notation.
    :param file_name: file name
    :param directory: path to the directory with the file
    :param sample_size: number of rows to read (None reads all)
    :param normilize_names: convert CamelCase column names to underscore
    :return: DataFrame
    """
    if file_name.startswith('train'):
        full_file_name = 'train_V2.csv'
    elif file_name.startswith('test'):
        full_file_name = 'test_V2.csv'
    elif 'sub' in file_name:
        full_file_name = 'sample_submission_V2.csv'
    else:
        full_file_name = file_name
    with Timer('Data Loading:'):
        df = pd.read_csv(os.path.join(directory, full_file_name),
                         nrows=sample_size)
        df = reduce_mem_usage(df)
        gc.collect()
        if normilize_names:
            df.columns = [
                camelcase_to_underscore(col) for col in df.columns
            ]
    return df
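load_data depends on a camelcase_to_underscore helper that is not shown. A minimal regex-based sketch (an assumption, not necessarily the original utility; the example name is illustrative):

import re

def camelcase_to_underscore(name):
    # e.g. 'winPlacePerc' -> 'win_place_perc'
    s = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s).lower()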
Example #13
        def transform(self, df):
            """
            Used to test/submit
            :param df: DataFrame
            :return: DataFrame
            """
            with Timer('preprocessing.Preprocessor.transform',
                       verbose=self.verbose):
                # Drop ID and Categorical columns
                to_drop = [
                    col for col in df.columns
                    if col in self.id + [self.target] + self.categorical
                ]
                x = df.drop(to_drop, axis=1).copy()

                # # Feature Selection
                # non_selected = [col for col in x.columns if col not in self.SELECTED_FEATURES]
                # x.drop(non_selected, axis=1, inplace=True)

                # Fill missing values
                x.fillna(0, inplace=True)

                # # Normalize
                # x = x.astype(np.float64)
                # # x = pd.DataFrame(self.scaler.transform(x), columns=[
                # #     col for col in self.features if col in self.SELECTED_FEATURES])
                # x = pd.DataFrame(self.scaler.transform(x), columns=self.features)
                x = reduce_mem_usage(x)
                return x
Example #14
def gen_dfal():
    dump_nominal_file = os.path.join(utils.cache_root, 'dfda_nominal.pkl')
    dump_textual_file = os.path.join(utils.cache_root, 'dfda_textual.pkl')
    if not os.path.exists(dump_nominal_file):
        tr = pd.read_csv('./input/round1_ijcai_18_train_20180301.txt',
                         sep=' ',
                         dtype={'is_trade': np.uint8})
        tr.is_trade = tr.is_trade.astype(np.int8)
        te = pd.read_csv('./input/round1_ijcai_18_test_b_20180418.txt',
                         sep=' ')
        da = pd.concat([tr, te], axis=0)
        da = utils.add_time_fields(da)

        for col in utils.nominal_cate_cols + utils.identity_cols:
            da[col] = LabelEncoder().fit_transform(da[col])

        for col in utils.ordinal_cate_cols:
            levels = sorted(da[col].unique())
            da[col] = da[col].apply(lambda x: levels.index(x)).astype(np.uint8)

        del da['context_id']
        del da['context_timestamp']
        del da['ts']
        da, _ = utils.reduce_mem_usage(da)
        utils.dump_pickle(da[utils.textual_cols], dump_textual_file)
        utils.dump_pickle(da.drop(utils.textual_cols, axis=1),
                          dump_nominal_file)
    print('gen dfal ok.')
Example #15
    def make_features(self, df_train_input, df_test_input):
        df_train_features = pd.DataFrame()
        df_test_features = pd.DataFrame()

        df_train_input = reduce_mem_usage(df_train_input)
        df_train_input["timestamp"] = pd.to_datetime(
            df_train_input["timestamp"], unit="s")

        # 2020-02-06 00:00 ~ 2020-02-12 23:59
        df_train_input["timestamp_bin"] = -9999
        df_train_input.loc[
            df_train_input["timestamp"] <= "2020-02-08 08:00:00",
            "timestamp_bin"] = 0
        df_train_input.loc[
            (df_train_input["timestamp"] > "2020-02-08 08:00:00") &
            (df_train_input["timestamp"] <= "2020-02-10 16:00:00"),
            "timestamp_bin"] = 1
        df_train_input.loc[df_train_input["timestamp"] > "2020-02-10 16:00:00",
                           "timestamp_bin"] = 2

        print(df_train_input["timestamp_bin"].value_counts().sort_index())

        val_position = np.zeros(len(df_train_input)).astype(np.int8)
        for i_fold, bin_number in enumerate([0, 1, 2]):
            is_trn = df_train_input["timestamp_bin"] != bin_number
            is_val = df_train_input["timestamp_bin"] == bin_number
            trn_idx = df_train_input[is_trn].index
            val_idx = df_train_input[is_val].index
            val_position[val_idx] = i_fold
            print(f"{i_fold}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

        df_train_features["val_position"] = val_position
        print(df_train_features["val_position"].value_counts().sort_index())

        return df_train_features, df_test_features
Example #16
def read_data():
    print('Reading files...')
    calendar = pd.read_csv('./m5-forecasting-accuracy/calendar.csv')
    calendar = reduce_mem_usage(calendar)
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0],
                                                       calendar.shape[1]))
    sell_prices = pd.read_csv('./m5-forecasting-accuracy/sell_prices.csv')
    sell_prices = reduce_mem_usage(sell_prices)
    print('Sell prices has {} rows and {} columns'.format(
        sell_prices.shape[0], sell_prices.shape[1]))
    train_data = pd.read_csv(
        './m5-forecasting-accuracy/sales_train_validation.csv')

    print('Sales train validation has {} rows and {} columns'.format(
        train_data.shape[0], train_data.shape[1]))

    return calendar, sell_prices, train_data
Example #17
def melt_and_merge(calendar, sell_prices, sales_train_evaluation, submission, nrows = 55000000, merge = False):
    print('\n\n Running melt and merge\n')
    # melt sales data, get it ready for training
    sales_train_evaluation = pd.melt(sales_train_evaluation, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    print('Melted sales train evaluation has {} rows and {} columns'.format(sales_train_evaluation.shape[0], sales_train_evaluation.shape[1]))
    sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)
    sales_train_evaluation = sales_train_evaluation.iloc[-nrows:,:]
    
    # separate test dataframes
    test1_rows = [row for row in submission['id'] if 'validation' in row]
    test2_rows = [row for row in submission['id'] if 'evaluation' in row]
    test1 = submission[submission['id'].isin(test1_rows)]
    test2 = submission[submission['id'].isin(test2_rows)]
    
    # change column names
    test1.columns = ['id'] + [f'd_{d}' for d in range(1914, 1942)]
    test2.columns = ['id'] + [f'd_{d}' for d in range(1942, 1970)]
    
    # get product table
    product = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()

    # merge with product table
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_evaluation','_validation')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation','_evaluation')
    
    test1 = pd.melt(test1, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    
    sales_train_evaluation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'
    
    data = pd.concat([sales_train_evaluation, test1, test2], axis = 0)
    del sales_train_evaluation, test1, test2
    print(data.shape)
    
    # drop some calendar features
    print(calendar.columns)
    calendar.drop(['weekday'], inplace = True, axis = 1)
    
    # drop the test1 (validation) part; train on 'train' and predict the test2 (evaluation) period
    data = data[data['part'] != 'test1']
    
    if merge:
        # the notebook crashes with the entire dataset (maybe use tensorflow, dask, or pyspark)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['d', 'day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
    else:
        print('Skipped merge (merge=False)')
    gc.collect()
    return data
Example #18
def get_features(features, cfg):
    dfs = [
        pd.read_feather(f'../features/{f}_{cfg.data_type}.feather')
        for f in features if f is not None
    ]
    df = pd.concat(dfs, axis=1)
    if cfg.reduce:
        df = reduce_mem_usage(df)
    return df
Example #19
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk']>=df['release']]

    # make lag features
    df = make_lags(df,28)

    # label encoding
    cols_string = ['item_id','dept_id','cat_id','store_id','state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c] = df[c].replace(-1, np.nan)  # factorize encodes NaN as -1; restore NaN

    # add price features
    df_grouped = df[['id','sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features':df.columns.tolist()}
    to_json(features_json,'../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
Example #20
def main(args):
    """
    
    input_path, output_path = '../data/train.f', f'../data/train_{PREF}.f'
    
    """
    
    input_path, output_path = args
    
    df = pd.read_feather(input_path)[col]
    
    df['Census_PrimaryDiskTotalCapacity-m-Census_SystemVolumeTotalCapacity'] = df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity']
    df['Census_PrimaryDiskTotalCapacity-d-Census_SystemVolumeTotalCapacity'] = df['Census_PrimaryDiskTotalCapacity'] / df['Census_SystemVolumeTotalCapacity']
    
    df['Census_InternalPrimaryDisplayResolution'] = df['Census_InternalPrimaryDisplayResolutionHorizontal'] * df['Census_InternalPrimaryDisplayResolutionVertical']
    
    utils.reduce_mem_usage(df)
    df.add_prefix(PREF+'_').to_feather(output_path)
    
    return
Example #21
def main(args):
    
    input_path, output_path = args
    df = pd.read_csv(input_path, dtype=utils.DTYPES)
    
    di = {'on':0, 'audit':1}
    df['PuaMode'] = df['PuaMode'].map(lambda x: di.get(x, np.nan))
    
    # parse OsBuildLab
    # weird record in test -> 17134.1*amd64fre.rs4_release.180410-1804
    tmp = df['OsBuildLab'].map(lambda x: x.replace('*', '.').split('.') if isinstance(x, str) else [np.nan]*5)
    print(tmp.map(len).describe())
    df['OsBuildLab_major'] = tmp.map(lambda x: x[0]).astype(np.float64)
    df['OsBuildLab_minor'] = tmp.map(lambda x: x[1]).astype(np.float64)
    df['OsBuildLab_build'] = tmp.map(lambda x: x[2])
    df['OsBuildLab_architecture'] = tmp.map(lambda x: x[3])
    df['OsBuildLab_date'] = tmp.map(lambda x: x[4].split('-')[0] if isinstance(x[4], str) else np.nan).astype(np.float64)
    df['OsBuildLab_time'] = tmp.map(lambda x: x[4].split('-')[1] if isinstance(x[4], str) else np.nan).astype(np.float64)
    
    # SmartScreen
    di = {
          '00000000': '0', # rough normalization
          'enabled': 'on', # rough normalization
          'requiredadmin': 'requireadmin',
          'deny': 'off', # rough normalization
          'of': 'off',
          'promprt': 'prompt',
          }
    df['SmartScreen'] = df['SmartScreen'].str.lower()
    df['SmartScreen'].replace(di, inplace=True)
    
    
    utils.reduce_mem_usage(df)
    df.to_feather(output_path)
    
    if 'train' in input_path:
        df[['HasDetections']].to_feather('../data/target.f')
    
    return
Example #22
def add_subject_feature(df, subject):

    sid2lcnum = {
        i: j
        for i, j in subject[['SubjectId', 'Level__SubjectId_cnum']].values
    }
    sid2lev = {i: j for i, j in subject[['SubjectId', 'Level']].values}
    sid2cnum = {
        i: j
        for i, j in subject[['SubjectId', 'SubjectId_cnum']].values
    }

    level_cnum_oht = []
    subject_meta = []
    for slist in tqdm(df['SubjectId'].values, total=len(df)):
        lsoht = np.zeros(len(level_cnum_list), dtype=int)
        for sid in slist:
            i = level_cnum_list.index(sid2lcnum[sid])
            lsoht[i] += 1
        snum = len(slist)
        levlist = [sid2lev[sid] for sid in slist]
        cnumlist = [sid2cnum[sid] for sid in slist]
        level_cnum_oht.append(lsoht)
        subject_meta.append(
            [snum,
             max(levlist),
             sum(levlist),
             max(cnumlist),
             sum(cnumlist)])

    level_subject_oht = pd.DataFrame(
        level_cnum_oht, columns=level_cnum_list).add_prefix('subj_')
    level_subject_oht = reduce_mem_usage(level_subject_oht)
    subject_meta = pd.DataFrame(subject_meta,
                                columns=subject_meta_cols).add_prefix('subj_')
    subject_meta = reduce_mem_usage(subject_meta)

    df = pd.concat([df, level_subject_oht, subject_meta], axis=1)
    return df
Example #23
    def create_feature(self, random_state=None, devmode=False):
        trn_dir, tst_dir = self.get_feature_dir(random_state)

        if os.path.exists(trn_dir) and os.path.exists(
                tst_dir) and devmode is False:
            print(
                "Found cached feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
                .format(self.__class__.__name__, trn_dir, tst_dir))
            trn_feature_files = list(Path(trn_dir).glob('*.f'))
            tst_feature_files = list(Path(tst_dir).glob('*.f'))

            return trn_feature_files, tst_feature_files

        print(
            "Start computing feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
            .format(self.__class__.__name__, trn_dir, tst_dir))

        if isinstance(self.fin, list):
            # if the input is a list of files, pass a list of DataFrames
            df_list = [pd.read_feather(f) for f in self.fin]
            feat = self.create_feature_impl(df_list, random_state)
            del df_list
            gc.collect()
        else:
            df = pd.read_feather(self.fin)
            feat = self.create_feature_impl(df, random_state)
            del df
            gc.collect()

        feat = utils.reduce_mem_usage(feat)
        trn = self.trn_base.merge(feat, on=CONST.KEY,
                                  how='left').drop(columns=CONST.KEY)
        tst = self.tst_base.merge(feat, on=CONST.KEY,
                                  how='left').drop(columns=CONST.KEY)

        trn = trn.add_prefix(self.pref)
        tst = tst.add_prefix(self.pref)

        # Save ...
        if not devmode:
            os.makedirs(trn_dir)
            os.makedirs(tst_dir)
            utils.to_feature(trn, trn_dir)
            utils.to_feature(tst, tst_dir)
            trn_feature_files = list(Path(trn_dir).glob('*.f'))
            tst_feature_files = list(Path(tst_dir).glob('*.f'))

            return trn_feature_files, tst_feature_files

        else:
            return trn, tst
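create_feature writes features with utils.to_feature and later collects '*.f' files from the cache dir, which suggests one feather file per column. A sketch consistent with that usage (assumed, not the original utils):

import os
import pandas as pd

def to_feature(df, out_dir):
    # write each column to its own .f file so features can be loaded selectively
    for col in df.columns:
        df[[col]].reset_index(drop=True).to_feather(os.path.join(out_dir, f'{col}.f'))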
Example #24
        def transform(self, df):
            with Timer('features.GroupAggregatedFeatureGenerator.transform',
                       verbose=self.verbose):
                df_features = []
                # Aggregate by Group
                # for agg_type in ('mean', 'max', 'min', 'count', 'std'):
                for agg_type in (
                        'mean',
                        'max',
                        'min',
                        'count',
                ):
                    df_aggregated = df.groupby(
                        ['match_id', 'group_id'],
                        as_index=False)[self.features].agg(agg_type)
                    df_aggregated = self.restore_row_order(
                        df, df_aggregated, on=['match_id', 'group_id'])
                    agg_column_names = {
                        col: f'{agg_type}_group_{col}'
                        for col in self.features
                    }
                    df_aggregated.rename(columns=agg_column_names,
                                         inplace=True)

                    # # TODO: Computational problems
                    # # Rank Groups by Match
                    # columns_to_select = list(agg_column_names.values())
                    # # Anyway deletes match_id
                    # df_ranked = df_aggregated.groupby('match_id', as_index=False)[columns_to_select].rank(pct=True)
                    # ranked_column_names = {col: f'rank_{col}' for col in columns_to_select}
                    # df_ranked.rename(columns=ranked_column_names, inplace=True)
                    # # Unsafe merge because of rank, which deletes match_id
                    # df_aggregated_ranked = pd.concat([df_aggregated, df_ranked], axis=1)
                    # df_features.append(df_aggregated_ranked)
                    # del df_aggregated, df_ranked, df_aggregated_ranked
                    # gc.collect()
                    df_aggregated = reduce_mem_usage(df_aggregated)
                    df_features.append(df_aggregated)
                df_features = pd.concat(df_features, axis=1)

                if self.created_features is None:
                    self.created_features = list(df_features.columns)
                else:
                    # report divergence from the columns created on fit
                    if self.created_features != list(df_features.columns):
                        if self.verbose == 2:
                            print('Lost features')
                        for col in df_features.columns:
                            if col not in self.created_features:
                                if self.verbose == 2:
                                    print(col)
                return df_features
Example #25
def melt_train_data(input_data):
    data = pd.melt(
        input_data,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name='day',
        value_name='demand')
    data = reduce_mem_usage(data)

    print('melt_train_data has {} rows and {} columns'.format(
        data.shape[0], data.shape[1]))
    del input_data
    gc.collect()

    return data
Example #26
    def run(self):
        # Now we have 3 sets of features
        data = pd.concat([
            self.load('data4'),
            self.load('data2').iloc[:, 2:],
            self.load('data3').iloc[:, 2:]
        ],
                         axis=1)

        data = reduce_mem_usage(data)

        # Let's check again memory usage
        print("{:>20}: {:>8}".format(
            'Full Grid', sizeof_fmt(data.memory_usage(index=True).sum())))
        print('Size:', data.shape)
        self.save(data)
Example #27
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/sell_prices.csv')

    # release week ref https://www.kaggle.com/kyakovlev/m5-simple-fe
    release_df = df.groupby(['store_id', 'item_id'
                             ])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']

    # merge release week
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # days from release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic aggregations
    df['price_max'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('max')
    df['price_min'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('min')
    df['price_std'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('std')
    df['price_mean'] = df.groupby(['store_id',
                                   'item_id'])['sell_price'].transform('mean')

    # normalized price
    df['price_norm'] = df['sell_price'] / df['price_max']

    # uniqueness counts (distinct prices per item, distinct items per price point)
    df['price_nunique'] = df.groupby(['store_id', 'item_id'
                                      ])['sell_price'].transform('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'
                                     ])['item_id'].transform('nunique')

    # momentum
    df['price_momentum'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
Example #28
def get_train_test(conf):
    df = Base.get_df(conf)  # pd.DataFrame

    feature_classes = [KEY_FEATURE_MAP[key] for key in conf.features]
    for feature in feature_classes:
        with timer(f"process {feature.__name__}"):
            f = feature.get_df(conf)
            if "drop_duplicate_column_on_merge" in conf.options and conf.options.drop_duplicate_column_on_merge:
                cols_to_drop = [
                    c for c in f.columns
                    if (c in df.columns) and (c != 'SK_ID_CURR')
                ]
                if cols_to_drop:
                    print(f"drop columns: {cols_to_drop}")
                    f = f.drop(cols_to_drop, axis=1)
            if "reduce_mem_usage" in conf.options and conf.options.reduce_mem_usage:
                with timer("reduce_mem_usaga"):
                    f = reduce_mem_usage(f)
            df = df.merge(f, how='left', on='SK_ID_CURR')
            del f
            gc.collect()

    if "stacking_features" in conf:
        StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features)
        f = StackingFeaturesWithPasses.get_df(conf)
        df = df.merge(f, how='left', on='SK_ID_CURR')

    if "drop_features_list_file" in conf.options:
        with open(conf.options.drop_features_list_file, "r") as fp:
            line = fp.read()
            feature_to_drop = eval(line)
        print(f"drop columns in {conf.options.drop_features_list_file}")
        df = df.drop(feature_to_drop, axis=1)

    if "clean_data" in conf.options and conf.options.clean_data:
        with timer("clean_data"):
            df = clean_data(df)

    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    del df
    gc.collect()
    return train_df, test_df
Example #29
def gen_target_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'target_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da', 'is_trade']]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        for da in sorted(dfal.da.unique())[1:]:
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                dfal = gen_target_agg_features(dfal, da, win_das, col)
        dfal = dfal.loc[dfal.da > 17, :]
        dfal.drop(['is_trade'], inplace=True, axis=1)
        dfal.drop_duplicates([col, 'da'], inplace=True)
        dfal.fillna(0, inplace=True)
        dfal, _ = reduce_mem_usage(dfal)
        dump_pickle(dfal, feat_path)
Example #30
def input_to_feather():
    files = [f for f in os.listdir(CONST.INDIR) if '.csv' in f]
    for f in files:
        # if os.path.exists(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather')):
        #     print("File '{}' is already exist".format(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather')))
        # else:
        print("to feather '{}'...".format(f))
        df = pd.read_csv(os.path.join(CONST.INDIR, f))

        # some columns need to be parsed as datetime
        if 'purchase_date' in df.columns:
            df['purchase_date'] = pd.to_datetime(df['purchase_date'])
        if 'first_active_month' in df.columns:
            df['first_active_month'] = pd.to_datetime(df['first_active_month'])
        # some columns hold Y/N flags that we want to binarize
        if 'authorized_flag' in df.columns or 'category_1' in df.columns or 'category_4' in df.columns:
            df = binarize(df)

        df = reduce_mem_usage(df)

        df.to_feather(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather'))
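binarize is only hinted at by the Y/N comment above; a sketch of what it plausibly does (the column list and mapping are assumptions inferred from the if-condition):

def binarize(df):
    # map 'Y'/'N' flags to 1/0 in the flag-like columns checked above
    for col in ('authorized_flag', 'category_1', 'category_4'):
        if col in df.columns:
            df[col] = df[col].map({'Y': 1, 'N': 0})
    return df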