# Group rows by marital status (``partner``) and by whether the work area
# is Tokyo or Osaka.
class partner_capital_group(Feature):
    """Categorical feature combining ``partner`` with a Tokyo/Osaka flag.

    The encoded value is written to a column named after this class, in
    both ``self.train`` and ``self.test``:

        partner=0, other area   -> 0
        partner=0, Tokyo/Osaka  -> 1
        partner=1, other area   -> 2
        partner=1, Tokyo/Osaka  -> 3

    Any other ``partner`` value (after ``astype(str)``) has no entry in the
    lookup table and therefore maps to NaN.
    """

    def create_features(self):
        """Build the feature from the module-level ``train`` / ``test`` frames.

        Reads the ``partner`` and ``area`` columns of the concatenated
        train+test data, encodes each row, and splits the result back into
        ``self.train`` and ``self.test`` by position.
        """
        column = self.__class__.__name__
        # Split point is derived from ``train`` itself so the method does not
        # depend on a separate ``len_train`` global being kept in sync.
        n_train = len(train)

        whole = pd.concat([train, test], axis=0)

        # 1 when the area is Tokyo or Osaka, else 0. A single vectorized
        # membership test replaces two row-wise ``apply(lambda ...)`` passes;
        # the two prefectures are mutually exclusive, so this equals the sum
        # of the individual Tokyo and Osaka flags.
        is_capital = whole['area'].isin(['東京都', '大阪府']).astype(int)

        # Key such as '1_0' = partner 1, not in Tokyo/Osaka.
        group_key = whole['partner'].astype(str) + '_' + is_capital.astype(str)
        codes = {'0_0': 0, '0_1': 1, '1_0': 2, '1_1': 3}
        encoded = group_key.map(codes).values

        # Rows keep concat order: the first ``n_train`` belong to train.
        self.train[column] = encoded[:n_train]
        self.test[column] = encoded[n_train:]


if __name__ == '__main__':
    args = get_arguments()

    # ``train`` and ``test`` must be module-level names: create_features()
    # reads them as globals, and globals() is handed to generate_features().
    train = pd.read_feather('../data/input/train_data.feather')
    test = pd.read_feather('../data/input/test_data.feather')
    # Kept for any code outside this view that still reads ``len_train``.
    len_train = len(train)

    generate_features(globals(), args.force)
if x == 'male' else 0) def add_meta(self): self.meta_dict['memo'] = 'sex' self.meta_dict['num_or_cat'] = 'cat' self.meta_dict['date'] = '{0:%Y-%m-%d %H:%M:%S}'.format(self.now) if __name__ == '__main__': # log logger, sh, fh = preparation_logger() # do args = Util.get_arguments() train = Util.load_train_data() test = Util.load_test_data() # test mode? if args.debug: fh.setLevel(logging.ERROR) # file書き出ししないという意思表示 sh.setLevel(logging.DEBUG) # stream handler を infoからdebugへ logger.info('*******************************') logger.info('********** test mode **********') logger.info('*******************************') logger.info('-------------------- start') logger.debug(f'\n-train\n {train.head()}') logger.debug(f'\n-test\n {test.head()}') generate_features(globals(), args.force, args.debug) logger.info('-------------------- end')
if len(col) != 0: return writer = csv.writer(f) writer.writerow([col_name, desc]) if __name__ == '__main__': # CSVのヘッダーを書き込み create_memo('特徴量', 'メモ') if 'ipykernel' in sys.modules: # pklファイルを上書きする際はTrueに書き換え overwrite_ok = True else: overwrite_ok = get_arguments().force train = pd.read_csv(RAW_DATA_DIR_NAME + 'train.csv') test = pd.read_csv(RAW_DATA_DIR_NAME + 'test.csv') # globals()でtrain,testのdictionaryを渡す generate_features(globals(), overwrite_ok) # 特徴量メモをソートする feature_df = pd.read_csv(feature_memo_path) feature_df = feature_df.sort_values('特徴量') feature_df.to_csv(feature_memo_path, index=False) # %%