def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)

    # concatenate the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']
    del conbined_data['ID']

    num_columns = conbined_data.select_dtypes(exclude=['object']).columns
    missing_rates = [0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0]
    conbined_data = kmeans_impute_datas(conbined_data, num_columns, missing_rates)

    conbined_data['ID'] = ids
    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['y'] = train_y.values

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
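# kmeans_impute_datas is defined elsewhere in the repo. A hedged sketch of one
# plausible implementation, assuming it walks the missing-rate thresholds,
# clusters rows on the currently complete numeric columns, and fills gaps with
# per-cluster means (the real helper may differ; n_clusters=10 is an assumption):
import numpy as np
from sklearn.cluster import KMeans

def kmeans_impute_datas(df, num_columns, missing_rates, n_clusters=10):
    df = df.copy()
    for rate in missing_rates:
        # columns whose fraction of missing values does not exceed this rate
        cols = [c for c in num_columns if df[c].isnull().mean() <= rate]
        complete = [c for c in cols if df[c].notnull().all()]
        if not complete:
            continue
        labels = KMeans(n_clusters=n_clusters).fit_predict(df[complete].values)
        for c in cols:
            if df[c].isnull().any():
                # fill each missing cell with the mean of its cluster
                means = df.groupby(labels)[c].transform('mean')
                df[c] = np.where(df[c].isnull(), means, df[c])
    return df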
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)

    # concatenate the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    # features flagged for removal
    remove_features = ['X33', 'X39', 'X42', 'X95', 'X105', 'X124', 'X190',
                       'X204', 'X207', 'X210', 'X236', 'X252', 'X257', 'X259',
                       'X260', 'X270', 'X278', 'X280', 'X288', 'X295', 'X296',
                       'X339', 'X372',
                       'label_encoder_X4_median_y', 'label_encoder_X4_mean_y']
    conbined_data.drop(remove_features, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['y'] = train_y.values

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    if (not os.path.exists(Configure.decomposition_features_train_path)) or \
            (not os.path.exists(Configure.decomposition_features_test_path)):
        # concatenate the train and test sets
        conbined_data = pd.concat([train.drop(['y'], axis=1), test])
        ids = conbined_data['ID']
        conbined_data.drop(['ID'], axis=1, inplace=True)

        results_df = pd.DataFrame({'ID': ids})
        random_state = 100
        n_jobs = -1

        # PCA
        n_comp = 50
        pca = PCA(n_components=n_comp, random_state=random_state)
        pca_df = pca.fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['pca_' + str(i)] = pca_df[:, i]

        # IncrementalPCA
        n_comp = 70
        ipca_df = IncrementalPCA(n_components=n_comp,
                                 batch_size=conbined_data.shape[0]).fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['ipca_' + str(i)] = ipca_df[:, i]

        # KernelPCA
        n_comp = 60
        kpca_df = KernelPCA(n_components=n_comp, kernel='linear',
                            random_state=random_state,
                            n_jobs=n_jobs).fit_transform(conbined_data)
        for i in range(0, n_comp):
            results_df['kernel_pca_' + str(i)] = kpca_df[:, i]

        decomposition_features_train = results_df.iloc[:train.shape[0], :]
        decomposition_features_test = results_df.iloc[train.shape[0]:, :]
        with open(Configure.decomposition_features_train_path, "wb") as f:
            cPickle.dump(decomposition_features_train, f, -1)
        with open(Configure.decomposition_features_test_path, "wb") as f:
            cPickle.dump(decomposition_features_test, f, -1)
    else:
        with open(Configure.decomposition_features_train_path, "rb") as f:
            decomposition_features_train = cPickle.load(f)
        with open(Configure.decomposition_features_test_path, "rb") as f:
            decomposition_features_test = cPickle.load(f)

    # merge
    train = pd.merge(train, decomposition_features_train, how='left', on='ID')
    test = pd.merge(test, decomposition_features_test, how='left', on='ID')

    print("Save data...")
    print 'train:', train.shape, ', test:', test.shape
    data_util.save_dataset(train, test)
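# The block above extends naturally to other sklearn decompositions. A hedged
# sketch of adding TruncatedSVD features in the same pattern (it would live
# inside the same if-branch, reusing conbined_data / results_df / random_state;
# n_comp = 50 is an assumption, not a tuned value):
from sklearn.decomposition import TruncatedSVD

n_comp = 50
tsvd_df = TruncatedSVD(n_components=n_comp,
                       random_state=random_state).fit_transform(conbined_data)
for i in range(0, n_comp):
    results_df['tsvd_' + str(i)] = tsvd_df[:, i]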
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    # drop features that take only a single value in train
    removed_features = []
    for c in train.columns:
        if len(set(train[c])) == 1:
            removed_features.append(c)

    train.drop(removed_features, axis=1, inplace=True)
    test.drop(removed_features, axis=1, inplace=True)

    # # remove outlier rows from train
    # train = train[train.y < 250]

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)

    # concatenate the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    str_columns = conbined_data.select_dtypes(include=['object']).columns

    label_encoder_df = pd.DataFrame({'ID': ids})
    print 'perform label encoder...'
    for c in str_columns:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(conbined_data[c].values))
        label_encoder_df['label_encoder_' + c] = lbl.transform(list(conbined_data[c].values))

    # print 'perform dummy encoder...'
    # dummy_encoder_df = pd.DataFrame({'ID': ids})
    # for c in str_columns:
    #     dummies_df = pd.get_dummies(conbined_data[c], prefix=c)
    #     dummy_encoder_df = pd.concat([dummy_encoder_df, dummies_df], axis=1)

    # merge the encoded features back
    del label_encoder_df['ID']
    conbined_data = pd.concat([conbined_data, label_encoder_df], axis=1)
    # del dummy_encoder_df['ID']
    # conbined_data = pd.concat([conbined_data, dummy_encoder_df], axis=1)

    # drop the original category features
    conbined_data.drop(str_columns, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['y'] = train_y.values

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
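# LabelEncoder assigns arbitrary integer codes, which imposes a fake ordering
# on the categories; the commented-out pd.get_dummies block above is the
# order-free one-hot alternative. A tiny self-contained illustration:
import pandas as pd
from sklearn import preprocessing

s = pd.Series(['az', 'b', 'az', 'c'])
lbl = preprocessing.LabelEncoder().fit(s)
print lbl.transform(s)                 # integer codes, e.g. [0 1 0 2]
print pd.get_dummies(s, prefix='X0')   # three 0/1 indicator columns instead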
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)

    # concatenate the train and test sets
    conbined_data = pd.concat([train, test])

    # discretize the decomposition features into ~4000 equal-width buckets
    dis_features = [c for c in conbined_data.columns if 'pca' in c]
    for c in dis_features:
        mingap = (conbined_data[c].max() - conbined_data[c].min()) / 4000.0
        conbined_data[c] = conbined_data[c].values // mingap

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['y'] = train_y.values

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
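# The floor division above is manual equal-width binning. A hedged equivalent
# using pandas' built-in binning (labels=False returns the bucket index;
# equivalent up to bin-edge details):
for c in dis_features:
    conbined_data[c] = pd.cut(conbined_data[c], bins=4000, labels=False)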
def main():
    print 'load data...'
    train, test = data_util.load_dataset()

    groupby_features_train = pd.DataFrame({'ID': train['ID']})
    groupby_features_test = pd.DataFrame({'ID': test['ID']})

    if (not os.path.exists(Configure.groupby_features_train_path)) or \
            (not os.path.exists(Configure.groupby_features_test_path)):
        groupby_features = []
        for c in train.columns:
            if ('label_encoder' in c) or ('tsne_cluster' in c):
                groupby_features.append(c)

        for c in groupby_features:
            print '>>>> perform groupby features, feature : {}...'.format(c)
            groupby_df = train[[c, 'y']].groupby(c).aggregate('mean')['y'].reset_index()

            def label_encoder_mean_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:  # unseen category: fall back to the mean of the group stats
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_mean_y'] = train[c].map(label_encoder_mean_map)
            groupby_features_test[c + '_mean_y'] = test[c].map(label_encoder_mean_map)

            groupby_df = train[[c, 'y']].groupby(c).aggregate('median')['y'].reset_index()

            def label_encoder_median_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_median_y'] = train[c].map(label_encoder_median_map)
            groupby_features_test[c + '_median_y'] = test[c].map(label_encoder_median_map)

            groupby_df = train[[c, 'y']].groupby(c).aggregate('std')['y'].reset_index()
            groupby_df.fillna(0, inplace=True)

            def label_encoder_std_map(data):
                values = groupby_df.loc[groupby_df[c] == data, 'y'].values
                if len(values) == 0:
                    return sum(groupby_df.y) / groupby_df.shape[0]
                return values[0]

            groupby_features_train[c + '_std_y'] = train[c].map(label_encoder_std_map)
            groupby_features_test[c + '_std_y'] = test[c].map(label_encoder_std_map)

        with open(Configure.groupby_features_train_path, "wb") as f:
            cPickle.dump(groupby_features_train, f, -1)
        with open(Configure.groupby_features_test_path, "wb") as f:
            cPickle.dump(groupby_features_test, f, -1)
    else:
        with open(Configure.groupby_features_train_path, "rb") as f:
            groupby_features_train = cPickle.load(f)
        with open(Configure.groupby_features_test_path, "rb") as f:
            groupby_features_test = cPickle.load(f)

    # merge
    train = pd.merge(train, groupby_features_train, how='left', on='ID')
    test = pd.merge(test, groupby_features_test, how='left', on='ID')

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
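# Two notes on the block above. First, these features encode the training
# target directly, so they can leak y into train (out-of-fold encoding would
# be safer). Second, each map function scans the whole groupby_df once per
# row; a hedged, equivalent dict-lookup sketch (target_stat_map is a
# hypothetical helper, keeping the same fallback to the mean of the
# per-group statistics for unseen categories):
def target_stat_map(train, test, c, stat='mean'):
    stats = train.groupby(c)['y'].agg(stat).fillna(0)  # fillna(0) matters only for 'std'
    default = stats.mean()
    lookup = stats.to_dict()
    return (train[c].map(lambda v: lookup.get(v, default)),
            test[c].map(lambda v: lookup.get(v, default)))

# usage:
# groupby_features_train[c + '_mean_y'], groupby_features_test[c + '_mean_y'] = \
#     target_stat_map(train, test, c, 'mean')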
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    train_y = train['y']
    train.drop(['y'], axis=1, inplace=True)

    # concatenate the train and test sets
    conbined_data = pd.concat([train, test])
    ids = conbined_data['ID']

    perform_clusters = [7]
    for n_clusters in perform_clusters:
        print '>>>> perform kmeans cluster, n_clusters = {}...'.format(n_clusters)
        feature_train_path = Configure.kmeans_feature_distance_train_path.format(n_clusters)
        feature_test_path = Configure.kmeans_feature_distance_test_path.format(n_clusters)

        results_df = pd.DataFrame({'ID': ids})
        if (not os.path.exists(feature_train_path)) or \
                (not os.path.exists(feature_test_path)):
            cls = KMeans(n_clusters=n_clusters, n_jobs=-1)
            kmeans_labels = cls.fit_predict(conbined_data.values)
            conbined_data['cluster_label'] = kmeans_labels
            results_df['cluster_{}_cluster_label'.format(n_clusters)] = kmeans_labels

            # compute distances to the cluster centers
            cluster_centers = cls.cluster_centers_

            def calc_intra_center_distance(data):
                # distance to the row's own cluster center
                center = cluster_centers[int(data['cluster_label']), :]
                raw_data = data.drop(['cluster_label'])
                return np.linalg.norm(center - raw_data)

            def calc_extra_center_distance(data):
                # summed distance to all other cluster centers
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))
                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    distance += np.linalg.norm(center - raw_data)
                return distance

            def calc_extra_center_distance2(data):
                # average distance to all other cluster centers
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))
                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    distance += np.linalg.norm(center - raw_data)
                return distance / len(ci)

            results_df['intra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_intra_center_distance, axis=1)
            results_df['average_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance2, axis=1)
            results_df['sum_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance, axis=1)

            del conbined_data['cluster_label']

            center_distance_train = results_df.iloc[:train.shape[0], :]
            center_distance_test = results_df.iloc[train.shape[0]:, :]
            with open(feature_train_path, "wb") as f:
                cPickle.dump(center_distance_train, f, -1)
            with open(feature_test_path, "wb") as f:
                cPickle.dump(center_distance_test, f, -1)
        else:
            with open(feature_train_path, "rb") as f:
                center_distance_train = cPickle.load(f)
            with open(feature_test_path, "rb") as f:
                center_distance_test = cPickle.load(f)

        # merge
        train = pd.merge(train, center_distance_train, how='left', on='ID')
        test = pd.merge(test, center_distance_test, how='left', on='ID')

    train['y'] = train_y.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
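# The three row-wise apply calls above cost O(rows x clusters) in Python. A
# hedged, vectorized sketch of the same features via sklearn's KMeans.transform,
# which returns each sample's distance to every cluster center in one call
# (reuses cls, kmeans_labels, conbined_data, n_clusters from the block above):
dists = cls.transform(conbined_data.drop('cluster_label', axis=1).values)  # (n_samples, n_clusters)
intra = dists[np.arange(dists.shape[0]), kmeans_labels]  # distance to own center
extra_sum = dists.sum(axis=1) - intra                    # summed distance to the other centers
extra_avg = extra_sum / (n_clusters - 1)                 # average distance to the other centers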
    # (fragment: inside the loop over the training wav files)
    _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
    y_train.append(label)
    x_train.append(specgram)

x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))  # add a channel dimension
y_train = label_transform(y_train, relabel=relabel, get_dummies=True)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)

del labels, fnames
gc.collect()

print('x_train:', x_train.shape, ', y_train:', y_train.shape)
print("Save train data...")
data_util.save_dataset(x_train, y_train)

del x_train, y_train
gc.collect()

batch = 16
test_fname = []
x_test = []
fpaths = glob(os.path.join(Configure.test_data_path, '*wav'))
i = 0
for path in fpaths:
    # if i == 0:
    #     imgs = []
    #     fnames = []
    #     i += 1
    rate, samples = wavfile.read(path)
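# log_specgram is referenced above but not shown here. A minimal sketch of what
# it likely computes (a log-scaled spectrogram via scipy), modeled on the common
# TensorFlow Speech Recognition kernels -- the window/step values are assumptions:
import numpy as np
from scipy import signal

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio, fs=sample_rate,
                                            window='hann', nperseg=nperseg,
                                            noverlap=noverlap, detrend=False)
    return freqs, times, np.log(spec.astype(np.float32) + eps)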
def main():
    print 'load data...'
    train, test = data_util.load_dataset()
    print 'train:', train.shape, ', test:', test.shape

    print 'perform tsne...'
    if not os.path.exists(Configure.tsne_transformed_data_path):
        features = ['X118', 'X127', 'X47', 'X315', 'X311', 'X179', 'X314', 'X261']
        tsne = TSNE(random_state=2000, perplexity=80)
        tsne_transformed = tsne.fit_transform(pd.concat([train[features], test[features]]))
        tsne_transformed = pd.DataFrame(tsne_transformed)
        tsne_transformed.columns = ['tsne_transform_x', 'tsne_transform_y']
        with open(Configure.tsne_transformed_data_path, "wb") as f:
            cPickle.dump(tsne_transformed, f, -1)
    else:
        with open(Configure.tsne_transformed_data_path, "rb") as f:
            tsne_transformed = cPickle.load(f)

    ids = pd.concat([train.drop(['y'], axis=1), test])['ID']

    perform_clusters = [7]
    for n_clusters in perform_clusters:
        print '>>>> perform kmeans cluster, n_clusters = {}...'.format(n_clusters)
        feature_train_path = Configure.tsne_feature_train_path.format(n_clusters)
        feature_test_path = Configure.tsne_feature_test_path.format(n_clusters)

        if (not os.path.exists(feature_train_path)) or \
                (not os.path.exists(feature_test_path)):
            results_df = pd.DataFrame({'ID': ids})
            conbined_data = tsne_transformed.copy()

            cls = KMeans(n_clusters=n_clusters, n_jobs=-1)
            kmeans_labels = cls.fit_predict(conbined_data.values)
            conbined_data['cluster_label'] = kmeans_labels
            results_df['tsne_cluster_{}_cluster_label'.format(n_clusters)] = kmeans_labels

            # compute distances to the cluster centers
            cluster_centers = cls.cluster_centers_

            def calc_intra_center_distance(data):
                # distance to the row's own cluster center
                center = cluster_centers[int(data['cluster_label']), :]
                raw_data = data.drop(['cluster_label'])
                return np.linalg.norm(center - raw_data)

            def calc_extra_center_distance(data):
                # summed distance to all other cluster centers
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))
                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    distance += np.linalg.norm(center - raw_data)
                return distance

            def calc_extra_center_distance2(data):
                # average distance to all other cluster centers
                ci = range(0, n_clusters)
                ci.remove(int(data['cluster_label']))
                distance = 0.0
                raw_data = data.drop(['cluster_label'])
                for i in ci:
                    center = cluster_centers[i, :]
                    distance += np.linalg.norm(center - raw_data)
                return distance / len(ci)

            results_df['tsne_intra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_intra_center_distance, axis=1).values
            results_df['tsne_average_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance2, axis=1).values
            results_df['tsne_sum_extra_center_distance_cluster_{}'.format(n_clusters)] = \
                conbined_data.apply(calc_extra_center_distance, axis=1).values

            results_df.drop(['tsne_cluster_{}_cluster_label'.format(n_clusters)],
                            axis=1, inplace=True)

            tsne_df_train = results_df.iloc[:train.shape[0], :]
            tsne_df_test = results_df.iloc[train.shape[0]:, :]
            tsne_df_train['ID'] = train['ID']
            tsne_df_test['ID'] = test['ID']
            with open(feature_train_path, "wb") as f:
                cPickle.dump(tsne_df_train, f, -1)
            with open(feature_test_path, "wb") as f:
                cPickle.dump(tsne_df_test, f, -1)
        else:
            with open(feature_train_path, "rb") as f:
                tsne_df_train = cPickle.load(f)
            with open(feature_test_path, "rb") as f:
                tsne_df_test = cPickle.load(f)

        train = pd.merge(train, tsne_df_train, how='left', on='ID')
        test = pd.merge(test, tsne_df_test, how='left', on='ID')

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_util.save_dataset(train, test)
def main():
    print 'load data...'
    priors, train, orders, products, aisles, departments, sample_submission, order_streaks = \
        data_util.load_data()

    groupby_features_train = pd.DataFrame()
    groupby_features_test = pd.DataFrame()
    if (not os.path.exists(Configure.groupby_features_train_path)) or \
            (not os.path.exists(Configure.groupby_features_test_path)):
        # Product part
        # Products information ----------------------------------------------------------------
        # add order information to the priors set
        priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')

        # create new variables
        ## _user_buy_product_times: how many times the user has bought this product so far
        priors_orders_detail.loc[:, '_user_buy_product_times'] = \
            priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1

        # _prod_tot_cnts: total number of times the product was bought (a popularity proxy)
        # _prod_reorder_tot_cnts: total number of times the product was reordered
        ### NOTE: the two counts below are hard to interpret; consider changing them ++++++++++++++
        # _prod_buy_first_time_total_cnt: total count of the product being bought a first time
        # _prod_buy_second_time_total_cnt: total count of the product being bought a second time
        agg_dict = {'user_id': {'_prod_tot_cnts': 'count'},
                    'reordered': {'_prod_reorder_tot_cnts': 'sum'},
                    '_user_buy_product_times': {'_prod_buy_first_time_total_cnt': lambda x: sum(x == 1),
                                                '_prod_buy_second_time_total_cnt': lambda x: sum(x == 2)}}
        prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

        # _prod_reorder_prob: this metric is hard to interpret
        # _prod_reorder_ratio: product reorder rate
        prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
        prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
        prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt

        # User part
        # _user_total_orders: the user's total number of orders
        # consider adding other statistics here ++++++++++++++
        # _user_sum_days_since_prior_order: sum of days since the prior order; this can only be
        #     computed on the orders table, since priors_orders_detail is not unique at the order level
        # _user_mean_days_since_prior_order: mean days since the prior order
        agg_dict_2 = {'order_number': {'_user_total_orders': 'max'},
                      'days_since_prior_order': {'_user_sum_days_since_prior_order': 'sum',
                                                 '_user_mean_days_since_prior_order': 'mean'}}
        users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

        # _user_reorder_ratio: total reorders / total purchases after the first order
        # _user_total_products: total number of products the user bought
        # _user_distinct_products: number of unique products the user bought
        # agg_dict_3 = {'reordered':
        #                   {'_user_reorder_ratio':
        #                        lambda x: sum(priors_orders_detail.ix[x.index, 'reordered'] == 1) /
        #                                  sum(priors_orders_detail.ix[x.index, 'order_number'] > 1)},
        #               'product_id': {'_user_total_products': 'count',
        #                              '_user_distinct_products': lambda x: x.nunique()}}
        # us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
        us = pd.concat([
            priors_orders_detail.groupby('user_id')['product_id'].count().rename('_user_total_products'),
            priors_orders_detail.groupby('user_id')['product_id'].nunique().rename('_user_distinct_products'),
            (priors_orders_detail.groupby('user_id')['reordered'].sum() /
             priors_orders_detail[priors_orders_detail['order_number'] > 1]
             .groupby('user_id')['order_number'].count()).rename('_user_reorder_ratio')
        ], axis=1).reset_index()
        users = users.merge(us, how='inner')

        # average number of products per order
        # TODO: max / min products per order ++++++++++++++
        users['_user_average_basket'] = users._user_total_products / users._user_total_orders

        us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
        us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'}, inplace=True)
        users = users.merge(us, how='inner')

        # Database part
        # many more variables could be added here
        # _up_order_count: number of times the user bought this product
        # _up_first_order_number: order number of the user's first purchase of the product
        # _up_last_order_number: order number of the user's last purchase of the product
        # _up_average_cart_position: average add-to-cart position of the product
        agg_dict_4 = {'order_number': {'_up_order_count': 'count',
                                       '_up_first_order_number': 'min',
                                       '_up_last_order_number': 'max'},
                      'add_to_cart_order': {'_up_average_cart_position': 'mean'}}
        data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail,
                                              group_columns_list=['user_id', 'product_id'],
                                              agg_dict=agg_dict_4)

        data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')

        # times the product was bought / total number of orders
        # orders elapsed since the last purchase of the product
        # times bought / number of orders from the first purchase of the product onward
        data['_up_order_rate'] = data._up_order_count / data._user_total_orders
        data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
        data['_up_order_rate_since_first_order'] = data._up_order_count / \
            (data._user_total_orders - data._up_first_order_number + 1)

        # add user_id to the train set
        train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
        data = data.merge(train[['user_id', 'product_id', 'reordered']],
                          on=['user_id', 'product_id'], how='left')
        data = pd.merge(data, products[['product_id', 'aisle_id', 'department_id']],
                        how='left', on='product_id')
        transform_categorical_data(data, ['aisle_id', 'department_id'])
        data = data.merge(order_streaks[['user_id', 'product_id', 'order_streak']],
                          on=['user_id', 'product_id'], how='left')

        # release memory
        # del train, prd, users
        # gc.collect()

        # release memory
        # del priors_orders_detail
        del orders, order_streaks
        gc.collect()

        # downcast non-negative integer columns to the smallest unsigned dtype
        starting_size = sys.getsizeof(data)
        i = 0
        for c, dtype in zip(data.columns, data.dtypes):
            if 'int' in str(dtype):
                if min(data[c]) >= 0:
                    max_int = max(data[c])
                    if max_int <= 255:
                        data[c] = data[c].astype(np.uint8)
                    elif max_int <= 65535:
                        data[c] = data[c].astype(np.uint16)
                    elif max_int <= 4294967295:
                        data[c] = data[c].astype(np.uint32)
                    i += 1
        print("Number of columns adjusted: {}\n".format(i))

        ## Change the known reordered col to a smaller int size
        data['reordered'] = np.nan_to_num(data['reordered']).astype(np.uint8)
        data['reordered'][data['reordered'] == 0] = np.nan
        print("Reduced size {:.2%}".format(float(sys.getsizeof(data)) / float(starting_size)))

        # Create train / test
        train = data.loc[data.eval_set == "train", :]
        # train.drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis=1, inplace=True)
        # train.loc[:, 'reordered'] = train.reordered.fillna(0)
        test = data.loc[data.eval_set == "test", :]
        # test.drop(['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'], axis=1, inplace=True)

        # groupby_features_train = train
        # groupby_features_test = test
        # with open(Configure.groupby_features_train_path, "wb") as f:
        #     cPickle.dump(groupby_features_train, f, -1)
        # with open(Configure.groupby_features_test_path, "wb") as f:
        #     cPickle.dump(groupby_features_test, f, -1)

        print 'train:', train.shape, ', test:', test.shape
        print("Save data...")
        data_util.save_dataset(train, test)
    else:
        with open(Configure.groupby_features_train_path, "rb") as f:
            groupby_features_train = cPickle.load(f)
        with open(Configure.groupby_features_test_path, "rb") as f:
            groupby_features_test = cPickle.load(f)
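# ka_add_groupby_features_1_vs_n is an external Kaggle utility helper, not
# defined in this file. A hedged sketch of the behavior the calls above rely
# on (assumes an older pandas where nested renaming dicts in .agg are still
# allowed, as the agg_dict literals above require):
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict):
    grouped = df.groupby(group_columns_list).agg(agg_dict)
    # nested dicts yield a (source_column, new_name) column MultiIndex; keep the new names
    grouped.columns = grouped.columns.get_level_values(1)
    return grouped.reset_index()

# The manual uint8/uint16/uint32 ladder above can also be written with pandas'
# built-in downcasting; a minimal equivalent sketch:
for c in data.select_dtypes(include=['int']).columns:
    if data[c].min() >= 0:
        data[c] = pd.to_numeric(data[c], downcast='unsigned')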