def _expand_click_rows(df):
    """Split aggregated click counts into weighted 0/1-labelled rows.

    Every input row yields two output rows: one with ``clicks == 0`` and
    ``ais == instances - clicks`` and one with ``clicks == 1`` and
    ``ais == clicks`` (the original click count).  The original ``clicks``
    and ``instances`` columns are dropped.  All zero-labelled rows come
    first, followed by all one-labelled rows; the original index labels are
    kept, so each label appears twice.

    The input frame is not modified.
    """
    # Local import: concat replaces DataFrame.append (removed in pandas 2.0)
    # without relying on a module-level ``pd`` alias being present.
    import pandas as pd

    # Work on real copies so column assignment never touches the caller's
    # frame and does not raise SettingWithCopyWarning.
    clicked = df.copy()
    clicked['click_flag'] = 1
    clicked['ais'] = clicked['clicks']

    not_clicked = df.copy()
    not_clicked['click_flag'] = 0
    not_clicked['ais'] = df['instances'] - df['clicks']

    expanded = pd.concat([not_clicked, clicked])
    expanded = expanded.drop(['clicks', 'instances'], axis=1)
    return expanded.rename(columns={'click_flag': 'clicks'})


def make_sparse(test_df, train_df, factors, non_factors, cut_off, data_path):
    """Expand click counts, fit a SparseCat on train and export both splits.

    Files written (names are built as ``data_path + <name>``, so
    ``data_path`` must end with a path separator to land inside the
    directory that is created here):

    * ``train_SC``       -- pickled encoder fitted on the expanded train set
    * ``train_ais.txt``  / ``test_ais.txt`` -- the ``ais`` column, one int per row
    * ``train_svm.txt``  / ``test_svm.txt`` -- output of
      ``gen_features.csr_write_libsvm``

    Args:
        test_df: frame with at least ``clicks`` and ``instances`` columns.
        train_df: frame with at least ``clicks`` and ``instances`` columns.
        factors: passed to ``gen_features.SparseCat`` as its first argument.
        non_factors: passed to ``gen_features.SparseCat`` as its second
            argument.
        cut_off: value for the encoder's ``count_cutoff`` parameter.
        data_path: output directory prefix; created if it does not exist.

    Returns:
        ``(test_df, train_df)`` -- the expanded frames (note the order).

    NOTE(review): another ``make_sparse`` with a different signature is
    defined later in the visible source; if both live in the same module the
    later one shadows this one -- confirm which is intended.
    """
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    train_df = _expand_click_rows(train_df)
    test_df = _expand_click_rows(test_df)

    sc = gen_features.SparseCat(factors, non_factors)
    sc.set_params(count_cutoff=cut_off)
    sc.fit_weighted(train_df, train_df['ais'])

    # ``with`` guarantees the handle is closed even if pickling fails.
    with open(data_path + 'train_SC', 'wb') as f:
        pkl.dump(sc, f, protocol=pkl.HIGHEST_PROTOCOL)

    n_fields = len(factors) + len(non_factors)
    # Both splits are encoded with the encoder fitted on train.
    for split, frame in (('train', train_df), ('test', test_df)):
        sparse = sc.transform(frame)
        np.savetxt(data_path + split + '_ais.txt',
                   np.array(frame['ais']), fmt='%d')
        gen_features.csr_write_libsvm(data_path + split + '_svm.txt',
                                      sparse, frame['clicks'], n_fields)

    return test_df, train_df
def _fit_sparse_cat(df, weights, factors, non_factors, cut_off):
    """Return a new SparseCat fitted on *df* with row weights *weights*."""
    encoder = gen_features.SparseCat(factors, non_factors)
    encoder.set_params(count_cutoff=cut_off)
    encoder.fit_weighted(df, weights)
    return encoder


def _pickle_to_file(obj, path):
    """Pickle *obj* to *path*, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pkl.dump(obj, handle, protocol=pkl.HIGHEST_PROTOCOL)


def _write_svm_split(encoder, df, prefix, n_fields):
    """Encode *df* and write ``<prefix>_samples.txt`` and ``<prefix>_svm.txt``."""
    labels = df['clicks'].values.astype(np.int8).squeeze()
    sparse = encoder.transform(df)
    np.savetxt(prefix + '_samples.txt', np.array(df['samples']), fmt='%d')
    gen_features.csr_write_libsvm(prefix + '_svm.txt', sparse, labels,
                                  n_fields)


def make_sparse(df_train, W_train, df_val, W_val, df_test, W_test,
                file_name, cut_off, factors):
    """Export train/val/test splits in libsvm format plus fitted encoders.

    The train, validation and test frames are all transformed with the
    encoder fitted on ``df_train``.  In addition, a separate encoder is
    fitted on each of the validation and test frames and pickled.

    Files written (``file_name`` is used as a plain string prefix):

    * ``train_SC``, ``val_SC``, ``test_SC`` -- pickled encoders fitted on the
      respective frame
    * ``<split>_samples.txt`` -- the ``samples`` column, one int per row
    * ``<split>_svm.txt``     -- output of ``gen_features.csr_write_libsvm``

    Validation artifacts are only produced when ``df_val`` has at least one
    row.

    Args:
        df_train, df_val, df_test: frames with ``clicks`` and ``samples``
            columns plus whatever columns ``factors`` refers to.
        W_train, W_val, W_test: weights handed to ``fit_weighted`` for the
            matching frame.
        file_name: prefix prepended to every output file name.
        cut_off: value for the encoder's ``count_cutoff`` parameter.
        factors: passed to ``gen_features.SparseCat`` as its first argument;
            the second (non-factor) argument is always an empty list.

    Returns:
        None.
    """
    non_factors = []
    n_fields = len(factors) + len(non_factors)

    sc = _fit_sparse_cat(df_train, W_train, factors, non_factors, cut_off)
    _pickle_to_file(sc, file_name + 'train_SC')
    _write_svm_split(sc, df_train, file_name + 'train', n_fields)

    # NOTE(review): the original indentation was lost; the validation-only
    # encoder fit/dump is assumed to sit inside this guard together with the
    # validation export -- confirm against the original file.
    if df_val.shape[0] != 0:
        _write_svm_split(sc, df_val, file_name + 'val', n_fields)
        sc_val = _fit_sparse_cat(df_val, W_val, factors, non_factors, cut_off)
        _pickle_to_file(sc_val, file_name + 'val_SC')

    _write_svm_split(sc, df_test, file_name + 'test', n_fields)

    # The original re-fitted a second encoder on df_train at this point and
    # never used it; that redundant fit has been removed.
    sc_test = _fit_sparse_cat(df_test, W_test, factors, non_factors, cut_off)
    _pickle_to_file(sc_test, file_name + 'test_SC')
# Fit a SparseCat encoder on `mad`, transform it and persist the encoder,
# the sparse matrix, the labels and the sample column under `data_path`.
factors = ['site_id']  # other candidates: 'ad*country', 'ad*device', 'site*country'
non_factors = ['views']

sc = gen_features.SparseCat(factors, non_factors)
sc.set_params(count_cutoff=25)

t1 = time.time()
sc.fit(mad, mad_y)
t2 = time.time()
# Fit duration in minutes.  Written as a single parenthesised expression so
# it prints the same value under Python 2 and does not raise under Python 3.
print((t2 - t1) / 60)

# Interactions can be encoded the same way, e.g.:
# factors_cross = ['ad', 'campaign', 'account', 'site', 'country', 'device',
#                  'ad*site', 'ad*country', 'ad*device', 'site*country']
# sc_cross = gen_features.SparseCat(factors_cross, non_factors)
# sc_cross.set_params(count_cutoff=25)

mad_sparse = sc.transform(mad)

# Persist the fitted encoder; `with` closes the handle even if dumping fails.
with open(data_path + 'sc', 'wb') as f:
    pkl.dump(sc, f, protocol=pkl.HIGHEST_PROTOCOL)

# Save the sparse matrix with joblib.  Alternatives: an .npz file via
# gen_features.save_sparse("mad_sparse", mad_sparse).
joblib.dump(mad_sparse, data_path + output_file_name + '_sparse.pkl')

# Save the click labels.
np.savez(data_path + output_file_name + '_y', mad_y=mad_y)

mad['samples'].to_csv(data_path + output_file_name + '_samples.csv',
                      header=['sample'])

gen_features.csr_write_libsvm(data_path + output_file_name + '_svm.txt',
                              mad_sparse, mad_y,
                              len(factors) + len(non_factors))
# sklearn alternative:
# dump_svmlight_file(mad_sparse, mad_y, '../data/mad_svm.txt',
#                    zero_based=True, comment=None, query_id=None)