def TestModelOnData(modelname, fdata, flabel): actual_set = com.GetBuySet(flabel) rec_set = set() f_base = util.file_basename(fdata) re_str = f_base.replace('.',r'\.') + r'\.\d+\.csv$' f_list = util.FilterFile(re_str) #['%s.%d.csv' % (f_base, j) for j in range(com.__n_process)] for f in f_list: r , p, y = _ParTestModelOnData((modelname, f)) rec_set |= r pred_prob = np.concatenate([pred_prob,p]) Y_true = np.concatenate([Y_true, y]) TP = len(rec_set & actual_set) TN = len(rec_set - actual_set) FP = len(actual_set - rec_set) PrintConfuseMatrix(TP, TN, FP) P, R, F1 = GetPRF1(TP, TN, FP) PrintPRF1(P, R, F1) print 'AUC:', roc_auc_score( Y_true.astype(int), pred_prob) return TP, TN, FP, P, R, F1, pred_prob,Y_true
fout = 'submit.%s.csv' % sys.argv[1] # load need to be recommanded item fo = open(fout, 'wb') fw = csv.writer(fo, delimiter=',') fw.writerow(['user_id','item_id']) rec_set = set() pool = mp.Pool(com.__n_process) re_str = r'feature_total\.merge\.\d+\.csv$' f_list = util.FilterFile(re_str) rec_set_list = pool.map(GenRecDataFromFeatureFile,[(sys.argv[1], f) for f in f_list]) for r in rec_set_list: rec_set |= r for uid, tid in rec_set: fw.writerow([uid, tid]) fo.close() nrows = len(rec_set) print 'recommand %d record.' % nrows util.notify_me('recommand data are done! %d record.' % nrows)
# coding:utf-8 import util if __name__ == '__main__': fs = util.FilterFile(r'feature\d*\.csv$') for f in fs: header = open(f).readline().split(',') print f for it in header[2:]: print it
# NOTE(review): this chunk begins MID-STATEMENT — `index=False)` is the tail
# of a to_csv(...) call inside a function whose `def` lies above this view
# (it reads `data`, `i`, `mod`, `header` defined there), so it cannot be
# safely reformatted or rewritten from here.
# What the visible code shows: after appending a chunk it switches to append
# mode ('a', header=False), counts processed rows, and prints progress.
# The __main__ section picks the feature/label/data filenames from
# sys.argv[1] ('train'/'test'/'submit'; anything else prints __doc__ and
# exits), then maps FilterCSV over the numbered chunk files of that feature
# file via a multiprocessing pool.
index=False) mod = 'a' header = False i = i + len(data) print 'process %d rows.' % i if __name__ == '__main__': if sys.argv[1] == 'train': ff = 'feature.merge.csv' fl = 'label.csv' fd = 'data.csv' elif sys.argv[1] == 'test': ff = 'feature_test.merge.csv' fl = 'label_test.csv' fd = 'data.test.csv' elif sys.argv[1] == 'submit': ff = 'feature_total.merge.csv' else: print __doc__ sys.exit() pool = mp.Pool(com.__n_process) fs = util.FilterFile( util.file_basename(ff).replace('.', r'\.') + r'\.\d+\.csv') #print fs pool.map(FilterCSV, fs)
# NOTE(review): this chunk is the TAIL of a function defined above this view
# (its `return (train_rows, train_trows)` and the variables `train`, `data`,
# `fname1`, `rows`, `fn` all belong to that unseen definition — presumably
# `Sample`, which __main__ calls below). Cannot be reformatted safely without
# seeing the function start.
# Visible behavior: appends the sampled rows to a CSV (append mode after the
# first write), tallies sampled rows and positive ('buy'==1) rows, prints
# per-file progress, and returns both counts.
# __main__ ('train' mode): runs Sample over every 'data.<n>.csv' chunk,
# accumulating totals, then prints the overall sample/positive counts.
data[train].to_csv(fname1, mode=mod, header = header,index = False) header = False mod = 'a' train_rows = np.sum(train) + train_rows train_trows = np.sum(data['buy'][train]==1) + train_trows rows = rows + len(data) print '[%s] process %d rows!' % (fn,rows) # print data.head() return (train_rows, train_trows) if __name__ == '__main__': if sys.argv[1]=='train': fs = util.FilterFile(r'data\.\d+\.csv') train_rows, train_trows = 0,0 for f in fs: i,j = Sample(f, 'data.train.csv') train_rows += i train_trows += j print 'sample %d rows, positive %d rows. ' % (train_rows, train_trows)
# coding:utf-8
"""Fan `subset.py` out over every featureTEST_<date>.csv via a process pool."""
import util, os, sys
from multiprocessing import Pool

if __name__ == '__main__':
    workers = Pool(50)
    root = r'D:\zuoyuan\alibaba\csv'
    matched = util.FilterFile(r'featureTEST_\d+-\d+-\d+\.csv$', root=root)
    # One shell command per matched file; os.system runs each in a worker.
    cmds = ['python subset.py %s 0,1,2,3,4,5,6' % os.path.join(root, name)
            for name in matched]
    #print cmds
    workers.map(os.system, cmds)
# coding:utf-8 import os, sys, util root = r'D:\zuoyuan\alibaba\csv' fs = util.FilterFile(r'feature.+subset_0_1_2_3_4_5_6\.csv$',root=root) for f in fs: print f sys.exit() cmd = 'python merge_fast.py ' + ' '.join([os.path.join(root,f) for f in fs]) + ' ' + os.path.join(root,'feature_0_1_2_3_4_5_6.csv') os.system(cmd)