def index_learning(dftrain,rayon,cat1,F1score=0.8):
    """Learn a distance-based Categorie3 index for the subset Categorie1 == cat1.

    Builds a word-ngram TF-IDF index over the Categorie3 names of the catalog
    (rayon) rows of this Categorie1, maps each training sample to its nearest
    index entry by cosine distance, then sweeps distance thresholds and keeps,
    per Categorie3, the smallest threshold at which the per-class F1 of the
    nearest-entry guess already exceeds `F1score`.

    Parameters:
        dftrain : DataFrame of training samples (needs Categorie1, Categorie3
                  and the text columns consumed by add_txt).
        rayon   : catalog DataFrame (Categorie1, Categorie3, Categorie3_Name).
        cat1    : Categorie1 value selecting the subset to index.
        F1score : minimum per-class F1 required to accept a threshold.

    Returns (vec, Dmin, F1score) and also dumps the same tuple with joblib.
    """
    # select Categorie1 from rayon and training set
    ry = rayon[rayon.Categorie1 == cat1].copy()
    df = dftrain[dftrain.Categorie1 == cat1].copy()
    # Python 2: map() returns a list, assigned here as a new column
    ry['txt'] = map(normalize_txt,ry.Categorie3_Name)
    add_txt(df)
    # vectorize Categorie3_Name as index
    vec = TfidfVectorizer(stop_words = None,min_df = 1,max_features = None,smooth_idf=True,norm='l2',sublinear_tf=False,use_idf=True,ngram_range=(1,3))
    Xr = vec.fit_transform(ry.txt)
    Xt = vec.transform(df.txt)
    # compute distance from sample to index
    D = pairwise_distances(Xt,Xr,metric='cosine')
    a = np.argmin(D,axis=1)                 # nearest index entry per sample
    df['D'] = D[range(len(a)),a]            # distance to that nearest entry
    df['guess'] = ry.Categorie3.values[a]   # its Categorie3 label
    # Sweep thresholds from 0 to 1 (21 steps); since d increases, the first
    # accepted threshold per class is the smallest one, later ones are skipped.
    Dmin = {}
    for d in np.linspace(0,1,21):
        Yr = df[df.D<d].guess          # predicted labels within threshold d
        Yt = df[df.D<d].Categorie3     # true labels within threshold d
        # per-class F1 (average=None) aligned with ry.Categorie3 ordering
        fs = f1_score(Yt,Yr,labels=ry.Categorie3,average=None)
        for i in np.nonzero(fs > F1score)[0]:
            cat3 = ry.Categorie3.values[i]
            if cat3 in Dmin:
                continue    # already accepted at a smaller threshold
            Dmin[cat3] = d
    joblib.dump((vec,Dmin,F1score),ddir+'joblib/index_'+str(cat1))
    del ry,df
    return vec,Dmin,F1score
def index_guessing(dfsample,rayon,cat1,vec,Dmin,default=None):
    """Guess Categorie3 for the rows of dfsample with Categorie1 == cat1.

    Each selected sample is mapped to the nearest catalog entry (cosine
    distance in the TF-IDF space of `vec`); the guess is kept only when the
    distance is below that class's learned threshold from `Dmin`.

    Parameters:
        dfsample : samples DataFrame (Categorie1 + text columns for add_txt);
                   a 'guess' column is created on it if missing (side effect).
        rayon    : catalog DataFrame (Categorie1, Categorie3, Categorie3_Name).
        cat1     : Categorie1 value selecting the subset to guess.
        vec      : fitted TF-IDF vectorizer from index_learning.
        Dmin     : {Categorie3: max accepted cosine distance}.
        default  : value returned for low-confidence samples.

    Returns a list (one entry per selected sample): the guessed Categorie3,
    or `default` when the guess is not confident enough.

    Fix: `default` was accepted but never used — rejected guesses were
    hard-coded to None. They now fall back to `default`; with the default
    value of None, existing callers see identical behavior.
    """
    if 'guess' not in dfsample.columns:
        dfsample['guess'] = None    # ensure the column exists on the caller's frame
    df = dfsample[dfsample.Categorie1 == cat1].copy()
    if len(df)==0:
        return []
    ry = rayon[rayon.Categorie1 == cat1].copy()
    add_txt(df)
    # Python 2: map() returns a list, assigned here as a new column
    ry['txt'] = map(normalize_txt,ry.Categorie3_Name)
    Xr = vec.transform(ry.txt)
    Xt = vec.transform(df.txt)
    D = pairwise_distances(Xt,Xr,metric='cosine')
    a = np.argmin(D,axis=1)                 # nearest catalog entry per sample
    df['guess'] = ry.Categorie3.values[a]
    df['D'] = D[range(len(a)),a]
    # accept a guess only when strictly closer than its per-class threshold;
    # classes absent from Dmin get threshold 0 and are therefore always rejected
    return [r.guess if r.D<Dmin.get(r.guess,0) else default for i,r in df.iterrows()]
# NOTE(review): tail of a training helper — its `def` line is above this chunk.
    # Report training/validation scores, persist the model, free large objects.
    print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
    joblib.dump((labels,vec,cla),fname)
    del vec,cla
    return (sct,scv)

#######################
# training
# stage1 : Categorie1
# stage3 : Categorie3|Categorie1
#######################
# Load the train / validation / test CSVs (Python 2 script; header() supplies
# the column names, missing values become empty strings).
dftrain = pd.read_csv(ddir+'training_sample.csv'+ext,sep=';',names = header()).fillna('')
dfvalid = pd.read_csv(ddir+'validation_sample.csv'+ext,sep=';',names = header()).fillna('')
dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('')
# add the 'txt' column used downstream for vectorization
add_txt(dftrain)
add_txt(dfvalid)
add_txt(dftest)
# keep only the columns needed by the training stages
dftrain = dftrain[['Categorie3','Categorie1','txt']]
dfvalid = dfvalid[['Categorie3','Categorie1','txt']]
dftest = dftest[['Identifiant_Produit','txt']]
# training stage1
dt = -time.time()       # wall-clock timing of stage-1 training
sct,scv = training_stage1(dftrain,dfvalid)
dt += time.time()
print '##################################'
# Imports: joblib for model (de)serialization plus the category lookup tables
# from the project-local utils module.
from sklearn.externals import joblib
from utils import itocat1,itocat2,itocat3
from utils import cat1toi,cat2toi,cat3toi
from utils import cat3tocat2,cat3tocat1,cat2tocat1
from utils import cat1count,cat2count,cat3count
import sys

# base directory holding the cdiscount data files
ddir = '/home/ngaude/workspace/data/cdiscount/'

assert len(sys.argv) == 2  ##### usage guess.py $RESULTAT.CSV ####
rname = sys.argv[1]
assert isfile(ddir+rname)  ##### usage guess.py $RESULTAT.CSV ####

# normalized test set; count distinct words per sample text (Python 2 map -> list)
test_normed = pd.read_csv(ddir+'test_normed.csv',sep=';',names=header(True)).fillna('')
add_txt(test_normed)
test_num_word = map(lambda t:len(set(t.split())),test_normed.txt)

# *_nn variant of the test set: temporarily expose the *_nn columns under the
# names read by add_txt(), then drop them again once 'txt' is built.
test_nn = pd.read_csv(ddir+'test_nn.csv',sep=';').fillna('')
test_nn['Marque'] = test_nn.Marque_nn
test_nn['Libelle'] = test_nn.Libelle_nn
test_nn['Description'] = test_nn.Description_nn
add_txt(test_nn)
nn_num_word = map(lambda t:len(set(t.split())),test_nn.txt)
test_nn.drop('Marque', axis=1, inplace=True)
test_nn.drop('Libelle', axis=1, inplace=True)
test_nn.drop('Description', axis=1, inplace=True)

# current best submission file, to be corrected by this script
best = pd.read_csv(ddir+rname,sep=';')
#best = pd.read_csv('proba.auto.merging.60.csv',sep=';')
#best.Id_Categorie = 1000015309
# Imports: joblib for model (de)serialization plus the category lookup tables
# from the project-local utils module.
from sklearn.externals import joblib
from utils import itocat1, itocat2, itocat3
from utils import cat1toi, cat2toi, cat3toi
from utils import cat3tocat2, cat3tocat1, cat2tocat1
from utils import cat1count, cat2count, cat3count
import sys

# base directory holding the cdiscount data files
ddir = "/home/ngaude/workspace/data/cdiscount/"

assert len(sys.argv) == 2  ##### usage guess.py $RESULTAT.CSV ####
rname = sys.argv[1]
assert isfile(ddir + rname)  ##### usage guess.py $RESULTAT.CSV ####

# normalized test set; count distinct words per sample text (Python 2 map -> list)
test_normed = pd.read_csv(ddir + "test_normed.csv", sep=";", names=header(True)).fillna("")
add_txt(test_normed)
test_num_word = map(lambda t: len(set(t.split())), test_normed.txt)

# *_nn variant of the test set: temporarily expose the *_nn columns under the
# names read by add_txt(), then drop them again once 'txt' is built.
test_nn = pd.read_csv(ddir + "test_nn.csv", sep=";").fillna("")
test_nn["Marque"] = test_nn.Marque_nn
test_nn["Libelle"] = test_nn.Libelle_nn
test_nn["Description"] = test_nn.Description_nn
add_txt(test_nn)
nn_num_word = map(lambda t: len(set(t.split())), test_nn.txt)
test_nn.drop("Marque", axis=1, inplace=True)
test_nn.drop("Libelle", axis=1, inplace=True)
test_nn.drop("Description", axis=1, inplace=True)

# current best submission file, to be corrected by this script
best = pd.read_csv(ddir + rname, sep=";")
# best = pd.read_csv('proba.auto.merging.60.csv',sep=';')
# best.Id_Categorie = 1000015309
@author: ngaude
"""
from utils import header,add_txt
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from utils import itocat1,itocat3
from utils import cat1count,cat2count,cat3count
import time

# base directory for the per-stage probability files
ddir = '/home/ngaude/workspace/data/cdiscount.proba/'

# normalized test set, reduced to id + text
dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('')
add_txt(dftest)
dftest = dftest[['Identifiant_Produit','txt']]

# accumulators: one row per test sample, one column per Categorie3 / Categorie1
stage3_proba_test = np.full(shape=(len(dftest),cat3count),fill_value = 0.,dtype = float)
stage1_proba_test = np.full(shape=(len(dftest),cat1count),fill_value = 0.,dtype = float)

def submit(df,Y):
    # Write a submission CSV (Id_Produit;Id_Categorie) for predictions Y.
    # NOTE(review): relies on a global N for the file name — defined outside this chunk.
    submit_file = ddir+'resultat.auto.merging.'+str(N)+'.csv'
    df['Id_Produit']=df['Identifiant_Produit']
    df['Id_Categorie'] = Y
    df= df[['Id_Produit','Id_Categorie']]
    df.to_csv(submit_file,sep=';',index=False)

def save_proba(df,Y,p1,p3):
    # Persist predictions together with their stage probabilities.
    # NOTE(review): definition continues past this chunk — incomplete here.
    submit_file = ddir+'proba.auto.merging.'+str(N)+'.csv'
    df['Id_Produit']=df['Identifiant_Produit']
# NOTE(review): tail of a training helper — its `def` line and the matching
# `if` branch are above this chunk.
        scv = (-1,0)
    else:
        # performs a gridsearch
        Xvs = [ vec.transform(dfv.txt) for dfv in dfvs]
        Yvs = [ dfv['Categorie3'].values for dfv in dfvs]
        cla,scv = best_classifier(X,Y,Xvs,Yvs)
    # report per-category stats, persist model, free large objects
    print 'training',cat1,'\t\t(',i,') N=',len(dft),'K=',len(labels),': mean =',scv[0],'dev=',scv[1]
    joblib.dump((labels,vec,cla,scv),fname+ext)
    del vec,cla
    return scv

#################
# prepare train #
#################
dftrain = pd.read_csv(ddir+'training_random.csv'+ext,sep=';',names = header()).fillna('')
add_txt(dftrain)
dftrain = dftrain[['Categorie3','Categorie1','txt']]

#################
# prepare valid #
#################
# nine validation folds, each reduced to id + labels + text
dfvs = [pd.read_csv(ddir+'validation_random.csv.'+str(i),sep=';',names = header()).fillna('') for i in range(9)]
for i in range(9):
    add_txt(dfvs[i])
    dfvs[i] = dfvs[i][['Identifiant_Produit','Categorie3','Categorie1','txt']]

#################
# prepare test #
#################
# per-Categorie1 training loop (loop body continues past this chunk)
for i,cat1 in enumerate(np.unique(dftrain.Categorie1)):
assert len(sys.argv) == 2 ##### usage guess.py $PROBA.CSV #### assert isfile(sys.argv[1]) ##### usage guess.py $PROBA.CSV #### pname = sys.argv[1] # pname = 'proba.auto.merging.15.csv' pdir = dirname(pname) ################## # FIXME : ensure that confidence level are the same between logistic regression proba et guessing proba # proba_score = 0.6768667 # <==> # sum(proba.Proba_Categorie3)/len(df) # 0.7525964785959941 ################## rayon = pd.read_csv(ddir+'rayon.csv',sep=';') test = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(True)).fillna('') add_txt(test) proba = pd.read_csv(pname,sep=';') df = test.merge(proba,'left',None,'Identifiant_Produit','Id_Produit') df = df.merge(rayon,'left',None,'Id_Categorie','Categorie3') rg = pd.read_csv(ddir+'rayon_guessing.csv',sep=';') g = rg.groupby('Categorie1') guess_correction = 0 num_correction = 0 best_Categorie3 = df.Categorie3.values #best_Categorie3 = [1000015309]*len(df) for i,r in df.iterrows(): rdf = g.get_group(r.Categorie1)
def bayes_prediction(stage1_log_proba,stage3_log_proba):
    """Add, in place, each Categorie3 column's parent-Categorie1 log-probability
    (chain rule in log space: the parent is looked up via cat3tocat1)."""
    for i in range(stage3_log_proba.shape[1]):
        cat3 = itocat3[i]           # Categorie3 label for column i
        cat1 = cat3tocat1[cat3]     # its parent Categorie1
        j = cat1toi[cat1]           # parent's column in stage1_log_proba
        stage3_log_proba[:,i] += stage1_log_proba[:,j]

bayes_prediction(stage1_log_proba_valid,stage3_log_proba_valid)
# argmax class per validation sample, and its probability (exp of max log-proba)
predict_cat3_valid = [itocat3[i] for i in np.argmax(stage3_log_proba_valid,axis=1)]
proba_cat3_valid = np.exp(np.max(stage3_log_proba_valid,axis=1))
valid['Categorie3_lr'] = predict_cat3_valid
valid['proba_lr'] = proba_cat3_valid
add_txt(valid)

#############################################
# get results from a previously trained
# logistic regression model
#############################################
# head = pd.read_csv(ddir+'training_head.csv',names=header(),sep=';').fillna('')
# head = head[head.Produit_Cdiscount == 1]
# add_txt(head)
# head.to_csv(ddir+'nn_train.csv',sep=';',index=False)
train = pd.read_csv(ddir+'nn_train.csv',sep=';')

#############################################
# vectorize the full text
#
# NOTE(review): tail of a sampling helper — its `def` line is above this chunk.
    # concatenate the collected sample frames and shuffle the rows
    dfsample = pd.concat(dfs)
    dfsample = dfsample.reset_index(drop=True)
    dfsample = dfsample.reindex(np.random.permutation(dfsample.index), copy=False)
    return dfsample

##################
# VECTORIZING
##################

# vectorize dftest
dftest = pd.read_csv(ddir + 'test_normed.csv', sep=';', names=header(test=True)).fillna('')
add_txt(dftest)
vec, Xtest = vectorizer(dftest.txt)

# vectorize dftrain
dftrain = pd.read_csv(ddir + 'training_shuffled_normed.csv', sep=';', names=header()).fillna('')
add_txt(dftrain)
Ytrain = dftrain.Categorie3.values.copy()
IDtrain = dftrain.Identifiant_Produit.values.copy()

# NOTE : memory error work around...
# let's serialize.
joblib.dump((vec, IDtrain, Ytrain), '/tmp/vecIDYtrain')
# sample all samples + oversample the remaining dfs.append(df) df = df.iloc[np.random.randint(0, len(df), size=sample_count-len(df))] dfs.append(df) dfsample = pd.concat(dfs) dfsample = dfsample.reset_index(drop=True) dfsample = dfsample.reindex(np.random.permutation(dfsample.index),copy=False) return dfsample ################## # VECTORIZING ################## # vectorize dftest dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('') add_txt(dftest) vec,Xtest = vectorizer(dftest.txt) # vectorize dftrain dftrain = pd.read_csv(ddir+'training_shuffled_normed.csv',sep=';',names = header()).fillna('') add_txt(dftrain) Ytrain = dftrain.Categorie3.values.copy() IDtrain = dftrain.Identifiant_Produit.values.copy() # NOTE : memory error work around... # let's serialize. joblib.dump((vec,IDtrain,Ytrain),'/tmp/vecIDYtrain') joblib.dump(Xtest,ddir+'joblib/Xtest')