def training_sample_adasyn(df, n=200, mincount=7):
    """Build a shuffled, ADASYN-resampled training set from ``df``.

    ``df`` is vectorized with ``rf_vectorizer``; then, for every distinct
    value of ``df.Y``, ``adasyn_sample(X, Y, cat, K=5, n=n)`` is called and
    its rows are collected together with a matching run of ``cat`` labels.
    The stacked rows and labels are shuffled with one shared permutation.

    NOTE(review): another function with the same name is defined later in
    this file; if both live in the same module the later one wins.

    Args:
        df: data frame accepted by ``rf_vectorizer`` and exposing a ``Y``
            attribute holding the class labels.
        n: forwarded to ``adasyn_sample`` as ``n`` (presumably the number
            of samples wanted per class -- confirm against that helper).
        mincount: unused; kept so existing callers keep working.

    Returns:
        ``(Xt, Yt)``: the stacked feature rows (CSR matrix when any
        per-class block is sparse, ``numpy`` array otherwise) and the
        aligned 1-D label array.
    """
    # Local import so this function does not rely on a module-level alias.
    from scipy import sparse as sp_sparse

    (X, Y) = rf_vectorizer(df)
    Xt = []
    Yt = []
    for i, cat in enumerate(np.unique(df.Y)):
        # Single-argument form prints the same text on Python 2 and 3.
        print('adasyn : %d' % i)
        Xt.append(adasyn_sample(X, Y, cat, K=5, n=n))
        Yt.append([cat] * Xt[-1].shape[0])

    # np.vstack cannot row-stack scipy sparse matrices; use the sparse
    # stacker (CSR so the fancy row indexing below is supported).
    if any(sp_sparse.issparse(block) for block in Xt):
        Xt = sp_sparse.vstack(Xt, format='csr')
    else:
        Xt = np.vstack(Xt)
    Yt = np.concatenate(Yt)

    # One permutation for both arrays keeps rows and labels aligned.
    shuffle = np.random.permutation(len(Yt))
    Xt = Xt[shuffle, :]
    Yt = Yt[shuffle]
    return Xt, Yt
def training_sample_adasyn(df, vec, N=200, mincount=7):
    """Build a shuffled, ADASYN-resampled training set from ``df``.

    ``df.txt`` is transformed with the already-fitted ``vec`` and the labels
    are read from ``df.Categorie3``. For every distinct label,
    ``adasyn_sample(X, Y, cat, K=5, n=N)`` is called and its rows are
    collected together with a matching run of ``cat`` labels. The stacked
    rows and labels are shuffled with one shared permutation.

    Args:
        df: data frame with a ``txt`` column (input to ``vec.transform``)
            and a ``Categorie3`` column (class labels).
        vec: object exposing ``transform``; presumably a fitted text
            vectorizer returning a sparse matrix -- confirm with caller.
        N: forwarded to ``adasyn_sample`` as ``n`` (presumably the number
            of samples wanted per class -- confirm against that helper).
        mincount: unused; kept so existing callers keep working.

    Returns:
        ``(Xt, Yt)``: the stacked feature rows (CSR matrix when any
        per-class block is sparse, ``numpy`` array otherwise) and the
        aligned 1-D label array.
    """
    # Local import so this function does not rely on a module-level alias.
    from scipy import sparse as sp_sparse

    X = vec.transform(df.txt)
    Y = df.Categorie3.values
    Xt = []
    Yt = []
    for i, cat in enumerate(np.unique(Y)):
        # Single-argument form prints the same text on Python 2 and 3.
        print('adasyn : %d' % i)
        Xt.append(adasyn_sample(X, Y, cat, K=5, n=N))
        Yt.append([cat] * Xt[-1].shape[0])

    # np.vstack cannot row-stack scipy sparse matrices; use the sparse
    # stacker (CSR so the fancy row indexing below is supported).
    if any(sp_sparse.issparse(block) for block in Xt):
        Xt = sp_sparse.vstack(Xt, format='csr')
    else:
        Xt = np.vstack(Xt)
    Yt = np.concatenate(Yt)

    # One permutation for both arrays keeps rows and labels aligned.
    shuffle = np.random.permutation(len(Yt))
    Xt = Xt[shuffle, :]
    Yt = Yt[shuffle]
    return Xt, Yt
# Persist the sampled training frame, vectorize its text and cache the result.
dfsample.to_csv(ddir+'training_sup9.csv', sep=';', index=False, header=False)
Y = dfsample.Categorie3.values
ID = dfsample.Identifiant_Produit.values
print('vectorizing...')
vec, X = vectorizer(dfsample.txt)
print('dumping...')
joblib.dump((vec, ID, X, Y), ddir+'joblib/vecIDXY')

# use adasyn to get synthetic balanced dataset
Xt = []
Yt = []
for i, cat in enumerate(np.unique(Y)):
    print('adasyn : %d' % i)
    Xt.append(adasyn_sample(X, Y, cat, K=5, n=200))
    Yt.append([cat] * Xt[-1].shape[0])
# CSR format: the default COO result of sparse.vstack cannot be row-indexed.
Xt = sparse.vstack(Xt, format='csr')
# Flatten the per-category label runs so there is one label per row.
Yt = np.concatenate(Yt)
assert Xt.shape[0] == len(Yt)
# Shuffle row indices (not the matrix itself) and apply the same order to
# the labels so features and labels stay aligned.
rows = random.sample(range(Xt.shape[0]), Xt.shape[0])
Xt = Xt[rows]
Yt = Yt[rows]
joblib.dump((vec, Xt, Yt), ddir+'joblib/vecXtYt')

#################################################
# TRAINING START HERE
#################################################

# NOTE(review): this loads 'vecXtYt_200' while the dump above writes
# 'vecXtYt' -- presumably the file was renamed by hand; confirm.
(vec, X, Y) = joblib.load(ddir+'joblib/vecXtYt_200')
# Map each Categorie3 label to its cat1 value via the cat3tocat1 lookup.
Z = np.array([cat3tocat1[c] for c in Y])