def test_hard_vote():
    """Ensemble the base models by hard (majority) voting.

    Each base model contributes one predicted label per test sample; the
    most frequent label wins (ties broken by whichever label ``max`` sees
    first in the vote set). Prints the held-out accuracy.
    """
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # Each model yields (train_probs, test_probs) pairs; regroup so that
    # probs[0] holds all train frames and probs[1] all test frames.
    # list() is required: on Python 3, zip() returns a one-shot iterator
    # that does not support indexing.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    test_probs = probs[1]
    print(len(test_probs))
    # Per-model hard prediction: the column label with the highest
    # probability for each row.
    preds = [frame.idxmax(1) for frame in test_probs]
    # np.intp instead of np.int8: int8 silently wraps past 127 classes.
    pred = np.zeros(len(preds[0]), dtype=np.intp)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        # Majority vote over this sample's per-model predictions.
        pred[i] = max(set(votes), key=votes.count)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    # Map encoded class indices back to the original label values.
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def predict():
    """Stack the base models' class probabilities with logistic regression
    and write the test-set predictions to ``av_submission.csv``.

    Uses the full training data (no held-out labels); the submission file
    has columns ``id`` and ``cuisine``.
    """
    X, y, test_X, ids = get_predict_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="p_")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="p_")
    # Regroup the models' (train, test) probability pairs. list() is
    # required on Python 3, where zip() returns a non-indexable iterator.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    # Concatenate the per-model probability frames side by side to form
    # the meta-features for the second-level classifier.
    train_attr = pd.concat(train_probs, axis=1)
    print(train_attr.shape)
    print(type(train_attr))
    test_attr = pd.concat(test_probs, axis=1)
    print(test_attr.shape)
    print(type(test_attr))
    clf = LogisticRegression()
    clf.fit(train_attr, y)
    pred = clf.predict(test_attr)
    result = pd.DataFrame({'id': ids, 'cuisine': pred})
    # 'cols' and 'engine' are not valid DataFrame.to_csv arguments in
    # modern pandas; the column order is fixed by the selection instead.
    result[['id', 'cuisine']].to_csv("av_submission.csv", index=False)
def test_vote_soft():
    """Ensemble the base models by soft voting.

    Sums the per-class probability frames elementwise across models and
    picks the argmax class per sample. Prints the held-out accuracy.
    """
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # Regroup the models' (train, test) probability pairs. list() is
    # required on Python 3, where zip() returns a non-indexable iterator.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    # Elementwise sum of the probability frames. sum() replaces the bare
    # reduce(), which is a NameError on Python 3 (functools.reduce there),
    # and is equivalent for elementwise DataFrame addition.
    test_attr = sum(test_probs)
    # Argmax class per row, then decode back to the original labels.
    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def test_direct_attribute():
    """Evaluate the DirectAttributeClassifier alone on the held-out split."""
    train_X, train_y, holdout_X, holdout_Y = get_test_data()
    classifier = DirectAttributeClassifier()
    classifier.test(train_X, train_y, holdout_X, holdout_Y)
def test():
    """Build stacked meta-features from the base models' class
    probabilities on the held-out split and print their shapes.

    NOTE(review): the tail of this function was a graveyard of
    triple-quote-commented experiments (grid searches over SVC /
    LogisticRegression / SGD, and benchmark() runs of RandomForest,
    GradientBoosting, DecisionTree, AdaBoost, Bagging, Perceptron and
    GaussianNB, each fitting on train_attr/y and scoring on
    test_attr/test_Y). That dead code has been removed; recover it from
    version control if an experiment needs to be rerun.
    """
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # Regroup the models' (train, test) probability pairs. list() is
    # required on Python 3, where zip() returns a non-indexable iterator.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    # Side-by-side concatenation of the per-model probability frames
    # forms the meta-feature matrices for a second-level classifier.
    train_attr = pd.concat(train_probs, axis=1)
    print(train_attr.shape)
    print(type(train_attr))
    test_attr = pd.concat(test_probs, axis=1)
    print(test_attr.shape)
    print(type(test_attr))