def model_lda(train, test, label):
    """Fit a linear discriminant analysis model and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``LinearDiscriminantAnalysis``.
    """
    classifier = discriminant_analysis.LinearDiscriminantAnalysis()
    features, target = train[label], train['hotel_cluster']
    classifier.fit(features, target)
    probabilities = classifier.predict_proba(test[label])
    return util.best_proba(probabilities), classifier
def model_knn(train, test, label):
    """Fit a 30-nearest-neighbours classifier and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``KNeighborsClassifier``.
    """
    classifier = KNeighborsClassifier(n_neighbors=30)
    features, target = train[label], train['hotel_cluster']
    classifier.fit(features, target)
    probabilities = classifier.predict_proba(test[label])
    return util.best_proba(probabilities), classifier
def model_reglog(train, test, label):
    """Fit a default logistic regression and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``LogisticRegression``.
    """
    classifier = linear_model.LogisticRegression()
    features, target = train[label], train['hotel_cluster']
    classifier.fit(features, target)
    probabilities = classifier.predict_proba(test[label])
    return util.best_proba(probabilities), classifier
def model_lasso(train, test, label):
    """Fit a strongly regularised logistic regression and score the test rows.

    The estimator is built with ``C=0.01`` and ``penalty="l2"``.

    NOTE(review): the function name says "lasso", but the penalty passed
    to the estimator is "l2" -- confirm which one is intended.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``LogisticRegression``.
    """
    regularisation = 0.01
    classifier = linear_model.LogisticRegression(C=regularisation,
                                                 penalty="l2")
    features, target = train[label], train['hotel_cluster']
    classifier.fit(features, target)
    probabilities = classifier.predict_proba(test[label])
    return util.best_proba(probabilities), classifier
def model_rforest(train, test, label):
    """Fit a 10-tree random forest (gini criterion) and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``RandomForestClassifier``.
    """
    forest = ske.RandomForestClassifier(n_estimators=10, criterion="gini")
    features, target = train[label], train['hotel_cluster']
    forest.fit(features, target)
    probabilities = forest.predict_proba(test[label])
    return util.best_proba(probabilities), forest
def model_SVM(train, test, label):
    """Fit an RBF-kernel SVC with probability estimates and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``SVC``.
    """
    # probability=True is what makes predict_proba available below.
    classifier = svm.SVC(kernel='rbf', probability=True)
    features, target = train[label], train['hotel_cluster']
    classifier.fit(features, target)
    probabilities = classifier.predict_proba(test[label])
    return util.best_proba(probabilities), classifier
def model_labelprop(train, test, label):
    """Fit an RBF-kernel label propagation model and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``LabelPropagation``.
    """
    propagator = sm.LabelPropagation(kernel='rbf')
    features, target = train[label], train['hotel_cluster']
    propagator.fit(features, target)
    probabilities = propagator.predict_proba(test[label])
    return util.best_proba(probabilities), propagator
def model_adaboost(train, test, label):
    """Fit an AdaBoost classifier and score the test rows.

    The estimator uses ``learning_rate=0.1`` and ``n_estimators=100``.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``AdaBoostClassifier``.
    """
    booster = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
    features, target = train[label], train['hotel_cluster']
    booster.fit(features, target)
    probabilities = booster.predict_proba(test[label])
    return util.best_proba(probabilities), booster
def model_dec_tree(train, test, label):
    """Fit an unbounded-depth gini decision tree and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train["hotel_cluster"]`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=None,
        max_features=None,
    )
    features, target = train[label], train["hotel_cluster"]
    tree.fit(features, target)
    probabilities = tree.predict_proba(test[label])
    return util.best_proba(probabilities), tree
def model_gradboost(train, test, label):
    """Fit a 10-stage gradient boosting classifier and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``GradientBoostingClassifier``.
    """
    booster = GradientBoostingClassifier(n_estimators=10)
    features, target = train[label], train['hotel_cluster']
    booster.fit(features, target)
    probabilities = booster.predict_proba(test[label])
    return util.best_proba(probabilities), booster
def model_xgb(train, test, label):
    """Fit an XGBoost classifier (4 threads, 10 estimators) and score the test rows.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target). Presumably a pandas
            DataFrame -- confirm against callers.
        test: Table providing ``test[label]``; its ``shape[0]`` is used as
            the number of rows kept after reshaping the predictions.
        label: Feature column selector applied to both tables.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the reshaped prediction
        array, and the fitted ``XGBClassifier``.
    """
    # `sklearn` here is presumably xgboost's scikit-learn wrapper module
    # (the stock scikit-learn package has no XGBClassifier) -- TODO confirm.
    # The local is not called `xgb` so it cannot shadow a module alias.
    classifier = sklearn.XGBClassifier(nthread=4, n_estimators=10)
    classifier.fit(train[label], train['hotel_cluster'])
    prediction = classifier.predict_proba(test[label])

    # Kept from the original: transpose the prediction table, then keep at
    # most the last `n_test_rows` rows of the transposed table.
    # NOTE(review): the intent of this reshaping is not evident from this
    # function alone -- verify against the predict_proba output shape.
    n_test_rows = test[label].shape[0]
    reshaped = pd.DataFrame(prediction).transpose().tail(n_test_rows)

    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the
    # same ndarray and exists on both old and new pandas versions.
    return util.best_proba(reshaped.values), classifier
def error(weight, matrix_pred, test):
    """Return ``1 - util.mapk`` for a weighted sum of prediction matrices.

    Each entry of ``matrix_pred`` is multiplied by the matching entry of
    ``weight``, the products are summed, and the sum is passed through
    ``util.best_proba``. The result is compared with the true clusters
    using ``util.mapk``.

    Args:
        weight: Sequence of multipliers, one per prediction matrix.
        matrix_pred: Sequence of prediction arrays that can be summed
            element-wise (presumably all of identical shape -- confirm).
        test: Table providing the true values as ``test['hotel_cluster']``.

    Returns:
        ``1 - util.mapk(actual, clusters)``, where ``actual`` holds each
        true cluster wrapped in a single-element list.
    """
    # Build real lists instead of relying on map() being eager: on Python 3
    # map() is lazy, so np.sum would receive an iterator rather than arrays.
    weighted = [np.multiply(w, pred) for w, pred in zip(weight, matrix_pred)]
    prediction_final = np.sum(weighted, axis=0)
    clusters = util.best_proba(prediction_final)

    # One single-element list per row. The original filled a range() in
    # place via map(), which does nothing (and cannot work) on Python 3.
    actual = [[cluster] for cluster in test['hotel_cluster']]
    return 1 - util.mapk(actual, clusters)
def model_neural(train, test, label):
    """Train a three-layer dense network and score the test rows.

    The network stacks a 12-unit ReLU layer, a ``len(label)``-unit ReLU
    layer and a single sigmoid unit, compiled with binary cross-entropy
    and the Adam optimizer, then trained for 150 epochs with batch size 10.

    NOTE(review): a single sigmoid output with binary cross-entropy is used
    on ``train['hotel_cluster']`` -- confirm this suits the target's values.
    NOTE(review): unlike the sibling ``model_*`` functions, only the
    ``util.best_proba`` result is returned, not the fitted model.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target).
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector; its length sets the input dimension.

    Returns:
        ``util.best_proba`` applied to the network's ``predict_proba`` output.
    """
    n_features = len(label)
    network = Sequential()
    layers = [
        Dense(12, input_dim=n_features, init='uniform', activation='relu'),
        Dense(n_features, init='uniform', activation='relu'),
        Dense(1, init='uniform', activation='sigmoid'),
    ]
    for layer in layers:
        network.add(layer)

    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    features, target = train[label], train['hotel_cluster']
    network.fit(features, target, nb_epoch=150, batch_size=10)

    probabilities = network.predict_proba(test[label])
    return util.best_proba(probabilities)
def model_bagging(train, test, label, grid_elt=None):
    """Fit a bagging ensemble of k-nearest-neighbour classifiers.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target).
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.
        grid_elt: Optional indexable of hyper-parameters read as
            ``(n_estimators, max_samples, max_features)`` from positions
            0, 1 and 2. When ``None``, ``(30, 1.0, 0.1)`` is used.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``BaggingClassifier``.
    """
    if grid_elt is None:
        n_estimators, max_samples, max_features = 30, 1.0, 0.1
    else:
        n_estimators = grid_elt[0]
        max_samples = grid_elt[1]
        max_features = grid_elt[2]

    bagger = ensemble.BaggingClassifier(
        KNeighborsClassifier(),
        n_jobs=1,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
    )
    features, target = train[label], train['hotel_cluster']
    bagger.fit(features, target)
    probabilities = bagger.predict_proba(test[label])
    return util.best_proba(probabilities), bagger
def model_voting(train, test, label, grid_elt=None):
    """Fit a soft-voting ensemble of three classifiers and score the test rows.

    The ensemble combines logistic regression (``random_state=1``), a
    10-stage gradient boosting classifier and a 30-nearest-neighbours
    classifier.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target).
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.
        grid_elt: Optional voting weights passed straight to
            ``VotingClassifier``. When ``None``, ``[1, 1, 1]`` is used.

    Returns:
        A 2-tuple: ``util.best_proba`` applied to the predicted class
        probabilities, and the fitted ``VotingClassifier``.
    """
    members = [
        ('lr', LogisticRegression(random_state=1)),
        ('gb', GradientBoostingClassifier(n_estimators=10)),
        ('knn', KNeighborsClassifier(n_neighbors=30)),
    ]
    voting_weights = [1, 1, 1] if grid_elt is None else grid_elt

    voter = VotingClassifier(estimators=members,
                             voting='soft',
                             weights=voting_weights)
    features, target = train[label], train['hotel_cluster']
    voter.fit(features, target)
    probabilities = voter.predict_proba(test[label])
    return util.best_proba(probabilities), voter
def model_weighted(train, test, label, weight=None):
    """Blend three classifiers' probabilities with per-model weights.

    Fits a default logistic regression, a 10-stage gradient boosting
    classifier and a 10-nearest-neighbours classifier, multiplies each
    model's predicted probabilities by its weight, sums the products and
    passes the sum to ``util.best_proba``.

    The original body hard-coded ``weight = []``, so there were no weights
    to multiply by and the blend could not be computed. Weights are now an
    optional argument that defaults to equal weighting.

    Args:
        train: Table indexed as ``train[label]`` (features) and
            ``train['hotel_cluster']`` (target).
        test: Table providing ``test[label]`` to predict on.
        label: Feature column selector applied to both tables.
        weight: Optional sequence of three multipliers, ordered as
            (logistic regression, gradient boosting, k-NN). Defaults to
            ``[1, 1, 1]``.

    Returns:
        ``util.best_proba`` applied to the weighted sum of probabilities.
        No fitted estimator is returned.

    Raises:
        ValueError: If ``weight`` does not contain exactly one entry per
            model.
    """
    estimators = [
        linear_model.LogisticRegression(),
        GradientBoostingClassifier(n_estimators=10),
        KNeighborsClassifier(n_neighbors=10),
    ]
    if weight is None:
        weight = [1, 1, 1]
    # Check before fitting so a bad weight vector fails fast.
    if len(weight) != len(estimators):
        raise ValueError(
            "weight must have %d entries, got %d"
            % (len(estimators), len(weight))
        )

    features, target = train[label], train['hotel_cluster']
    matrix_pred = []
    for estimator in estimators:
        estimator.fit(features, target)
        matrix_pred.append(estimator.predict_proba(test[label]))

    # A real list (not a lazy map object) so np.sum sees the arrays.
    weighted = [np.multiply(w, pred) for w, pred in zip(weight, matrix_pred)]
    prediction_final = np.sum(weighted, axis=0)
    return util.best_proba(prediction_final)