def decision_tree_train(x_train, y_train, x_validation, y_validation, config: EnsembleConfig, tree_id: int = '', sample_weights=None, raw_x_train=None, raw_y_train=None):
    """Fit one balanced decision tree and persist it per the ensemble mode.

    In BAGGING/SINGLE mode the model is simply dumped to disk. In
    ADA_BOOST_M1 mode the weighted training error is computed on the
    full (un-resampled) training set, sample weights are updated per
    AdaBoost.M1, and (sample_weights, err) is returned.

    Returns None for BAGGING/SINGLE modes (train-and-dump only).
    """
    model = tree.DecisionTreeClassifier(class_weight='balanced')
    model.fit(x_train, y_train)
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.mkdir(...)` pair.
    os.makedirs('model', exist_ok=True)
    if config.ensemble_mode == 'BAGGING':
        joblib.dump(model,
                    get_model_dir(config) + 'dtree_' + str(tree_id) + '.pkl')
    elif config.ensemble_mode == 'ADA_BOOST_M1':
        raw_pred = model.predict(raw_x_train)
        # Weighted error: dot of the misclassification mask with weights.
        err = 1. * np.dot(
            np.array(raw_pred) != np.array(raw_y_train), sample_weights)
        print("current model(%s) err: " % str(config), err)
        # BUG FIX: these two prints previously showed `1 - accuracy_score`
        # (i.e. the error rate) while labeling it "acc".
        print("current model(%s) acc on training set: " % str(config),
              accuracy_score(raw_y_train, raw_pred))
        print("current model(%s) acc on validation: " % str(config),
              accuracy_score(y_validation, model.predict(x_validation)))
        # BUG FIX: AdaBoost.M1 requires err < 1/2; the original cutoff of
        # 0.6 admitted worse-than-chance learners (svm_train uses 0.5).
        if err > 0.5:
            return sample_weights, err
        # BUG FIX: beta = err / (1 - err) per AdaBoost.M1; the original
        # divided by (1.3 - err), which breaks the weight update and the
        # log(1/beta) voting weights read back at prediction time.
        beta = err / (1.0 - err)
        # Correctly classified samples are down-weighted by beta (< 1).
        update_weights = [
            1 if raw_y_train[i] != raw_pred[i] else beta
            for i in range(0, len(raw_x_train))
        ]
        sample_weights = np.multiply(sample_weights, update_weights)
        sample_weights = sample_weights / np.sum(
            sample_weights)  # normalization
        joblib.dump(model,
                    get_model_dir(config) + 'dtree_' + str(tree_id) + '.pkl')
        # Persist beta so test()/validation() can recover the vote weight.
        with open(
                get_model_dir(config) + 'beta_' + str(tree_id) + '.txt',
                'w') as f:
            f.write(str(beta))
        print(
            "current model(%s) rmse: " % str(config),
            math.sqrt(
                mean_squared_error(model.predict(x_validation),
                                   y_validation)))
        return sample_weights, err
    elif config.ensemble_mode == "SINGLE":
        joblib.dump(model, get_model_dir(config) + 'dtree.pkl')
    else:
        print("unimplemented in decision_tree_train!")
        exit(0)
    print(
        "current model(%s) rmse: " % str(config),
        math.sqrt(
            mean_squared_error(model.predict(x_validation), y_validation)))
def svm_train(x_train, y_train, x_validation, y_validation, config: EnsembleConfig, svm_id: int = '', sample_weights=None, raw_x_train=None, raw_y_train=None):
    """Fit a balanced one-vs-rest LinearSVC and persist it per the mode.

    BAGGING/SINGLE: dump the fitted model and report validation RMSE.
    ADA_BOOST_M1: additionally compute the weighted training error,
    update the AdaBoost.M1 sample weights, persist beta, and return
    (sample_weights, err).
    """
    # Prefer dual=False when n_samples > n_features.
    model = LinearSVC(multi_class='ovr', class_weight='balanced',
                      verbose=True, dual=False, max_iter=1000)
    model.fit(x_train, y_train)
    mode = config.ensemble_mode
    member_path = get_model_dir(config) + 'svm_' + str(svm_id) + '.pkl'
    if mode == 'BAGGING':
        joblib.dump(model, member_path)
    elif mode == 'ADA_BOOST_M1':
        raw_pred = model.predict(raw_x_train)
        mistakes = np.array(raw_pred) != np.array(raw_y_train)
        err = 1. * np.dot(mistakes, sample_weights)
        if err > 0.5:
            # Learner is no better than chance under the current weights;
            # leave the weights untouched and let the caller decide.
            return sample_weights, err
        beta = err / (1.0 - err)
        # Down-weight correctly classified samples by beta (< 1).
        factors = [beta if raw_pred[i] == raw_y_train[i] else 1
                   for i in range(len(raw_x_train))]
        sample_weights = np.multiply(sample_weights, factors)
        sample_weights = sample_weights / np.sum(
            sample_weights)  # normalization
        joblib.dump(model, member_path)
        beta_path = get_model_dir(config) + 'beta_' + str(svm_id) + '.txt'
        with open(beta_path, 'w') as f:
            f.write(str(beta))
        rmse = math.sqrt(
            mean_squared_error(model.predict(x_validation), y_validation))
        print("current model(%s) rmse: " % str(config), rmse)
        return sample_weights, err
    elif mode == "SINGLE":
        joblib.dump(model, get_model_dir(config) + 'svm_model.pkl')
    else:
        print("unimplemented in svm_train!")
        exit(0)
    rmse = math.sqrt(
        mean_squared_error(model.predict(x_validation), y_validation))
    print("current model(%s) rmse: " % str(config), rmse)
def test(x_test, config: EnsembleConfig):
    """Predict on the test set with the configured ensemble and write a CSV.

    Writes `result/<config>-result.csv` with header `id,predicted` and
    1-based row ids.
    """
    y_predict = []
    if config.ensemble_mode == 'BAGGING':
        res = []
        for model_id in range(0, config.bagging_times):
            res.append(predict(x_test, config, model_id))
        # Average the member predictions per sample.
        res = np.array(res).T
        y_predict = [np.mean(item) for item in res]
    elif config.ensemble_mode == 'ADA_BOOST_M1':
        # Recover each member's vote weight log(1/beta) from the
        # beta_<id>.txt files written at training time.
        files = os.listdir(get_model_dir(config))
        weight = np.array([0.0 for _ in range(0, config.ada_times)])
        ada_len = 0
        for filename in files:
            if filename.split('.')[-1] == 'txt':
                idx = int(filename.split('.')[0].split('_')[1])
                if idx + 1 > ada_len:
                    ada_len = idx + 1
                with open(get_model_dir(config) + filename) as file:
                    weight[idx] = math.log(1 / float(file.read()))
        weight = weight[0:ada_len]
        res = []
        for model_id in range(0, ada_len):
            res.append(predict(x_test, config, model_id))
        res = np.array(res).T
        # Hoisted: np.sum(weight) was recomputed on every inner-loop term.
        weight_sum = np.sum(weight)
        for row in res:
            # Weighted average of the member predictions.
            y_predict.append(np.dot(weight, row) / weight_sum)
    elif config.ensemble_mode == 'SINGLE':  # BUG FIX: was misspelled 'SINGE'
        y_predict = predict(x_test, config)
    if not os.path.exists('result'):
        os.mkdir('result')
    with open('result/' + str(config) + "-result.csv", 'w') as file:
        file.write("id,predicted\n")
        for i in range(0, len(y_predict)):
            file.write(str(i + 1) + ',' + str(y_predict[i]) + '\n')
        # NOTE: removed the redundant file.close(); `with` closes the file.
def svm_predict(words_data, config: EnsembleConfig, model_id: int = None):
    """Load the persisted SVM for the current mode and predict on words_data."""
    mode = config.ensemble_mode
    if mode == 'BAGGING' or mode == "ADA_BOOST_M1":
        # Both ensemble modes store per-member models as svm_<id>.pkl.
        model_path = get_model_dir(config) + 'svm_' + str(model_id) + '.pkl'
    elif mode == 'SINGLE':
        model_path = get_model_dir(config) + 'svm_model.pkl'
    else:
        print("unimplemented in svm_predict!")
        exit(0)
    return joblib.load(model_path).predict(words_data)
def decision_tree_predict(words_data, config: EnsembleConfig, model_id=''):
    """Load the persisted decision tree for the current mode and predict."""
    mode = config.ensemble_mode
    if mode == 'BAGGING' or mode == "ADA_BOOST_M1":
        # Both ensemble modes store per-member models as dtree_<id>.pkl.
        model_path = get_model_dir(config) + 'dtree_' + str(model_id) + '.pkl'
    elif mode == 'SINGLE':
        model_path = get_model_dir(config) + 'dtree.pkl'
    else:
        print("unimplemented in decision_tree_predict!")
        exit(0)
    return joblib.load(model_path).predict(words_data)
def validation(x_validation, y_validation, config: EnsembleConfig):
    """Score the configured ensemble on the validation split (prints RMSE)."""
    if config.ensemble_mode == 'BAGGING':
        res = []
        for model_id in range(0, config.bagging_times):
            res.append(predict(x_validation, config, model_id))
        # Average the member predictions per sample.
        res = np.array(res).T
        y_predict = [np.mean(item) for item in res]
    elif config.ensemble_mode == 'ADA_BOOST_M1':
        # Recover per-member vote weights from the beta_<id>.txt files.
        # NOTE(review): the 0.5 factor differs from test(), but it cancels
        # in the weight_sum normalization below, so results agree.
        files = os.listdir(get_model_dir(config))
        weight = np.array([0.0 for _ in range(0, config.ada_times)])
        ada_len = 0
        for filename in files:
            if filename.split('.')[-1] == 'txt':
                idx = int(filename.split('.')[0].split('_')[1])
                if idx + 1 > ada_len:
                    ada_len = idx + 1
                with open(get_model_dir(config) + filename) as file:
                    weight[idx] = 0.5 * math.log(1 / float(file.read()))
        weight = weight[0:ada_len]
        res = []
        for model_id in range(0, ada_len):
            res.append(predict(x_validation, config, model_id))
        res = np.array(res).T
        # Hoisted: np.sum(weight) was recomputed on every inner-loop term.
        weight_sum = np.sum(weight)
        y_predict = [np.dot(weight, row) / weight_sum for row in res]
    elif config.ensemble_mode == 'SINGLE':
        y_predict = predict(x_validation, config)
    else:
        # BUG FIX: an unrecognized mode previously fell through to the
        # print below and raised NameError on y_predict; fail explicitly
        # like the other functions in this module.
        print("unimplemented in validation!")
        exit(0)
    from sklearn.metrics import mean_squared_error
    print(str(config), ' rmse on validation set: ',
          math.sqrt(mean_squared_error(y_predict, y_validation)))