# Example #1
def decision_tree_train(x_train,
                        y_train,
                        x_validation,
                        y_validation,
                        config: EnsembleConfig,
                        tree_id='',
                        sample_weights=None,
                        raw_x_train=None,
                        raw_y_train=None):
    """Train one decision tree and persist it according to the ensemble mode.

    Modes (config.ensemble_mode):
      * 'BAGGING'      - fit on the bootstrap sample and dump the model.
      * 'ADA_BOOST_M1' - fit, compute the weighted training error on the
        full (raw) training set, update and renormalize sample_weights,
        dump the model and its beta; returns (sample_weights, err).
      * 'SINGLE'       - fit once and dump a single model.

    tree_id indexes the model file inside the ensemble (stringified into
    the filename, so '' is a valid default).
    """
    model = tree.DecisionTreeClassifier(class_weight='balanced')

    model.fit(x_train, y_train)
    if not os.path.exists('model'):
        os.mkdir('model')

    if config.ensemble_mode == 'BAGGING':
        joblib.dump(model,
                    get_model_dir(config) + 'dtree_' + str(tree_id) + '.pkl')
    elif config.ensemble_mode == 'ADA_BOOST_M1':
        # Weighted training error of this round's hypothesis on the raw set.
        raw_pred = model.predict(raw_x_train)
        err = 1. * np.dot(
            np.array(raw_pred) != np.array(raw_y_train), sample_weights)
        print("current model(%s) err: " % str(config), err)
        # BUGFIX: these two prints previously reported 1 - accuracy (the
        # error rate) under an "acc" label; report the actual accuracy.
        print("current model(%s) acc on training set: " % str(config),
              accuracy_score(raw_y_train, raw_pred))
        print("current model(%s) acc on validation: " % str(config),
              accuracy_score(y_validation, model.predict(x_validation)))
        # AdaBoost.M1 requires err < 1/2; discard this round otherwise.
        # BUGFIX: threshold was 0.6 and beta used (1.3 - err), deviating
        # from AdaBoost.M1 and from svm_train's implementation.
        if err > 0.5:
            return sample_weights, err
        beta = err / (1.0 - err)
        # Correctly classified samples are down-weighted by beta (< 1);
        # misclassified samples keep weight 1 before renormalization.
        update_weights = [
            1 if raw_y_train[i] != raw_pred[i] else beta
            for i in range(0, len(raw_x_train))
        ]
        sample_weights = np.multiply(sample_weights, update_weights)
        sample_weights = sample_weights / np.sum(
            sample_weights)  # normalization
        joblib.dump(model,
                    get_model_dir(config) + 'dtree_' + str(tree_id) + '.pkl')
        # Persist beta so prediction-time voting weights (log(1/beta))
        # can be reconstructed.
        with open(
                get_model_dir(config) + 'beta_' + str(tree_id) + '.txt',
                'w') as f:
            f.write(str(beta))
        print(
            "current model(%s) rmse: " % str(config),
            math.sqrt(
                mean_squared_error(model.predict(x_validation), y_validation)))
        return sample_weights, err
    elif config.ensemble_mode == "SINGLE":
        joblib.dump(model, get_model_dir(config) + 'dtree.pkl')
    else:
        print("unimplemented in decision_tree_train!")
        exit(0)

    print(
        "current model(%s) rmse: " % str(config),
        math.sqrt(mean_squared_error(model.predict(x_validation),
                                     y_validation)))
# Example #2
def svm_train(x_train,
              y_train,
              x_validation,
              y_validation,
              config: EnsembleConfig,
              svm_id: int = '',
              sample_weights=None,
              raw_x_train=None,
              raw_y_train=None):
    """Fit one linear SVM and persist it per the configured ensemble mode.

    'BAGGING' and 'SINGLE' just dump the fitted model. 'ADA_BOOST_M1'
    additionally computes the weighted training error, reweights and
    renormalizes sample_weights, stores beta for vote weighting, and
    returns (sample_weights, err).
    """
    # Prefer dual=False when n_samples > n_features.
    model = LinearSVC(multi_class='ovr',
                      class_weight='balanced',
                      verbose=True,
                      dual=False,
                      max_iter=1000)
    model.fit(x_train, y_train)

    member_path = get_model_dir(config) + 'svm_' + str(svm_id) + '.pkl'

    if config.ensemble_mode == 'BAGGING':
        joblib.dump(model, member_path)
    elif config.ensemble_mode == 'ADA_BOOST_M1':
        raw_pred = model.predict(raw_x_train)
        # Weighted error: sum of sample weights over misclassified points.
        mistakes = np.array(raw_pred) != np.array(raw_y_train)
        err = 1. * np.dot(mistakes, sample_weights)
        # AdaBoost.M1 discards rounds with weighted error above 1/2.
        if err > 0.5:
            return sample_weights, err
        beta = err / (1.0 - err)
        # Down-weight correct samples by beta; leave mistakes at 1.
        update_weights = [
            beta if truth == pred else 1
            for truth, pred in zip(raw_y_train, raw_pred)
        ]
        sample_weights = np.multiply(sample_weights, update_weights)
        sample_weights = sample_weights / np.sum(
            sample_weights)  # normalization
        joblib.dump(model, member_path)
        beta_path = get_model_dir(config) + 'beta_' + str(svm_id) + '.txt'
        with open(beta_path, 'w') as f:
            f.write(str(beta))
        print(
            "current model(%s) rmse: " % str(config),
            math.sqrt(
                mean_squared_error(model.predict(x_validation), y_validation)))

        return sample_weights, err
    elif config.ensemble_mode == "SINGLE":
        joblib.dump(model, get_model_dir(config) + 'svm_model.pkl')
    else:
        print("unimplemented in svm_train!")
        exit(0)
    print(
        "current model(%s) rmse: " % str(config),
        math.sqrt(mean_squared_error(model.predict(x_validation),
                                     y_validation)))
# Example #3
def test(x_test, config: EnsembleConfig):
    """Run the configured ensemble on x_test and write result/<config>-result.csv.

    'BAGGING' averages the member predictions; 'ADA_BOOST_M1' takes a
    weighted average with per-model weight log(1/beta) read back from the
    beta_<id>.txt files; 'SINGLE' uses the single stored model.
    """
    y_predict = []
    if config.ensemble_mode == 'BAGGING':
        res = []
        for model_id in range(0, config.bagging_times):
            res.append(predict(x_test, config, model_id))
        # Average the per-model ratings for each sample (rows = samples).
        res = np.array(res).T
        y_predict = [np.mean(item) for item in res]

    elif config.ensemble_mode == 'ADA_BOOST_M1':
        # Recover the vote weight log(1/beta) for each stored round; some
        # rounds may have been discarded, so infer the actual ensemble
        # length from the highest beta-file index present.
        files = os.listdir(get_model_dir(config))
        weight = np.array([0.0 for _ in range(0, config.ada_times)])
        ada_len = 0
        for filename in files:
            if filename.split('.')[-1] == 'txt':
                idx = int(filename.split('.')[0].split('_')[1])
                if idx + 1 > ada_len:
                    ada_len = idx + 1
                with open(get_model_dir(config) + filename) as file:
                    weight[idx] = math.log(1 / float(file.read()))

        weight = weight[0:ada_len]
        res = []
        for model_id in range(0, ada_len):
            res.append(predict(x_test, config, model_id))
        res = np.array(res).T
        # Weight-normalized average of the member predictions per sample.
        for i in range(0, len(res)):
            avg = 0.0
            for j in range(0, ada_len):
                avg += (weight[j] * res[i][j])/np.sum(weight)
            y_predict.append(avg)
    # BUGFIX: mode string was 'SINGE', which never matched 'SINGLE' and
    # silently produced an empty prediction file in single-model mode.
    elif config.ensemble_mode == 'SINGLE':
        y_predict = predict(x_test, config)

    if not os.path.exists('result'):
        os.mkdir('result')
    # BUGFIX: removed redundant file.close() inside the with-block.
    with open('result/' + str(config) + "-result.csv", 'w') as file:
        file.write("id,predicted\n")
        for i in range(0, len(y_predict)):
            file.write(str(i + 1) + ',' + str(y_predict[i]) + '\n')
# Example #4
def svm_predict(words_data, config: EnsembleConfig, model_id: int = None):
    """Load the stored SVM for the given ensemble mode and predict words_data.

    model_id selects the ensemble member file for 'BAGGING' and
    'ADA_BOOST_M1'; 'SINGLE' loads the single svm_model.pkl.
    """
    mode = config.ensemble_mode
    if mode == 'BAGGING' or mode == 'ADA_BOOST_M1':
        # Both ensemble modes store members under the same naming scheme.
        model_path = get_model_dir(config) + 'svm_' + str(model_id) + '.pkl'
    elif mode == 'SINGLE':
        model_path = get_model_dir(config) + 'svm_model.pkl'
    else:
        print("unimplemented in svm_predict!")
        exit(0)
    model = joblib.load(model_path)
    result = model.predict(words_data)
    return result
# Example #5
def decision_tree_predict(words_data, config: EnsembleConfig, model_id=''):
    """Load the stored decision tree for the ensemble mode and predict.

    model_id selects the member file for 'BAGGING'/'ADA_BOOST_M1';
    'SINGLE' loads the single dtree.pkl.
    """
    mode = config.ensemble_mode
    if mode == 'BAGGING' or mode == 'ADA_BOOST_M1':
        # Both ensemble modes share one member-file naming scheme.
        model_path = get_model_dir(config) + 'dtree_' + str(model_id) + '.pkl'
    elif mode == 'SINGLE':
        model_path = get_model_dir(config) + 'dtree.pkl'
    else:
        print("unimplemented in decision_tree_predict!")
        exit(0)
    model = joblib.load(model_path)
    result = model.predict(words_data)
    return result
# Example #6
def validation(x_validation, y_validation, config: EnsembleConfig):
    """Evaluate the configured ensemble on the validation set and print RMSE.

    Mirrors test(): 'BAGGING' averages member predictions, 'ADA_BOOST_M1'
    takes a beta-weighted average (weights 0.5*log(1/beta), normalized so
    the scale factor cancels), 'SINGLE' uses the single stored model.
    """
    from sklearn.metrics import accuracy_score
    if config.ensemble_mode == 'BAGGING':
        res = []
        for model_id in range(0, config.bagging_times):
            res.append(predict(x_validation, config, model_id))
        # Average per-model ratings for each sample (rows = samples).
        res = np.array(res).T
        y_predict = [np.mean(item) for item in res]

    elif config.ensemble_mode == 'ADA_BOOST_M1':
        # Recover vote weights from the stored beta files; the ensemble
        # may be shorter than config.ada_times if rounds were discarded.
        files = os.listdir(get_model_dir(config))
        weight = np.array([0.0 for _ in range(0, config.ada_times)])
        ada_len = 0
        for filename in files:
            if filename.split('.')[-1] == 'txt':
                idx = int(filename.split('.')[0].split('_')[1])
                if idx + 1 > ada_len:
                    ada_len = idx + 1
                with open(get_model_dir(config) + filename) as file:
                    weight[idx] = 0.5 * math.log(1 / float(file.read()))

        weight = weight[0:ada_len]
        res = []
        for model_id in range(0, ada_len):
            res.append(predict(x_validation, config, model_id))
        res = np.array(res).T
        y_predict = []
        # Weight-normalized average of member predictions per sample.
        for i in range(0, len(res)):
            avg = 0.0
            for j in range(0, ada_len):
                avg += (weight[j] * res[i][j])/np.sum(weight)
            y_predict.append(avg)
    elif config.ensemble_mode == 'SINGLE':
        y_predict = predict(x_validation, config)
    else:
        # BUGFIX: an unknown mode previously fell through to the print
        # below with y_predict unbound, raising NameError; fail explicitly
        # like the other ensemble entry points.
        print("unimplemented in validation!")
        exit(0)
    from sklearn.metrics import mean_squared_error
    print(str(config), ' rmse on validation set: ', math.sqrt(mean_squared_error(y_predict, y_validation)))