Example #1
# Imports assumed by this snippet (data_preprocessing and prediction are
# project-local modules):
import logging

import numpy as np
from sklearn.metrics import mean_squared_error

import data_preprocessing
import prediction


def pred_eval():
    """Evaluate the prediction results of the three trained models.

    Returns:
        tuple: RMSE of the Linear, Decision Tree, and Random Forest models,
        in that order.
    """
    logging.info("Evaluating the housing value predictions of all three models....")
    _, housing_labels = data_preprocessing.data_preprocess()

    (
        Linear_Model_prediction,
        DT_Model_prediction,
        RF_Model_prediction,
    ) = prediction.predict()
    lin_mse = mean_squared_error(housing_labels, Linear_Model_prediction)
    lin_rmse = np.sqrt(lin_mse)
    # lin_mae = mean_absolute_error(housing_labels, Linear_Model_prediction)
    tree_mse = mean_squared_error(housing_labels, DT_Model_prediction)
    tree_rmse = np.sqrt(tree_mse)
    _, y_test = data_preprocessing.rfdata()
    final_mse = mean_squared_error(y_test, RF_Model_prediction)
    final_rmse = np.sqrt(final_mse)
    return lin_rmse, tree_rmse, final_rmse
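pred_eval returns three plain floats; a short, hypothetical usage sketch that tabulates them with pandas:

import pandas as pd

# Collect the three RMSE values into a one-row-per-model summary table.
lin_rmse, tree_rmse, final_rmse = pred_eval()
rmse_table = pd.DataFrame({
    'model': ['Linear', 'Decision Tree', 'Random Forest'],
    'rmse': [lin_rmse, tree_rmse, final_rmse],
})
print(rmse_table)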
Example #2
# Imports assumed by this snippet; data_preprocess, load_data, Ave_Model, and
# DeepLearningModel are project-local helpers expected to be in scope.
import numpy as np
import pandas as pd


def PredictByAllModel():
    train_df, test_df, sale_price, _ = data_preprocess()
    x_train, y_train, x_test, oridata = load_data()
    ave_model = Ave_Model()
    ave_model.fit(x_train, y_train)
    y_predict1 = np.array(ave_model.predict(x_test))
    deep_learning_model = DeepLearningModel(x_train.shape[1])
    deeplearningModel = deep_learning_model.build_model()
    deeplearningModel.fit(x_train, y_train, epochs=300)
    deeplearningModel.save('model.h5')
    # Predict the whole test set in a single batch and flatten to a 1-D array.
    y_predict2 = deeplearningModel.predict(x_test).reshape((-1,))
    w1 = 0.9
    w2 = 0.1
    y_predict = y_predict1 * w1 + y_predict2 * w2
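    # Undo the scaling presumably applied during preprocessing: reverse the
    # min-max normalization, then invert the log1p transform with expm1.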
    y_predict *= (oridata['SalePrice']['max'] - oridata['SalePrice']['min'])
    y_predict += oridata['SalePrice']['min']
    y_predict = np.expm1(y_predict)
    submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_predict})
    print(submission_df.head(10))
    submission_df.to_csv('submission11.csv', index=False)
Example #3
def predict():
    housing_prepared, housing_labels = data_preprocessing.data_preprocess()
    linear, dt, rnd = train_model.model_train()
    Linear_Model_prediction = linear.predict(housing_prepared)
    DT_Model_prediction = dt.predict(housing_prepared)
    X_test_prepared, y_test = data_preprocessing.rfdata()
    RF_Model_prediction = rnd.predict(X_test_prepared)
    return Linear_Model_prediction, DT_Model_prediction, RF_Model_prediction
Example #4
def model_train():
    """Main function to train all three models.

    Returns:
        tuple: The fitted Linear, Decision Tree, and Random Forest models.
    """
    logging.info("Running the main function for training the models....")
    housing_prepared, housing_labels = data_preprocessing.data_preprocess()
    linear = linear_model_(housing_prepared, housing_labels)
    dt = dtreg(housing_prepared, housing_labels)
    rnd = rnd_forest(housing_prepared, housing_labels)
    return linear, dt, rnd
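The helpers linear_model_, dtreg, and rnd_forest are not shown on this page; a minimal sketch, assuming they simply fit the corresponding scikit-learn regressors:

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

def linear_model_(X, y):
    # Fit an ordinary least squares model on the prepared features.
    return LinearRegression().fit(X, y)

def dtreg(X, y):
    # Fit a decision tree regressor; the fixed seed keeps runs reproducible.
    return DecisionTreeRegressor(random_state=42).fit(X, y)

def rnd_forest(X, y):
    # Fit a random forest regressor with default hyperparameters.
    return RandomForestRegressor(random_state=42).fit(X, y)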
Example #5
def PredictByAve_Model():
    train_df, test_df, sale_price, _ = data_preprocess()
    x_train, y_train, x_test, oridata = load_data()
    ave_model = Ave_Model()
    score = rmse_cv(ave_model, x_train, y_train)
    print(score.mean())
    ave_model.fit(x_train, y_train)
    y_predict = np.array(ave_model.predict(x_test))
    y_predict *= (oridata['SalePrice']['max'] - oridata['SalePrice']['min'])
    y_predict += oridata['SalePrice']['min']
    y_predict = np.expm1(y_predict)
    submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_predict})
    print(submission_df.head(10))
    submission_df.to_csv('submission7.csv', index=False)
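rmse_cv is not defined on this page; a minimal sketch of the usual cross-validated RMSE helper, built on scikit-learn (an assumption about its behavior):

import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y, folds=5):
    # scikit-learn maximizes scores, so MSE is exposed as a negative value;
    # negate it back and take the square root to get per-fold RMSE.
    mse = -cross_val_score(model, X, y,
                           scoring="neg_mean_squared_error", cv=folds)
    return np.sqrt(mse)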
Example #6
def pred_eval():
    _, housing_labels = data_preprocessing.data_preprocess()

    (
        Linear_Model_prediction,
        DT_Model_prediction,
        RF_Model_prediction,
    ) = prediction.predict()
    lin_mse = mean_squared_error(housing_labels, Linear_Model_prediction)
    lin_rmse = np.sqrt(lin_mse)
    # lin_mae = mean_absolute_error(housing_labels, Linear_Model_prediction)
    tree_mse = mean_squared_error(housing_labels, DT_Model_prediction)
    tree_rmse = np.sqrt(tree_mse)
    _, y_test = data_preprocessing.rfdata()
    final_mse = mean_squared_error(y_test, RF_Model_prediction)
    final_rmse = np.sqrt(final_mse)
    return lin_rmse, tree_rmse, final_rmse
Example #7
def predict():
    """Create the prediction arrays for all three models.

    Returns:
        tuple: Linear_Model_prediction, DT_Model_prediction,
        RF_Model_prediction.
    """
    logging.info("Predicting the Median Housing Value....")
    housing_prepared, housing_labels = data_preprocessing.data_preprocess()
    linear, dt, rnd = train_model.model_train()
    Linear_Model_prediction = linear.predict(housing_prepared)
    DT_Model_prediction = dt.predict(housing_prepared)
    X_test_prepared, y_test = data_preprocessing.rfdata()
    RF_Model_prediction = rnd.predict(X_test_prepared)
    return Linear_Model_prediction, DT_Model_prediction, RF_Model_prediction
Example #8
temporal_covariates = [
    # ... (the leading entries of this list are truncated in the source)
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'residential_percent_change_from_baseline'
]

column_identifier = {
    'spatial id level 1': 'country_code',
    'temporal id level 1': 'Date',
    'temporal covariates': temporal_covariates,
    'target': target_name
}

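# Give every temporal covariate the same maximum history length.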
history_length = {key: max_history for key in temporal_covariates}

historical_data_list = data_preprocess(
    data=data.copy(),
    forecast_horizon=forecast_horizon,
    history_length=history_length,
    column_identifier=column_identifier,
    spatial_scale_table=None,
    spatial_scale_level=1,
    temporal_scale_level=1,
    target_mode='normal',
    imputation=False,
    aggregation_mode='mean',
    augmentation=False,
    futuristic_covariates=None,
    future_data_table=None,
    save_address='',  # <------------ save address
    verbose=1)
Example #9
def __init__(self):
    preprocess = pre.data_preprocess()
    self.dataset = preprocess.clean()
    self.file_init = False
    self.file_name = 'feature_outputs/'
Example #10
								"cure": ("./data/cure_and_prevention-add_text.jsonl", "./data/cure_and_prevention.pkl"),
								}

# REDO_DATA_FLAG = True
REDO_DATA_FLAG = False
REDO_FLAG = True
RETRAIN_FLAG = True
# REDO_FLAG = False

# We will save all the tasks and subtask's results and model configs in this dictionary
all_task_results_and_model_configs = dict()
# We will save the list of question_tags AKA subtasks for each event AKA task in this dict
all_task_question_tags = dict()
for taskname, (data_in_file, processed_out_file) in task_type_to_datapath_dict.items():
	if not os.path.exists(processed_out_file) or REDO_DATA_FLAG:
		data_preprocess(data_in_file, processed_out_file)
	else:
		logging.info(f"Preprocessed data for task {taskname} already exists at {processed_out_file}")

	# Read the data statistics
	task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(processed_out_file)

	# We will store the list of subtasks for which we train the classifier
	tested_tasks = list()
	logging.info(f"Training Mutlitask BERT Entity Classifier model on {processed_out_file}")
	# output_dir = os.path.join("results", "multitask_bert_entity_classifier", taskname)
	# NOTE: After fixing the USER and URL tags
	output_dir = os.path.join("results", "multitask_bert_entity_classifier_fixed", taskname)
	make_dir_if_not_exists(output_dir)
	results_file = os.path.join(output_dir, "results.json")
	model_config_file = os.path.join(output_dir, "model_config.json")
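make_dir_if_not_exists and load_from_pickle are project helpers that do not appear in this snippet; minimal sketches, assuming they are thin wrappers over the standard library:

import os
import pickle

def make_dir_if_not_exists(path):
    # Create the directory (and any parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)

def load_from_pickle(path):
    # Load whatever object was pickled into the given file.
    with open(path, 'rb') as f:
        return pickle.load(f)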
Example #11
def model_train():
    housing_prepared, housing_labels = data_preprocessing.data_preprocess()
    linear = linear_model_(housing_prepared, housing_labels)
    dt = dtreg(housing_prepared, housing_labels)
    rnd = rnd_forest(housing_prepared, housing_labels)
    return linear, dt, rnd
Example #12
def ml(class_num,
       epochs,
       method,
       source_data,
       twitter_source,
       google_source,
       ig_source,
       judge=True,
       nan=True):
    if judge:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        # '上映日期' is the release-date column (Chinese); split folds by ISO week
        # number (Series.dt.weekofyear was removed in pandas 2.0).
        a = pd.to_datetime(final_data['上映日期'])
        cut = a.dt.isocalendar().week
        test_list = []
        train_list = []
        test = 0
        train = 0
        # print(cut)
        for i in range(len(final_data)):
            if cut[i] % 4 == 0:
                test += 1
                test_list.append(i)
            else:
                train += 1
                train_list.append(i)
        # print(train_list)
        # print(test_list)
        print(train)
        print(test)
        # print("final",final_data.shape)
        # Split the data by the precomputed index lists.
        train_final_data = final_data.drop(final_data.index[test_list]).reset_index(drop=True)
        test_final_data = final_data.drop(final_data.index[train_list]).reset_index(drop=True)
        # print(test_new_youtube_file_v3_data)

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)

        if method == "random_forest":
            y_test, ans_best = random_forest(input_x,
                                             revised_y,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x,
                                             revised_y,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             judge=judge)
        else:
            y_test, ans_best = xgboost(input_x,
                                       revised_y,
                                       X_train,
                                       y_train,
                                       X_test,
                                       y_test,
                                       class_num=class_num,
                                       num=epochs,
                                       judge=judge)

        if class_num == 2:
            classes = ['0', '1']
        elif class_num == 4:
            classes = ['0', '1', '2', '3']
        else:
            # Fall back to one label per class for any other class count.
            classes = [str(c) for c in range(class_num)]

        np.set_printoptions(precision=2)
        plot_confusion_matrix(y_test,
                              ans_best,
                              classes=classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues)
        plt.show()

    else:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        if method == "random_forest":
            y_test, ans_best = random_forest(input_x,
                                             revised_y,
                                             X_train=0,
                                             y_train=0,
                                             X_test=0,
                                             y_test=0,
                                             judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x,
                                             revised_y,
                                             X_train=0,
                                             y_train=0,
                                             X_test=0,
                                             y_test=0,
                                             judge=judge)
        else:
            y_test, ans_best = xgboost(input_x,
                                       revised_y,
                                       X_train=0,
                                       y_train=0,
                                       X_test=0,
                                       y_test=0,
                                       class_num=class_num,
                                       num=epochs,
                                       judge=judge)

        if class_num == 2:
            classes = ['0', '1']
        elif class_num == 4:
            classes = ['0', '1', '2', '3']
        else:
            classes = [str(c) for c in range(class_num)]

        np.set_printoptions(precision=2)
        plot_confusion_matrix(y_test,
                              ans_best,
                              classes=classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues)
        plt.show()
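plot_confusion_matrix above appears to be the helper from the older scikit-learn examples gallery; with scikit-learn 1.0+ the same figure can be drawn directly from the snippet's y_test, ans_best, and classes (a sketch, not the snippet's own helper):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Equivalent confusion-matrix plot using the built-in display class.
ConfusionMatrixDisplay.from_predictions(
    y_test, ans_best, display_labels=classes, cmap=plt.cm.Blues)
plt.show()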
Example #13
def dl(source_data,
       twitter_source,
       google_source,
       ig_source,
       class_num,
       epochs,
       batch_size,
       optimizer,
       loss,
       judge=True,
       nan=True):
    if judge:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        # '上映日期' is the release-date column (Chinese); split folds by ISO week.
        a = pd.to_datetime(final_data['上映日期'])
        cut = a.dt.isocalendar().week
        test_list = []
        train_list = []
        test = 0
        train = 0
        # print(cut)
        for i in range(len(final_data)):
            if cut[i] % 4 == 0:
                test += 1
                test_list.append(i)

            else:
                train += 1
                train_list.append(i)

        # print(train_list)
        # print(test_list)
        print(train)
        print(test)

        # Split the data by the precomputed index lists.
        train_final_data = final_data.drop(final_data.index[test_list]).reset_index(drop=True)
        test_final_data = final_data.drop(final_data.index[train_list]).reset_index(drop=True)

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)
        # print(X_train.shape)
        ohe = OneHotEncoder()
        y_train = ohe.fit_transform(y_train.reshape(-1, 1)).toarray()
        # Reuse the encoder fitted on the training labels instead of refitting
        # on the test labels, which could reorder or drop categories.
        y_test = ohe.transform(y_test.reshape(-1, 1)).toarray()
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        nn(X_train,
           X_test,
           y_train,
           y_test,
           class_num=class_num,
           input_dim=X_train.shape[1],
           epochs=epochs,
           batch_size=batch_size,
           optimizer=optimizer,
           loss=loss)
    else:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        ohe = OneHotEncoder()
        revised_y = ohe.fit_transform(revised_y.reshape(-1, 1)).toarray()
        X_train, X_test, y_train, y_test = train_test_split(input_x,
                                                            revised_y,
                                                            test_size=0.3,
                                                            random_state=42)

        print("labels")
        # The last feature column is inspected here and removed just below.
        check_list = sorted(X_test[:, -1])
        print(check_list)
        X_train = np.delete(X_train, -1, axis=1)
        X_test = np.delete(X_test, -1, axis=1)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        nn(X_train,
           X_test,
           y_train,
           y_test,
           class_num=class_num,
           input_dim=X_train.shape[1],  # the last column was removed above
           epochs=epochs,
           batch_size=batch_size,
           optimizer=optimizer,
           loss=loss)
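The nn helper that builds and trains the network is not included on this page; a minimal Keras sketch under the same argument names (an assumption, not the author's actual model):

from tensorflow import keras

def nn(X_train, X_test, y_train, y_test, class_num, input_dim,
       epochs, batch_size, optimizer, loss):
    # A small fully connected classifier with a softmax head sized to class_num.
    model = keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(class_num, activation="softmax"),
    ])
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_test, y_test))
    return model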
Example #14
def load_data():
    train_df, test_df, sale_price, oridatas = data_preprocess()
    x_train = train_df.values
    x_test = test_df.values
    y_train = np.array(sale_price)
    return x_train, y_train, x_test, oridatas
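load_data converts the preprocessed DataFrames into plain NumPy arrays, the input format the averaging model and the Keras network in the examples above both expect; oridatas presumably carries the SalePrice min and max used later to undo the normalization.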