Example #1
def load_save(config_path):
    config = read_params(config_path)
    df = get_data(config_path)
    df = df.drop(['id', 'Unnamed: 32'], axis=1)
    df['diagnosis'] = pd.get_dummies(df['diagnosis'], drop_first=True)
    raw_data_path = config["load_data"]["raw_data_csv"]
    df.to_csv(raw_data_path, sep=',', index=False, encoding='utf-8')
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]


    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(
        alpha=alpha, 
        l1_ratio=l1_ratio,
        random_state=random_state)

    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print(" RMSE: %s" % rmse)
    print(" MAE: %s" % mae)
    print(" R2: %s" %r2)

#################################################################################
    score_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(score_file, "w") as f:
        scores = {
            "rmse": rmse,
            "mae": mae,
            "r2": r2 }
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {
            "alpha": alpha,
            "l1_ratio": l1_ratio }
        json.dump(params, f, indent=4)
#################################################################################

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(lr, model_path)
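All of the snippets on this page lean on a few shared helpers (read_params, eval_metrics, get_data) that are defined elsewhere in their projects. A minimal sketch of the two most common ones, assuming the configuration lives in a YAML file and the metrics are plain scikit-learn scores (names and details are assumptions, not taken from the original code):

import numpy as np
import yaml
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def read_params(config_path):
    # Assumed helper: load the pipeline configuration (paths, hyperparameters)
    # from a YAML file such as params.yaml.
    with open(config_path) as yaml_file:
        config = yaml.safe_load(yaml_file)
    return config


def eval_metrics(actual, pred):
    # Assumed helper: regression metrics returned in the (rmse, mae, r2) order
    # that the train_and_evaluate() snippets unpack.
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2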
Example #3
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    model_dir = config["model_dir"]
    file_object = open('Training_log.txt', 'a+')
    logger = App_Logger()

    df = pd.read_csv(train_data_path)  #Reading the processed dataset

    df["date"] = pd.to_datetime(df["date"]).dt.date
    X_train = df[df['date'] <= datetime.date(
        2017, 5, 31)]  # splitting the dataset based on date for training data
    val_X = df[df['date'] > datetime.date(
        2017, 5, 31)]  # splitting the dataset based on date for validation data
    logger.log(file_object, "Splitting dataset completed")

    X_train = X_train.drop(['date'], axis=1)
    val_X = val_X.drop(['date'], axis=1)

    y_train = np.log1p((X_train["transactionRevenue"]).values)
    val_y = np.log1p((val_X["transactionRevenue"]).values)
    logger.log(file_object,
               "Log transformation of transaction Revenue values completed")
    x1 = X_train.drop(['transactionRevenue'], axis=1)
    val_x1 = val_X.drop(['transactionRevenue'], axis=1)
    y_train = pd.DataFrame(y_train)
    val_y = pd.DataFrame(val_y)

    ################## MLFLOW ######################
    mlflow_config = config["mlflow_config"]
    remote_server_uri = mlflow_config['remote_server_uri']
    mlflow.set_tracking_uri(remote_server_uri)
    mlflow.set_experiment(mlflow_config["experiment_name"])

    with mlflow.start_run(run_name=mlflow_config["run_name"]) as mlops_run:
        model_xgb = run_xgb(x1, y_train)
        y_train_predict = model_xgb.predict(x1)
        rmse, mae, r2 = eval_metrics(y_train, y_train_predict)

        mlflow.log_param("n_estimators", 1200)
        mlflow.log_param("learning_rate", 0.5)
        mlflow.log_param("max_depth", 8)

        mlflow.log_metric('rmse', rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(
                model_xgb,
                "model",
                registered_model_name=mlflow_config["registered_model_name"])
        else:
            mlflow.sklearn.log_model(model_xgb, "model")

    ##################### Saving the model as pickle file ################################
    logger.log(file_object, "Model file created successfully")
    file_object.close()
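run_xgb is another project-specific helper that is not shown here; given the hyperparameters this run logs to MLflow (n_estimators=1200, learning_rate=0.5, max_depth=8), it presumably wraps an XGBoost regressor roughly like the sketch below (an assumption, not the original implementation):

from xgboost import XGBRegressor


def run_xgb(train_x, train_y):
    # Hypothetical reconstruction: fit an XGBoost regressor with the same
    # hyperparameters that the MLflow run above logs as params.
    model = XGBRegressor(n_estimators=1200, learning_rate=0.5, max_depth=8)
    model.fit(train_x, train_y)
    return model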
Example #4
def load_and_save(config_path):
    config = read_params(config_path)
    df = get_data(config_path)
    new_columns = [col.replace(' ', '_') for col in df.columns]
    raw_data_path = config['load_data']['raw_dataset_csv']

    df.to_csv(raw_data_path, sep=',', index=False, header=new_columns)
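get_data is the other helper these load_and_save() stages rely on; in projects like this it usually just reads the source CSV whose location is stored in the config. A sketch under that assumption (the data_source/s3_source keys are hypothetical):

import pandas as pd


def get_data(config_path):
    # Assumed helper: read the raw dataset from the location listed in the
    # config file; the key names below are placeholders, not confirmed ones.
    config = read_params(config_path)
    data_path = config["data_source"]["s3_source"]
    df = pd.read_csv(data_path, sep=",", encoding="utf-8")
    return df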
Example #5
def load_and_save(config_path):
    config = read_params(config_path)
    train_df, test_df = get_data(config_path)
    raw_train_data_path = config["load_data"]["raw_train_data_csv"]
    raw_test_data_path = config["load_data"]["raw_test_data_csv"]
    train_df.to_csv(raw_train_data_path, index=False)
    test_df.to_csv(raw_test_data_path, index=False)
Example #6
def train_evaluate(config_file):
    config = read_params(config_file)
    train_path = config['split_data']['train_path']
    test_path = config['split_data']['test_path']
    target = config['base']['target_col']

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    train_X, train_Y = train.loc[:, train.columns != target], train[target]
    test_X, test_Y = test.loc[:, test.columns != target], test[target]

    random_state = config['base']['random_state']
    alpha = config['estimators']['ElasticNet']['params']['alpha']
    l1_ratio = config['estimators']['ElasticNet']['params']['l1_ratio']

    model = ElasticNet(alpha=alpha,
                       l1_ratio=l1_ratio,
                       random_state=random_state)
    model.fit(train_X, train_Y)
    predictions = model.predict(test_X)

    model_dir = config['model_dir']
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, 'model.joblib'))

    (rmse, mae, r2) = evaluate_metrics(test_Y, predictions)

    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print("R2: ", r2)

    scores_file = config['report']['scores']
    with open(scores_file, "w") as f:
        scores = {"RMSE": rmse, "MAE": mae, "R2_Score": r2}
        json.dump(scores, f, indent=4)
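These stage functions are normally not called directly; in the DVC-style projects these snippets come from, each module typically ends with a small argparse entry point along the lines of the sketch below (a common pattern, assumed rather than shown in the original):

import argparse

if __name__ == "__main__":
    # Typical CLI wrapper so the stage can be run as, for example,
    #   python src/train_and_evaluate.py --config=params.yaml
    args = argparse.ArgumentParser()
    args.add_argument("--config", default="params.yaml")
    parsed_args = args.parse_args()
    train_evaluate(config_file=parsed_args.config)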
Example #7
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    target = config["base"]["target_col"]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
Example #8
def load_and_save(config_path):
    config = read_params(config_path)
    df = get_data(config_path)
    new_cols = [col.replace(" ", "_") for col in df.columns]
    #print(new_cols)
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    df.to_csv(raw_data_path, sep=",", index=False, header=new_cols)
Example #9
def load_and_save(config_path):
    config = read_params(config_path)
    df = get_data(config_path)
    #Replace spaces with _
    new_cols = [col.replace(" ", "_") for col in df.columns]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    #Get data to csv file
    df.to_csv(raw_data_path, sep=",", index=False, header=new_cols)
Example #10
def load_and_save(config_path):
    print("=== Loading Files ===")
    config = read_params(config_path)
    df = get_data(config_path)
    new_cols = [col.replace(" ", "_") for col in df.columns]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    df.to_csv(raw_data_path, sep=",", index=False, header=new_cols)
    print("=== Data Loaded and Saved in data/raw ===")
Example #11
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    ################### MLFLOW ###############################
    mlflow_config = config["mlflow_config"]
    remote_server_uri = mlflow_config["remote_server_uri"]

    mlflow.set_tracking_uri(remote_server_uri)

    mlflow.set_experiment(mlflow_config["experiment_name"])

    with mlflow.start_run(run_name=mlflow_config["run_name"]) as mlops_run:
        lr = ElasticNet(
            alpha=alpha,
            l1_ratio=l1_ratio,
            random_state=random_state)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme

        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(
                lr,
                "model",
                registered_model_name=mlflow_config["registered_model_name"])
        else:
            mlflow.sklearn.log_model(lr, "model")
Example #12
def split_and_saved_data(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    split_ratio = config["split_data"]["test_size"]
    random_state = config["base"]["random_state"]

    df = pd.read_csv(raw_data_path, sep=",")
    train, test = train_test_split(df, test_size=split_ratio, random_state=random_state)
    train.to_csv(train_data_path, sep=",", index=False, encoding='utf-8')
    test.to_csv(test_data_path, sep=",", index=False, encoding='utf-8')
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["preprocess"]["test_path"]
    train_data_path = config["preprocess"]["train_path"]
    model_dir = config["model_dir"]

    c = config["estimators"]["SVC"]["params"]["C"]
    gamma = config["estimators"]["SVC"]["params"]["gamma"]

    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    svc = SVC(C=c, gamma=gamma)
    svc.fit(train_x, train_y)

    predicted_attrition = svc.predict(test_x)
    (Recall, Precision, F1_Score, AUC) = eval_metrics(test_y,
                                                      predicted_attrition)

    print("SVC model (C=%f, gamma=%f):" % (c, gamma))
    print("  Recall: %s" % Recall)
    print("  Precision: %s" % Precision)
    print("  F1_Score: %s" % F1_Score)
    print("  AUC: %s" % AUC)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        scores = {
            "Recall": Recall,
            "Precision": Precision,
            "F1_Score": F1_Score,
            "AUC": AUC
        }
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {"C": c, "Gamma": gamma}
        json.dump(params, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(svc, model_path)
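The eval_metrics used by this SVC stage returns classification scores rather than regression errors. A plausible implementation, assuming plain scikit-learn metrics (the original helper is not shown):

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def eval_metrics(actual, pred):
    # Assumed classification counterpart of the regression helper: note that
    # roc_auc_score is computed on hard predicted labels here, because that is
    # what the snippet above passes in; probabilities would be finer-grained.
    recall = recall_score(actual, pred)
    precision = precision_score(actual, pred)
    f1 = f1_score(actual, pred)
    auc = roc_auc_score(actual, pred)
    return recall, precision, f1, auc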
Example #14
def split_XY(config_path):
    config = read_params(config_path)
    filter_data_path = config["filter_data"]["filter_data_csv"]
    df = pd.read_csv(filter_data_path, sep=',')
    train_data_path = config["split_data"]["train_path"]
    test_data_path = config["split_data"]["test_path"]
    random_state = config["base"]["random_state"]
    split_ratio = config["split_data"]["test_size"]
    train, test = train_test_split(df,
                                   test_size=split_ratio,
                                   random_state=random_state)
    train.to_csv(train_data_path, sep=',', index=False, encoding='utf-8')
    test.to_csv(test_data_path, sep=',', index=False, encoding='utf-8')
Example #15
def split_and_saved_data(config_path):
    config = read_params(config_path)
    test_data_path = config['split_data']['test_path']
    train_data_path = config['split_data']['train_path']
    raw_dataset_csv = config['load_data']['raw_dataset_csv']
    split_ratio = config['split_data']['test_size']
    random_state = config['base']['random_state']
    dataset = pd.read_csv(raw_dataset_csv)
    train_data, test_data = train_test_split(dataset,
                                             test_size=split_ratio,
                                             random_state=random_state)
    train_data.to_csv(train_data_path, index=False, sep=",")
    test_data.to_csv(test_data_path, index=False, sep=",")
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    target = config["base"]["target_col"]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)
    predicted_values = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_values)

    print("Alpha: %s" % alpha)
    print("L1 Ratio: %s" % l1_ratio)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        scores = {
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {
            "alpha": alpha,
            "l1_ratio": l1_ratio
        }
        json.dump(params, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
Example #17
def split_and_saved_data(config_path):
    config = read_params(config_path)
    test_data_path = config['split_data']['test_path']
    train_data_path = config['split_data']['train_path']
    raw_data_path = config['load_data']['raw_dataset_csv']
    split_ratio = config['split_data']['test_size']
    random_state = config['base']['random_state']

    df = pd.read_csv(raw_data_path, sep=',')
    train, test = train_test_split(df,
                                   test_size=split_ratio,
                                   random_state=random_state)
    train.to_csv(train_data_path, sep=',', index=False)
    test.to_csv(test_data_path, sep=',', index=False)
Example #18
def split_and_saved_data(config_path):
    config = read_params(config_path)
    # get all the paths from the params.yaml file
    # here we need the paths for storing the train and test data
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    split_ratio = config["split_data"]["test_size"]
    random_state = config["base"]["random_state"]

    df = pd.read_csv(raw_data_path, sep=",") # read raw data path
    train, test = train_test_split(df, test_size=split_ratio, random_state=random_state)
    train.to_csv(train_data_path, sep=",", index=False, encoding='utf-8')  # save train data into file
    test.to_csv(test_data_path, sep=",", index=False, encoding='utf-8')  # save test data into file
Example #19
def load_n_save(config_path):
    '''
    Loads the data from the config_path using the
    functions from the get_data.py file
    and saves it to the data folder.
    '''
    config = read_params(config_path)
    df = getData(config_path)
    # a little preprocessing to change the column names,
    # because the names contain spaces, which can cause issues
    # later on
    up_cols = [col.replace(" ", "_") for col in df.columns]
    raw_path = config["load_data"]["raw_dataset"]
    df.to_csv(raw_path, sep=",", index=False, header=up_cols)
Example #20
def training_evaluation(config_path):
    config = read_params(config_path)

    train_path = config["split_data"]["train_path"]
    test_path = config["split_data"]["test_path"]
    random_state = config["base"]["random_state"]
    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]
    model_dir = config["model_dir"]
    target_col = config["base"]["target_col"]

    train_data = pd.read_csv(train_path, sep=",")
    test_data = pd.read_csv(test_path, sep=",")

    X_train = train_data.drop(target_col, axis=1)
    y_train = train_data[target_col]

    X_test = test_data.drop(target_col, axis=1)
    y_test = test_data[target_col]

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(X_train, y_train)

    predicted_val = lr.predict(X_test)

    rmse, mae, r2 = evaluate_metrics(y_test, predicted_val)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        scores = {"rmse": rmse, "mae": mae, "r2": r2}
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {
            "alpha": alpha,
            "l1_ratio": l1_ratio,
        }
        json.dump(params, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(lr, model_path)
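Once model.joblib has been written, downstream prediction code (for example the web app that usually accompanies these pipelines) just loads it back with joblib. A minimal usage sketch; the file paths are placeholders:

import joblib
import pandas as pd

# Load the ElasticNet model saved by training_evaluation() and score new rows.
# "saved_models/model.joblib" and "new_data.csv" are assumed paths.
model = joblib.load("saved_models/model.joblib")
new_data = pd.read_csv("new_data.csv")
predictions = model.predict(new_data)
print(predictions[:5])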
Example #21
def split_and_save_data(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    split_ratio = config["split_data"]["test_size"]
    random_state = config["base"]["random_state"]

    df = pd.read_csv(raw_data_path, sep=",")
    train, test = train_test_split(df,
                                   test_size=split_ratio,
                                   random_state=random_state)
    train.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    test.to_csv(test_data_path, sep=",", index=False, encoding="utf-8")
Example #22
def train_n_evaluate(config_path):
    config = read_params(config_path)
    train_path = config["split_data"]["train_path"]
    test_path = config["split_data"]["test_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]
    alpha = config["estimators"]["Elastic_Net"]["params"]["alpha"]
    l1_ratio = config["estimators"]["Elastic_Net"]["params"]["l1_ratio"]

    target = config["base"]["target_col"]

    train = pd.read_csv(train_path, sep=",")
    test = pd.read_csv(test_path, sep=",")

    y_train = train[target]
    y_test = test[target]

    train_data = train.drop(target, axis=1)
    test_data = test.drop(target, axis=1)

    model = ElasticNet(alpha=alpha,
                       l1_ratio=l1_ratio,
                       random_state=random_state)

    model.fit(train_data, y_train)

    y_pred = model.predict(test_data)
    (mae, mse, r2) = get_metrics(y_pred, y_test)

    params_file = config["reports"]["params"]
    scores_file = config["reports"]["scores"]

    with open(scores_file, "w") as f:
        scores = {'mse': mse, 'mae': mae, 'r2': r2}
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {'alpha': alpha, 'l1_ratio': l1_ratio}
        json.dump(params, f, indent=4)

    print(f"ElasticNet model(aplha={alpha}, l1_ratio={l1_ratio})")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(model, model_path)
Example #23
def load_and_save(config_path):
    config = read_params(config_path)
    df = get_data(config_path)

    # some column names in the dataset contain spaces,
    # which may create issues later, so we change the column names

    new_cols = [col.replace(" ", "_") for col in df.columns]
    # the spaces in the column names are simply replaced with "_"
    # e.g. "first name" ---> "first_name"
    # print(new_cols)
    # path from params.yaml where the updated file should be written
    raw_data_path = config["load_data"]["raw_dataset_csv"]

    # now save the changes to a new csv file
    df.to_csv(raw_data_path, sep=",", index=False, header=new_cols)
Example #24
def split_and_saved_data(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    # split_ratio = config["split_data"]["train_size"]

    data = convert_to_date(config_path)

    #train = data[:int(split_ratio * (len(data)))]
    #test = data[int(split_ratio * (len(data))):]

    train = data[:int(0.8 * (len(data)))]
    test = data[int(0.8 * (len(data))):]

    train.to_csv(train_data_path,encoding='utf-8')
    test.to_csv(test_data_path,encoding='utf-8')
Example #25
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config['split_data']['test_path']
    train_data_path = config['split_data']['train_path']
    random_state = config['base']['random_state']
    model_dir = config['model_dir']
    alpha = config['estimators']['ElasticNet']['params']['alpha']
    l1_ratio = config['estimators']['ElasticNet']['params']['l1_ratio']
    target = [config['base']['target_col']]

    train = pd.read_csv(train_data_path, sep=',')
    test = pd.read_csv(test_data_path, sep=',')

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    rmse, mae, r2 = eval_matrix(test_y, predicted_qualities)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        scores = {"rmse": rmse, "mae": mae, "r2": r2}
        json.dump(scores, f, indent=4)

    with open(params_file, "w") as f:
        params = {
            "alpha": alpha,
            "l1_ratio": l1_ratio,
        }
        json.dump(params, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(lr, model_path)
Example #26
def log_production_model(config_path):
    config = read_params(config_path)

    mlflow_config = config["mlflow_config"]

    model_name = mlflow_config["registered_model_name"]

    remote_server_uri = mlflow_config["remote_server_uri"]

    mlflow.set_tracking_uri(remote_server_uri)

    runs = mlflow.search_runs(experiment_ids=1)
    lowest = runs["metrics.mae"].sort_values(ascending=True)[0]
    lowest_run_id = runs[runs["metrics.mae"] == lowest]["run_id"][0]

    client = MlflowClient()
    for mv in client.search_model_versions(f"name='{model_name}'"):
        mv = dict(mv)

        if mv["run_id"] == lowest_run_id:
            current_version = mv["version"]
            logged_model = mv["source"]
            pprint(mv, indent=4)
            client.transition_model_version_stage(
                name=model_name,
                version=current_version,
                stage="Production"
            )
        else:
            current_version = mv["version"]
            client.transition_model_version_stage(
                name=model_name,
                version=current_version,
                stage="Staging"
            )

    loaded_model = mlflow.pyfunc.load_model(logged_model)

    model_path = config["webapp_model_dir"]  # "prediction_service/model"

    joblib.dump(loaded_model, model_path)
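After log_production_model promotes the best run, the serving side does not have to rely on the local joblib copy; MLflow can also load the current Production version straight from the registry by stage. A sketch of that alternative (the tracking URI and model name are placeholders standing in for the values read from the config above):

import mlflow
import mlflow.pyfunc

# Placeholder tracking server; in the snippet above this value comes from
# config["mlflow_config"]["remote_server_uri"].
mlflow.set_tracking_uri("http://127.0.0.1:5000")

model_name = "ElasticnetWineModel"  # assumed registered_model_name
production_model = mlflow.pyfunc.load_model(f"models:/{model_name}/Production")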
Example #27
def preprocessing(config_path):
    config = read_params(config_path)
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    preprocessed_data_path = config["preprocess"]["processed_dataset_csv"]
    curr_year = config["preprocess"]["current_year"]

    ### This function is used to get CSV data as dataframe
    df = get_data(config_path)
    update_df = df[[
        'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
        'Seller_Type', 'Transmission', 'Owner'
    ]].copy()  # copy so adding a column below does not trigger SettingWithCopyWarning

    ### The year column holds a calendar year, which is not useful on its own, so it is converted to the number of years elapsed from the current year.
    update_df['No_Years'] = int(curr_year) - update_df['Year']
    update_df.drop(['Year'], axis=1, inplace=True)
    update_df = pd.get_dummies(update_df, drop_first=True)
    update_df.to_csv(preprocessed_data_path,
                     sep=",",
                     index=False,
                     encoding="utf-8")
Example #28
def split_n_save(config_path):
    '''
    Function will split the data into train and test sets
    and will save them in the data/processed folder
    '''
    config = read_params(config_path)
    # Fetching the configurations
    train_path = config["split_data"]["train_path"]
    test_path = config["split_data"]["test_path"]
    raw_path = config["load_data"]["raw_dataset"]
    test_split = config["split_data"]["test_size"]
    random_state = config["base"]["random_state"]
    # Processing and saving
    print(train_path)
    print(test_path)
    df = pd.read_csv(raw_path, sep=",")
    train, test = train_test_split(df,
                                   test_size=test_split,
                                   random_state=random_state)
    train.to_csv(train_path, sep=",", index=False)
    test.to_csv(test_path, sep=",", index=False)
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    model_dir = config["model_dir"]
    random_state = config["base"]["random_state"]
    target = config["base"]["target"]

    train = pd.read_csv(train_data_path, sep=',')
    test = pd.read_csv(test_data_path, sep=',')

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    train_y = train[target]
    test_y = test[target]

    RF = RandomForestClassifier(random_state=random_state)
    RF.fit(train_x, train_y)
    RFPrediction = RF.predict(test_x)

    accuracy = evaluate_accuracy(test_y, RFPrediction)
    print("Model accuracy: %s" % accuracy)

    scores_file = config["report"]["scores"]
    with open(scores_file, 'w') as f:
        scores = {"acurracy": accuracy}
        json.dump(scores, f, indent=4)

    k, corr_value = feature_select(config_path)

    params_file = config["report"]["params"]
    with open(params_file, 'w') as f:
        params = {"k": k, "correlation": corr_value}
        json.dump(params, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(RF, model_path)
def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]

    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)

    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print(f"Elasticnet model (alpha: {alpha}  l1_ratio: {l1_ratio})")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")

    joblib.dump(lr, model_path)
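For the RandomForest stage in Example #28, evaluate_accuracy is most likely a thin wrapper around scikit-learn's accuracy score; feature_select is project-specific and is not reconstructed here. A sketch under that assumption:

from sklearn.metrics import accuracy_score


def evaluate_accuracy(actual, predicted):
    # Assumed helper: plain classification accuracy on the held-out test split.
    return accuracy_score(actual, predicted)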