def clustering(dbHost="", dbPort="", userName="", password="", dbName="", query="", xValues="", parametersObj="", columnsArray=""):
    """Prepare the sample dataset and run k-means over the selected columns.

    NOTE(review): the DB connection parameters are currently unused — data is
    loaded from a hard-coded local CSV instead (placeholder for a DB query).
    """
    ##############
    # connecting to BD
    ##############
    # Alternative sample datasets kept for reference:
    # pd.read_csv("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/50000_Sales_Records_Dataset_e.csv")
    # pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/quine.xlsx")
    # pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/airfares.xlsx")
    frame = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/AAPL.csv")

    # Normalise column dtypes on the first 100 rows, then clean the frame
    # (correlations() also imputes/removes columns — see its definition).
    frame, typed_columns = columns_data_type(frame[0:100], columnsArray)
    ent_cor, chisq_dependency, frame, rm_cols, miss_cols, obj_t = correlations(
        frame, columnsArray=typed_columns, method='predict', no_rem_col='none')

    print('Data info before clustering')
    print(frame.info())

    return kmeans_model(frame[xValues], xValues)
def forecasting(dbHost="", dbPort="", userName="", password="", dbName="", query="", xValues="", yValue="", parametersObj="", columnsArray=""):
    """Load the sample time series and forecast one variable.

    NOTE(review): the DB connection parameters are currently unused — data is
    loaded from a hard-coded local CSV instead (placeholder for a DB query).
    Here `xValues` names the variable to forecast and `yValue` the date column.
    """
    ##############
    # connecting to BD
    ##############
    # Alternative sample datasets kept for reference:
    # pd.read_csv("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/50000_Sales_Records_Dataset_e.csv")
    # pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/quine.xlsx")
    # pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/airfares.xlsx")
    series = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/AAPL.csv")

    # Type the first 100 rows, then clean with imputation; the 'date' column
    # is protected from removal so the forecaster keeps its time axis.
    series, typed_columns = columns_data_type(series[0:100], columnsArray)
    ent_cor, chisq_dependency, series, rm_cols, miss_cols, obj_t = correlations(
        series, columnsArray=typed_columns, method='impute', no_rem_col=['date'])

    result = forecasting_model(
        series,
        variable_col=xValues,
        date_col=yValue,
        model=parametersObj,
        independentVariables='',
        test_split=0.4,
    )
    return result
def decisiontree(dbHost="", dbPort="", userName="", password="", dbName="", query="", yValue="", xValues="", parametersObj="", columnsArray=""):
    """Train an H2O GBM on the sample dataset and extract one tree + metrics.

    Returns a list:
        [tree_json, precision, recall, accuracy, confusion_matrix_df,
         variable_importance_series, nlg_tree_descriptions]

    NOTE(review): the DB connection parameters are currently unused — data is
    loaded from a hard-coded local Excel file instead (placeholder for a DB
    query). `xValues` is the list of predictor columns, `yValue` the target.
    """
    ##############
    # connecting to BD
    ##############
    # Alternative sample datasets kept for reference:
    # pd.read_csv("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/50000_Sales_Records_Dataset_e.csv")
    # pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/quine.xlsx")
    # pd.read_csv("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/AAPL.csv")
    data = pd.read_excel(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/airfares.xlsx")

    data, columnsArray_e = columns_data_type(data[0:100], columnsArray)
    ent_cor, chisq_dependency, data, rm_cols, miss_cols, obj_t = correlations(
        data, columnsArray=columnsArray_e, method='predict', no_rem_col='none')

    # BUGFIX: previously `feature = xValues; feature.append(yValue)` aliased
    # and mutated the caller's list in place, so the target column leaked into
    # the predictor list later passed to gbm.train(). Build a fresh copy.
    feature = list(xValues) + [yValue]
    df = data[feature]

    hf = h2o.H2OFrame(df)
    train, valid, test = hf.split_frame(ratios=[.8, .1])

    # GBM model
    gbm = H2OGradientBoostingEstimator()
    gbm.train(xValues, yValue, training_frame=train, validation_frame=valid)

    var_imp = pd.DataFrame(gbm.varimp())
    var_imp.columns = ['variable', 'relative_importance',
                       'scaled_importance', 'percentage']

    # metrics: keep only variables with non-negligible relative importance
    variable_importance = var_imp[var_imp['relative_importance'] >= 0.1]
    variable_importance = variable_importance['variable']
    gbm_cm = gbm.confusion_matrix()
    gbm_cm = gbm_cm.table.as_data_frame()
    gbm_prec = gbm.precision()[0][0]
    gbm_f1 = gbm.F1()[0][0]
    gbm_acc = gbm.accuracy()[0][0]
    # Recall recovered from F1 = 2PR/(P+R)  =>  R = F1*P / (2P - F1)
    gbm_rec = gbm_f1 * gbm_prec / (2 * gbm_prec - gbm_f1)

    # Tree info: pull the second tree (tree_number=1) for visualisation/NLG
    from h2o.tree import H2OTree
    tree = H2OTree(model=gbm, tree_number=1, tree_class=None)
    nlg_tree = tree.descriptions
    tree_json = tree2json(tree)

    output = [tree_json, gbm_prec, gbm_rec, gbm_acc, gbm_cm,
              variable_importance, nlg_tree]
    return output
def linear_regression(dbHost="", dbPort="", userName="", password="", dbName="", query="", yValue="", xValues="", parametersObj="", columnsArray=""):
    """Fit a regression (numeric target) or GBM (categorical target) in H2O.

    Returns a list with predictions, true values, fit metrics, entropy
    correlations, coefficients/importances, VIF values and (numeric branch
    only) descriptive statistics of the x column.

    NOTE(review): the DB connection parameters are currently unused — data is
    loaded from a hard-coded local CSV instead (placeholder for a DB query).
    """
    ##############
    # connecting to BD
    ##############
    data = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/50000_Sales_Records_Dataset_e.csv"
    )
    data, columnsArray_e = columns_data_type(data[0:100], columnsArray)
    ent_cor, chisq_dependency, data, rm_cols, miss_cols, obj_t = correlations(
        data, columnsArray=columnsArray_e, method='predict')
    # print(ent_cor, chisq_dependency, rm_cols, miss_cols)

    num_data = data.select_dtypes(include=['number']).copy()
    variable_imp = variable_importance_h2o(data, list(num_data.columns), yValue)
    vif_var = vif(num_data, yValue, 10)
    # print(variable_imp, vif_var)

    # Descriptive stats of the x column on the inverse-transformed scale.
    num_data_inv = transformation_inv(num_data, obj_t)
    x_mean = num_data_inv[xValues].mean()
    x_min = num_data_inv[xValues].min()
    x_max = num_data_inv[xValues].max()
    x_quant25 = num_data_inv[xValues].quantile(0.25)
    x_quant50 = num_data_inv[xValues].quantile(0.5)
    x_quant75 = num_data_inv[xValues].quantile(0.75)
    x_skew = scipy.stats.skew(num_data_inv[xValues])

    if (data[yValue].dtypes == 'float') or (data[yValue].dtypes == 'int'):
        print(
            "Finding variable importance by taking given numeric variable as a dependent variable"
        )
        hf = h2o.H2OFrame(num_data)
        # hf.col_names
        train, valid, test = hf.split_frame(ratios=[.8, .1])

        glm_model = H2OGeneralizedLinearEstimator(family='gaussian')
        glm_model.train(xValues, yValue, training_frame=train,
                        validation_frame=valid)
        print(glm_model)

        predicted = glm_model.predict(test_data=test)
        # True y on the original scale, then overwrite with predictions and
        # invert again to get predictions on the original scale.
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        true_y = test_inv[yValue]
        test[yValue] = predicted
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        pred_y = test_inv[yValue]

        linear_regr = [
            pred_y, true_y, glm_model.r2(), glm_model.rmse(), ent_cor,
            glm_model.coef(), variable_imp, vif_var,
            [x_mean, x_min, x_max, x_quant25, x_quant50, x_quant75, x_skew]
        ]
    else:
        print(
            "Finding variable importance by taking categorical variables as dependent variable"
        )
        # BUGFIX: this branch previously used `train`/`valid`/`test` that were
        # only created inside the numeric branch (NameError), and built the
        # frame from num_data, which cannot contain a categorical target.
        # Build the frame from the full cleaned data instead.
        hf = h2o.H2OFrame(data)
        train, valid, test = hf.split_frame(ratios=[.8, .1])

        gbm = H2OGradientBoostingEstimator()
        gbm.train(xValues, yValue, training_frame=train,
                  validation_frame=valid)
        # print(gbm)

        predicted2 = gbm.predict(test_data=test)
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        true_y = test_inv[yValue]
        test[yValue] = predicted2
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        pred2_y = test_inv[yValue]

        # BUGFIX: previously referenced glm_model.* here, which is undefined
        # in this branch. Use the GBM's own metrics; GBM has no coef(), so
        # that slot is None — TODO confirm downstream consumers accept this.
        linear_regr = [
            pred2_y, true_y, gbm.r2(), gbm.rmse(), ent_cor,
            None, variable_imp, vif_var
        ]
    return linear_regr