Example #1
0
def text_analysis(dbHost="",
                  dbPort="",
                  userName="",
                  password="",
                  dbName="",
                  query="",
                  textColumns="",
                  parametersObj="",
                  columnsArray=""):
    """Run LDA topic modelling and sentiment analysis over a text column.

    The DB connection parameters (dbHost/dbPort/userName/password/dbName/query)
    are currently unused; data is loaded from a hard-coded local CSV.

    Parameters:
        textColumns: name(s) of the text column(s) to analyse.
        parametersObj: passed through to sentiment_analysis as sent_col.
        columnsArray: column type metadata (unused in this function).

    Returns:
        [sentiment, doc_topic, key_words] as produced by the helper models.
    """
    ##############
    # connecting to BD
    ##############

    data_text = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/googleplaystore_user_reviews.csv"
    )

    # NOTE(review): the original called columns_data_type(data_text, columnsArray)
    # here, but both of its results were dead code — `data` was immediately
    # overwritten by the line below and `columnsArray_e` was never read — so
    # the call has been removed.

    # Keep only the requested text column(s); drop rows with missing text.
    data = data_text[textColumns].dropna()

    corpus_bow, corpus_tfidf, id2word = clean_text(data)
    key_words, doc_topic = lda_model(corpus=corpus_bow,
                                     id2word=id2word,
                                     num_topics=5,
                                     num_term=10)
    sentiment = sentiment_analysis(data_text=data_text,
                                   data=data,
                                   doc_topic=doc_topic,
                                   sent_col=parametersObj)

    return [sentiment, doc_topic, key_words]
Example #2
0
def clustering(dbHost="",
               dbPort="",
               userName="",
               password="",
               dbName="",
               query="",
               xValues="",
               parametersObj="",
               columnsArray=""):
    """Cluster the selected columns of a dataset with k-means.

    The DB connection parameters are currently unused; data is read from a
    hard-coded local CSV (only the first 100 rows are used).

    Parameters:
        xValues: columns to feed into the k-means model.
        columnsArray: column type metadata consumed by columns_data_type.

    Returns:
        Whatever kmeans_model produces for the selected columns.
    """
    ##############
    # connecting to BD
    ##############

    # (Alternative local datasets were previously listed here as
    # commented-out reads.)
    frame = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/AAPL.csv")

    # Apply declared column types to the 100-row sample.
    frame, enriched_columns = columns_data_type(frame[0:100], columnsArray)

    # Correlation-based cleanup; correlations() returns the cleaned frame
    # alongside its diagnostics.
    (ent_cor, chisq_dependency, frame,
     rm_cols, miss_cols, obj_t) = correlations(frame,
                                               columnsArray=enriched_columns,
                                               method='predict',
                                               no_rem_col='none')

    print('Data info before clustering')
    print(frame.info())

    return kmeans_model(frame[xValues], xValues)
Example #3
0
def forecasting(dbHost="",
                dbPort="",
                userName="",
                password="",
                dbName="",
                query="",
                xValues="",
                yValue="",
                parametersObj="",
                columnsArray=""):
    """Fit a forecasting model on a time-series column of a dataset.

    The DB connection parameters are currently unused; data is read from a
    hard-coded local CSV.

    Parameters:
        xValues: variable column to forecast (passed as variable_col).
        yValue: date column (passed as date_col).
        parametersObj: model identifier forwarded to forecasting_model.
        columnsArray: column type metadata consumed by columns_data_type.

    Returns:
        The forecasting_model output.
    """
    ##############
    # connecting to BD
    ##############

    # (Alternative local datasets were previously listed here as
    # commented-out reads.)
    data = pd.read_csv("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/AAPL.csv")

    # Type the 100-row sample, then clean it via the correlations helper,
    # imputing missing values while always keeping the 'date' column.
    data, typed_columns = columns_data_type(data[0:100], columnsArray)
    (ent_cor, chisq_dependency, data,
     rm_cols, miss_cols, obj_t) = correlations(data,
                                               columnsArray=typed_columns,
                                               method='impute',
                                               no_rem_col=['date'])

    return forecasting_model(data,
                             variable_col=xValues,
                             date_col=yValue,
                             model=parametersObj,
                             independentVariables='',
                             test_split=0.4)
Example #4
0
def decisiontree(dbHost="",
                 dbPort="",
                 userName="",
                 password="",
                 dbName="",
                 query="",
                 yValue="",
                 xValues="",
                 parametersObj="",
                 columnsArray=""):
    """Train an H2O GBM classifier and extract one decision tree from it.

    The DB connection parameters are currently unused; data is read from a
    hard-coded local Excel file (only the first 100 rows are used).

    Parameters:
        yValue: target column name.
        xValues: list of predictor column names (not mutated by this call).
        columnsArray: column type metadata consumed by columns_data_type.

    Returns:
        [tree_json, precision, recall, accuracy, confusion_matrix,
         variable_importance, nlg_tree].
    """
    ##############
    # connecting to BD
    ##############

    # (Alternative local datasets were previously listed here as
    # commented-out reads.)
    data = pd.read_excel("E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/airfares.xlsx")

    data, columnsArray_e = columns_data_type(data[0:100], columnsArray)
    ent_cor, chisq_dependency, data, rm_cols, miss_cols, obj_t = correlations(
        data, columnsArray=columnsArray_e, method='predict', no_rem_col='none')

    # BUG FIX: the original did `feature = xValues; feature.append(yValue)`,
    # which (a) mutated the caller's xValues list in place and (b) leaked the
    # target yValue into the predictor list later passed to gbm.train().
    # Build a fresh list instead and keep xValues untouched.
    feature = list(xValues) + [yValue]

    df = data[feature]

    hf = h2o.H2OFrame(df)

    train, valid, test = hf.split_frame(ratios=[.8, .1])
    # GBM model — predictors are the original xValues only.
    gbm = H2OGradientBoostingEstimator()
    gbm.train(list(xValues), yValue, training_frame=train, validation_frame=valid)
    var_imp = pd.DataFrame(gbm.varimp())
    var_imp.columns = ['variable', 'relative_importance', 'scaled_importance', 'percentage']
    # metrics: keep only variables with meaningful relative importance
    variable_importance = var_imp[var_imp['relative_importance'] >= 0.1]
    variable_importance = variable_importance['variable']
    gbm_cm = gbm.confusion_matrix()
    gbm_cm = gbm_cm.table.as_data_frame()
    gbm_prec = gbm.precision()[0][0]
    gbm_f1 = gbm.F1()[0][0]
    gbm_acc = gbm.accuracy()[0][0]
    # Recall recovered from F1 = 2PR/(P+R)  =>  R = F1*P / (2P - F1)
    gbm_rec = gbm_f1 * gbm_prec / (2 * gbm_prec - gbm_f1)
    # Tree info: extract tree #1 for visualisation / NLG description
    from h2o.tree import H2OTree
    tree = H2OTree(model=gbm, tree_number=1, tree_class=None)
    nlg_tree = tree.descriptions
    tree_json = tree2json(tree)

    output = [tree_json, gbm_prec, gbm_rec, gbm_acc, gbm_cm, variable_importance, nlg_tree]
    return output
Example #5
0
def linear_regression(dbHost="",
                      dbPort="",
                      userName="",
                      password="",
                      dbName="",
                      query="",
                      yValue="",
                      xValues="",
                      parametersObj="",
                      columnsArray=""):
    """Fit an H2O model of yValue on xValues and report regression diagnostics.

    Uses a gaussian GLM when yValue is numeric, otherwise falls back to a GBM.
    The DB connection parameters are currently unused; data is read from a
    hard-coded local CSV (only the first 100 rows are used).

    Parameters:
        yValue: dependent variable column name.
        xValues: predictor column name(s).
        columnsArray: column type metadata consumed by columns_data_type.

    Returns:
        [predicted_y, true_y, r2, rmse, ent_cor, coefficients-or-varimp,
         variable_imp, vif_var] plus, for the numeric branch, a trailing list
         of descriptive statistics of xValues.
    """
    ##############
    # connecting to BD
    ##############

    data = pd.read_csv(
        "E:/Work/aspirantura/kafedra/upwork/RtoP/Models_sep/Datasets/50000_Sales_Records_Dataset_e.csv"
    )
    data, columnsArray_e = columns_data_type(data[0:100], columnsArray)

    ent_cor, chisq_dependency, data, rm_cols, miss_cols, obj_t = correlations(
        data, columnsArray=columnsArray_e, method='predict')

    num_data = data.select_dtypes(include=['number']).copy()
    variable_imp = variable_importance_h2o(data, list(num_data.columns),
                                           yValue)
    vif_var = vif(num_data, yValue, 10)
    # Descriptive statistics of xValues on the inverse-transformed data.
    num_data_inv = transformation_inv(num_data, obj_t)
    x_mean = num_data_inv[xValues].mean()
    x_min = num_data_inv[xValues].min()
    x_max = num_data_inv[xValues].max()
    x_quant25 = num_data_inv[xValues].quantile(0.25)
    x_quant50 = num_data_inv[xValues].quantile(0.5)
    x_quant75 = num_data_inv[xValues].quantile(0.75)
    x_skew = scipy.stats.skew(num_data_inv[xValues])

    # BUG FIX: the H2O frame was previously built and split only inside the
    # numeric branch, so the categorical branch crashed with NameError on
    # `train`/`valid`/`test`. Build and split the frame once, up front.
    # NOTE(review): hf contains numeric columns only; a truly categorical
    # yValue may be absent from it — confirm upstream encoding.
    hf = h2o.H2OFrame(num_data)
    train, valid, test = hf.split_frame(ratios=[.8, .1])

    if (data[yValue].dtypes == 'float') or (data[yValue].dtypes == 'int'):
        print(
            "Finding variable importance by taking given numeric variable as a dependent variable"
        )
        glm_model = H2OGeneralizedLinearEstimator(family='gaussian')
        glm_model.train(xValues,
                        yValue,
                        training_frame=train,
                        validation_frame=valid)
        print(glm_model)
        predicted = glm_model.predict(test_data=test)

        # Inverse-transform both the true and the predicted target so the
        # reported values are on the original scale.
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        true_y = test_inv[yValue]
        test[yValue] = predicted
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        pred_y = test_inv[yValue]

        linear_regr = [
            pred_y, true_y,
            glm_model.r2(),
            glm_model.rmse(), ent_cor,
            glm_model.coef(), variable_imp, vif_var,
            [x_mean, x_min, x_max, x_quant25, x_quant50, x_quant75, x_skew]
        ]

    else:
        print(
            "Finding variable importance by taking categorical variables as dependent variable"
        )
        gbm = H2OGradientBoostingEstimator()
        gbm.train(xValues,
                  yValue,
                  training_frame=train,
                  validation_frame=valid)
        predicted2 = gbm.predict(test_data=test)

        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        true_y = test_inv[yValue]
        test[yValue] = predicted2
        test_inv = transformation_inv(test.as_data_frame(), obj_t)
        pred2_y = test_inv[yValue]

        # BUG FIX: the original referenced the undefined `glm_model` here;
        # report the trained GBM's own metrics instead. A GBM has no
        # coefficients, so its variable importances fill that slot.
        linear_regr = [
            pred2_y, true_y,
            gbm.r2(),
            gbm.rmse(), ent_cor,
            gbm.varimp(), variable_imp, vif_var
        ]

    return linear_regr