def _random_forest_classification_train(table, feature_cols, label_col,
                                 n_estimators=10, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0, max_features="sqrt",
                                 max_leaf_nodes=None, min_impurity_decrease=0, class_weight=None, random_state=None):
    """Fit a RandomForestClassifier on ``table`` and package it as a model dict.

    Parameters
    ----------
    table : DataFrame-like holding the feature and label columns.
    feature_cols : column names handed to ``check_col_type`` to obtain the
        training matrix and the matching feature names.
    label_col : name of the label column; a 'continuous' target is rejected
        through ``raise_error('0718', 'label_col')``.
    max_features : forwarded to the classifier; the string ``"n"`` is
        translated to ``None``.
    class_weight : optional sequence of weights, one per distinct label,
        matched to the labels in sorted order.
    Remaining keyword arguments are forwarded unchanged to
    ``RandomForestClassifier``.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` carries the fitted classifier,
        the parameters used, the rendered report (``_repr_brtc_``) and a
        ``feature_importance_table`` DataFrame.

    Raises
    ------
    ValueError
        If ``class_weight`` does not have one entry per distinct label.
    """
    feature_names, features_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    if max_features == "n":
        max_features = None

    if class_weight is not None:
        class_labels = y_train.unique()
        if len(class_weight) != len(class_labels):
            raise ValueError("Number of class weights should match number of labels.")
        # Weights are given positionally; pair them with the sorted labels.
        class_weight = dict(zip(sorted(class_labels), class_weight))

    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                                        max_features=max_features,
                                        max_leaf_nodes=max_leaf_nodes,
                                        min_impurity_decrease=min_impurity_decrease,
                                        class_weight=class_weight,
                                        random_state=random_state)
    classifier.fit(features_train, y_train)

    params = {'feature_cols': feature_cols,
              'label_col': label_col,
              'n_estimators': n_estimators,
              'criterion': criterion,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'min_weight_fraction_leaf': min_weight_fraction_leaf,
              'max_features': max_features,
              'max_leaf_nodes': max_leaf_nodes,
              'min_impurity_decrease': min_impurity_decrease,
              'class_weight': class_weight,
              'random_state': random_state}

    model = _model_dict('random_forest_classification_model')
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(feature_names, classifier)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Parameters
    | {params}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(params=dict2MD(params), fig_feature_importances=fig_feature_importances)))

    model['_repr_brtc_'] = rb.get()

    # The classifier was fitted on the check_col_type output, so its
    # importances line up with feature_names (the same names used for the
    # plot above), not necessarily with feature_cols.
    # NOTE(review): presumably check_col_type can return names that differ
    # from feature_cols (e.g. expanded columns) - confirm against the helper.
    feature_importance_table = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, classifier.feature_importances_)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model' : model}
Exemplo n.º 2
0
def _cross_table_axis_labels(names, n_input_cols, margins):
    """Flatten crosstab index/column entries into one display string each."""
    if n_input_cols == 1:
        return [str(name) for name in names]

    labels = []
    for name in names:
        # With several grouping columns each entry is a tuple. The margin
        # entry added by pandas is ('All', '', ...); show it as 'All'.
        # Detecting it by value (rather than assuming it is the last entry)
        # matters because a normalized crosstab only carries the margin on
        # one axis.
        is_margin = (margins and name[0] == 'All'
                     and all(part == '' for part in name[1:]))
        if is_margin:
            labels.append(name[0])
        else:
            labels.append('_'.join(str(part) for part in name))
    return labels


def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):
    """Build a cross tabulation of two groups of columns and its report.

    Parameters
    ----------
    table : DataFrame-like source of the columns.
    input_cols_1 : column names used for the rows of the cross table.
    input_cols_2 : column names used for the columns of the cross table.
    result : one of ``'N'``, ``'N / Row Total'``, ``'N / Column Total'`` or
        ``'N / Total'``; selects raw counts or the normalization passed to
        ``pd.crosstab``. Any other value is reported through
        ``raise_runtime_error``.
    margins : whether ``pd.crosstab`` should add 'All' totals.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds ``result``, the flattened
        ``result_table`` (first column = row labels, header = ``result``)
        and the rendered report in ``_repr_brtc_``.
    """
    row_series = [table[col] for col in input_cols_1]
    column_series = [table[col] for col in input_cols_2]

    normalize_by_result = {
        'N': False,
        'N / Row Total': 'index',
        'N / Column Total': 'columns',
        'N / Total': 'all',
    }
    if result not in normalize_by_result:
        raise_runtime_error("Please check 'result'.")

    result_table = pd.crosstab(row_series,
                               column_series,
                               margins=margins,
                               normalize=normalize_by_result[result])

    joined_row_name = _cross_table_axis_labels(list(result_table.index),
                                               len(input_cols_1), margins)
    joined_column_name = _cross_table_axis_labels(list(result_table.columns),
                                                  len(input_cols_2), margins)

    # Row labels become the first column; its header is the result type.
    result_table.insert(loc=0, column=' ', value=joined_row_name)
    result_table.columns = np.append(result, joined_column_name)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table,
                                        num_rows=len(result_table.index) +
                                        1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a DecisionTreeClassifier and package the tree, stats and report.

    Parameters
    ----------
    table : DataFrame-like holding the feature and label columns.
    feature_cols : names of the feature columns (``table[feature_cols]`` is
        the training matrix).
    label_col : name of the label column.
    criterion .. presort : forwarded to ``DecisionTreeClassifier``.
    sample_weight, check_input, X_idx_sorted : forwarded to
        ``classifier.fit``.

    ``min_samples_split`` (>= 2), ``min_samples_leaf`` (>= 1),
    ``min_weight_fraction_leaf`` (>= 0.0) and, when given, ``max_depth``
    (>= 1) are checked through ``validate`` before fitting.

    Returns
    -------
    dict
        ``{'model': model}`` with the fitted classifier, its learned
        attributes, its parameters and the rendered report
        (``_repr_brtc_``).
    """
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))

    validate(*param_validation_check)

    # Keyword arguments keep the call independent of the constructor's
    # positional parameter order.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols],
                   table[label_col],
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        # class_names must follow the classifier's own class order
        # (classifier.classes_); order of first appearance in the data
        # would attach the wrong label to tree nodes.
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=[str(label) for label in classifier.classes_],
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # Best effort: any failure while rendering the tree (missing
        # package, missing Graphviz binary, ...) degrades to this message
        # instead of failing the training.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    feature_importance = classifier.feature_importances_
    get_param = classifier.get_params()

    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart of importances, smallest at the bottom
    order = np.argsort(feature_importance)
    sorted_importance = feature_importance[order]
    sorted_feature_cols = np.array(feature_cols)[order]
    positions = range(len(order))

    plt.title('Feature Importances')
    plt.barh(positions, sorted_importance, color='b', align='center')
    for position, value in enumerate(sorted_importance):
        plt.text(value,
                 position,
                 " {:.2f}".format(value),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(positions, sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 4
0
def _mlp_regression_train(table,
                          feature_cols,
                          label_col,
                          hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size_auto=True,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          max_iter=200,
                          random_state=None,
                          tol=0.0001):
    """Fit an MLPRegressor and report its fit on the training data.

    Parameters
    ----------
    table : DataFrame-like holding the feature and label columns.
    feature_cols : column names handed to ``check_col_type`` to obtain the
        training matrix.
    label_col : name of the target column.
    batch_size_auto : accepted for interface compatibility; this function
        does not read it (``batch_size`` is always forwarded as given).
        NOTE(review): looks like it was meant to force ``batch_size='auto'``
        - confirm with callers before wiring it in.
    Remaining keyword arguments are forwarded to ``MLPRegressor``
    (``shuffle`` is fixed to ``True``).

    Returns
    -------
    dict
        ``{'model': model}`` with the fitted regressor, its weights and
        loss, the training-set MAE / MSE / R2, the hyper-parameters and the
        rendered report (``_repr_brtc_``).
    """
    _, features = check_col_type(table, feature_cols)
    label = table[label_col]

    mlp_model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                             activation=activation,
                             solver=solver,
                             alpha=alpha,
                             batch_size=batch_size,
                             learning_rate=learning_rate,
                             learning_rate_init=learning_rate_init,
                             max_iter=max_iter,
                             shuffle=True,
                             random_state=random_state,
                             tol=tol)
    mlp_model.fit(features, label)

    # Metrics are computed on the same data the model was trained on.
    predict = mlp_model.predict(features)
    _mean_absolute_error = mean_absolute_error(label, predict)
    _mean_squared_error = mean_squared_error(label, predict)
    _r2_score = r2_score(label, predict)

    # Plain constructor with explicit column order instead of the
    # deprecated DataFrame.from_items.
    result_table = pd.DataFrame(
        {
            'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
            'Score': [_mean_absolute_error, _mean_squared_error, _r2_score],
        },
        columns=['Metric', 'Score'])

    # (estimator parameter name, label shown in the report)
    param_labels = [
        ('hidden_layer_sizes', 'Hidden Layer Sizes'),
        ('activation', 'Activation Function'),
        ('solver', 'Solver'),
        ('alpha', 'Alpha'),
        ('batch_size', 'Batch Size'),
        ('learning_rate', 'Learning Rate'),
        ('learning_rate_init', 'Learning Rate Initial'),
        ('max_iter', 'Max Iteration'),
        ('random_state', 'Seed'),
        ('tol', 'Tolerance'),
    ]
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame(
        {
            'Parameter': [shown for _, shown in param_labels],
            'Value': [get_param[key] for key, _ in param_labels],
        },
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ### MLP Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['loss'] = mlp_model.loss_
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 5
0
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True,
                             is_vif=False,
                             vif_threshold=10):
    """Fit an OLS linear regression and build its diagnostic report.

    Parameters
    ----------
    table : DataFrame-like holding the feature and label columns.
    feature_cols : names of the regressor columns.
    label_col : name of the response column.
    fit_intercept : when true, a constant column is added to the regressors
        before fitting.
    is_vif : when true, a ``VIF`` column and a ``VIF>{vif_threshold}`` flag
        column are added to the coefficient summary table.
    vif_threshold : cut-off used for the VIF flag column.

    Returns
    -------
    dict
        ``{'model': model}`` with coefficients, fit statistics, the three
        summary tables, the fitted results object (after ``remove_data()``)
        and the rendered report (``_repr_brtc_``).
    """
    features = table[feature_cols]
    label = table[label_col]

    if fit_intercept:
        features = sm.add_constant(features, has_constant='add')
    lr_model_fit = sm.OLS(label, features).fit()

    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    if is_vif:
        # One VIF per column of the design matrix actually fitted (this
        # includes the constant column when fit_intercept is set).
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda vif: 'true' if vif > vif_threshold else 'false')
    summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    # Each figure is closed right after it has been rendered so repeated
    # calls do not accumulate open matplotlib figures.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    lowest = np.min(predict)
    highest = np.max(predict)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.close()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.close()

    # Draw the Q-Q plot on the figure opened here; without ``ax`` the
    # plotting call is given no existing axes to draw on.
    qq_figure = plt.figure()
    sm.qqplot(residual, line='s', ax=qq_figure.gca())
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.close()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.close()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    # remove_data() is called on the results object before it is stored in
    # the model.
    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
Exemplo n.º 6
0
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """Compute ACF / PACF plots and tables for one column.

    Parameters
    ----------
    table : DataFrame-like holding the series.
    input_col : name of the column to analyse.
    nlags : number of lags; the tables have ``nlags + 1`` rows (lag 0
        included).
    conf_level : confidence level for the intervals (``alpha`` passed to
        statsmodels is ``1 - conf_level``). ``None`` disables the intervals:
        the plots get ``alpha=None`` and the tables omit the interval
        column.

    Returns
    -------
    dict
        ``{'model': model}`` with ``autocorrelation_table``,
        ``partial_autocorrelation_table`` and the rendered report
        (``_repr_brtc_``).
    """
    data = table[input_col]
    alpha = None if conf_level is None else 1 - conf_level

    # Plot onto the figure opened here and close it once rendered, so no
    # figures are left open between calls.
    acf_figure = plt.figure()
    plot_acf(data, ax=acf_figure.gca(), lags=nlags, alpha=alpha)
    fig_plt_acf = plt2MD(plt)
    plt.close()

    pacf_figure = plt.figure()
    plot_pacf(data, ax=pacf_figure.gca(), lags=nlags, alpha=alpha)
    fig_plt_pacf = plt2MD(plt)
    plt.close()

    # With an alpha, acf/pacf return (values, confidence intervals);
    # without one they return the values only.
    if alpha is None:
        acf_values, acf_confint = acf(data, nlags=nlags), None
        pacf_values, pacf_confint = pacf(data, nlags=nlags), None
    else:
        acf_values, acf_confint = acf(data, nlags=nlags, alpha=alpha)
        pacf_values, pacf_confint = pacf(data, nlags=nlags, alpha=alpha)

    lags = list(range(nlags + 1))

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = lags
    result_table1['ACF'] = acf_values

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = lags
    result_table2['PACF'] = pacf_values

    if conf_level is not None:
        interval_col = '%g%% confidence Interval' % (conf_level * 100)
        result_table1[interval_col] = [
            str((acf_confint[i][0], acf_confint[i][1])) for i in lags
        ]
        result_table2[interval_col] = [
            str((pacf_confint[i][0], pacf_confint[i][1])) for i in lags
        ]

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(
        strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 7
0
def _one_hot_encoder(table,
                     input_cols,
                     prefix='list',
                     prefix_list=None,
                     suffix='index',
                     n_values='auto',
                     categorical_features='all',
                     sparse=True,
                     handle_unknown='error',
                     drop_last=False):
    """One-hot encode ``input_cols`` and append the indicator columns.

    Parameters
    ----------
    table : DataFrame to encode; it is copied, not modified.
    input_cols : names of the columns to encode (at least one is needed:
        the model fields below read the encoders of the last column).
    prefix : ``'list'`` takes the new column prefixes from ``prefix_list``
        (one per input column); any other value uses the input column name.
    prefix_list : prefixes used when ``prefix == 'list'``.
    suffix : ``'index'`` numbers the new columns ``0..k-1``; any other
        value uses the distinct column values as suffixes.
    n_values, categorical_features, handle_unknown : forwarded to
        ``OneHotEncoder``.
    sparse : ignored; dense output is always requested because the encoded
        array is wrapped in a DataFrame.
    drop_last : when true, the last indicator column of every input column
        is not added to the output table.

    Returns
    -------
    dict
        ``{'out_table': ..., 'model': ...}``. The model keeps the fitted
        encoders per column, the naming options and the generated column
        names; ``classes``, ``active_features``, ``feature_indices`` and
        ``n_values`` come from the encoders of the last input column.
    """
    out_table = table.copy()
    sparse = False
    enc_list = []
    le_list = []
    new_col_names_list = []
    new_col_names_list_with_true_drop_last = []

    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )

    for col_index, col_name in enumerate(input_cols):
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse,
                            handle_unknown=handle_unknown)
        le = LabelEncoder()

        name_prefix = prefix_list[col_index] if prefix == 'list' else col_name
        categories = np.unique(out_table[col_name].values)
        if suffix == 'index':
            suffixes = range(len(categories))
        else:
            suffixes = categories
        new_col_names = [name_prefix + '_' + str(s) for s in suffixes]

        encoded = enc.fit_transform(
            le.fit_transform(out_table[col_name]).reshape(-1, 1))
        # Reuse the table's own index: column assignment below aligns on
        # index labels, so a default 0..n-1 index would misplace values (or
        # produce NaN) whenever the input table is not indexed 0..n-1.
        transformed_table = pd.DataFrame(encoded,
                                         columns=new_col_names,
                                         index=out_table.index)

        new_col_names_list.append(new_col_names)
        if drop_last:
            new_col_names = new_col_names[:-1]
            new_col_names_list_with_true_drop_last.append(new_col_names)
        for new_col_name in new_col_names:
            out_table[new_col_name] = transformed_table[new_col_name]

        enc_list.append(enc)
        le_list.append(le)

    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    # The following four entries describe the last encoded column only.
    out_model['classes'] = le.classes_
    out_model['active_features'] = enc.active_features_
    out_model['feature_indices'] = enc.feature_indices_
    out_model['n_values'] = enc.n_values_
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    out_model['drop_last'] = drop_last
    if drop_last:
        out_model[
            'new_col_names_list_with_true_drop_last'] = new_col_names_list_with_true_drop_last
    out_model['new_col_names_list'] = new_col_names_list

    return {'out_table': out_table, 'model': out_model}
Exemplo n.º 8
0
def _chi_square_test_of_independence(table,
                                     response_cols,
                                     factor_col,
                                     correction=False):
    """Run a chi-square test of independence of each response vs one factor.

    Parameters
    ----------
    table : DataFrame-like holding the columns.
    response_cols : names of the columns tested against ``factor_col``, one
        test per column.
    factor_col : name of the factor column.
    correction : forwarded to ``stats.chi2_contingency`` as ``correction``;
        ``lambda_`` is fixed to 1.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model['report']`` is the rendered
        report and ``model['result_table']`` has one row per response
        column (label, feature, alternative hypothesis, degrees of freedom,
        statistic and p-value, all as strings). The decision text in the
        report uses a fixed 5% significance level.
    """
    label_list = []
    feature_list = []
    alternative_hypothesis_list = []
    dof_list = []
    stat_chi_list = []
    p_chi_list = []

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Chi-square Test of Independence Result
    |  - H0: the two categorical variables are independent.
    |  - H1: the two categorical variables are dependent.
    """))

    # Each test is computed once and feeds both the summary table and the
    # per-column report section.
    for response_col in response_cols:
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col],
                                        margins=True)
        # Drop the margin row/column added by margins=True to get the
        # observed frequencies.
        observed = np.array(contingency_table.iloc[:-1, :-1])
        stat_value, p_value, dof_value = stats.chi2_contingency(
            observed, correction=correction, lambda_=1)[0:3]

        if math.isnan(p_value):
            dependence = 'Independence of two categorical variables cannot be decided.'
        elif p_value < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        else:
            dependence = 'No association was found between two categorical variables at 5% significance level.'

        stat_chi = '{}'.format(stat_value)
        p_chi = '{}'.format(p_value)

        label_list.append('{}'.format(factor_col))
        feature_list.append('{}'.format(response_col))
        alternative_hypothesis_list.append(
            'Two categorical variables are dependent.')
        dof_list.append(
            'chi-square distribution with {dof} degrees of freedom'.format(
                dof=dof_value))
        stat_chi_list.append(stat_chi)
        p_chi_list.append(p_chi)

        result_table_simple = pd.DataFrame(
            {
                'estimate': [stat_chi],
                'df': ['{}'.format(dof_value)],
                'p_value': [p_chi],
            },
            columns=['estimate', 'df', 'p_value'])

        rb.addMD(
            strip_margin("""
        |### Label: {label}, Feature: {feature}
        |  
        |{result_table_simple}
        |
        |{dependence}
        |
        |
        """.format(label=factor_col,
                   feature=response_col,
                   result_table_simple=pandasDF2MD(result_table_simple),
                   dependence=dependence)))

    # Plain constructor with explicit column order instead of the
    # deprecated DataFrame.from_items.
    result_table = pd.DataFrame(
        {
            'label': label_list,
            'feature': feature_list,
            'alternative_hypothesis': alternative_hypothesis_list,
            'df': dof_list,
            'estimate': stat_chi_list,
            'p_value': p_chi_list,
        },
        columns=[
            'label', 'feature', 'alternative_hypothesis', 'df', 'estimate',
            'p_value'
        ])

    model = _model_dict('Chi-square test of independence')
    model['report'] = rb.get()
    # Keep the summary table with the model instead of discarding it.
    model['result_table'] = result_table

    return {'model': model}
Exemplo n.º 9
0
def _association_rule_visualization(table,
                                    option='multiple_to_single',
                                    edge_length_scaling=1,
                                    font_size=10,
                                    node_size_scaling=1,
                                    figure_size_muliplier=1,
                                    display_rule_num=False):
    """Draw association rules as a directed graph and wrap it in a report.

    ``table`` is copied, never modified. It must provide the columns
    ``antecedent`` and ``consequent`` (indexable sequences of items) as well
    as ``support`` and ``lift``; ``'single_to_single'`` also reads
    ``confidence``.

    Parameters
    ----------
    table : pandas.DataFrame
        One association rule per row.
    option : str
        ``'single_to_single'`` keeps rules with exactly one item on each side
        and draws item -> item edges.  ``'multiple_to_single'`` keeps rules
        whose consequent has exactly one item and routes every rule through
        its own intermediate "rule" node.  Any other value draws all rules
        through rule nodes with one edge per consequent item.
    edge_length_scaling : number
        Multiplies the ``k`` argument of ``nx.spring_layout``.
    font_size : number
        Forwarded to ``nx.draw``.
    node_size_scaling : number
        Multiplies node sizes and contributes to the arrow size.
    figure_size_muliplier : number
        Multiplies the side length of the (square) figure.
    display_rule_num : bool
        When true, rule nodes are labelled ``'R<n>'``; otherwise their labels
        come from ``_n_blank_strings``.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model['_repr_brtc_']`` is the report.
    """
    if (option == 'single_to_single'):
        result_network = table.copy()

        antecedents = result_network['antecedent']
        consequents = result_network['consequent']
        result_network['length_ante'] = [len(row) for row in antecedents]
        result_network['string_ante'] = [row[0] for row in antecedents]
        result_network['length_conse'] = [len(row) for row in consequents]
        result_network['string_conse'] = [row[0] for row in consequents]

        # Only one-item -> one-item rules can be drawn as a single edge.
        result_network = result_network[result_network.length_ante == 1]
        result_network = result_network[result_network.length_conse == 1]
        result_network['support_ante'] = result_network[
            'support'] / result_network['confidence']
        result_network['support_conse'] = result_network[
            'confidence'] / result_network['lift']

        # Re-number rows 0..n-1 so the positional lookups below are valid.
        result_network = result_network.reset_index()
        edges = list(
            zip(result_network.string_ante, result_network.string_conse))

        G = nx.DiGraph()
        G.add_edges_from(edges)
        nodes = G.nodes()
        figure_side = 4 * len(nodes)**0.5 * figure_size_muliplier
        plt.figure(figsize=(figure_side, figure_side))
        pos = nx.spring_layout(G, k=0.4 * edge_length_scaling)

        # One (item, support) record per node; the first occurrence wins.
        node_tmp = list(result_network.string_ante) + list(
            result_network.string_conse)
        support_tmp = list(result_network.support_ante) + list(
            result_network.support_conse)
        nodes_table = pd.DataFrame.from_records(
            [[name, support] for name, support in zip(node_tmp, support_tmp)],
            columns=['name', 'support'])
        nodes_table = nodes_table.drop_duplicates(['name'])
        nodes_table = nodes_table.reset_index()
        scaled_support = _scaling(nodes_table.support)

        # The same scaled support value drives both node colour and size.
        node_weights = []
        for node in nodes:
            for i in range(len(nodes_table.name)):
                if nodes_table.name[i] == node:
                    node_weights += [
                        scaled_support[i] * 2500 * node_size_scaling
                    ]
                    break

        scaled_confidence = _scaling(result_network['confidence'])
        n_rules = len(result_network['length_conse'])
        edge_width = [scaled_confidence[n] * 8 for n in range(n_rules)]
        edge_color = [result_network['lift'][n] for n in range(n_rules)]

        # The former ``edge_size=...`` keyword was dropped: it is not a
        # networkx drawing option (edge thickness is ``width``) and releases
        # that validate drawing keywords reject it.
        nx.draw(G,
                pos,
                node_color=node_weights,
                edge_color=edge_color,
                node_size=node_weights,
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Blues,
                edge_cmap=plt.cm.Reds,
                arrows=True,
                width=edge_width,
                font_size=font_size)

        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(nodes_table.support)
        graph_max_support = np.max(nodes_table.support)
        graph_min_confidence = np.min(result_network['confidence'])
        graph_max_confidence = np.max(result_network['confidence'])
        graph_min_lift = np.min(result_network['lift'])
        graph_max_lift = np.max(result_network['lift'])

        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Node color, size : support ({graph_min_support}~{graph_max_support})
        | ##### Edge color : lift ({graph_min_lift}~{graph_max_lift})
        | ##### Edge size : confidence ({graph_min_confidence}~{graph_max_confidence})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift,
                   graph_min_confidence=graph_min_confidence,
                   graph_max_confidence=graph_max_confidence)))

    elif (option == 'multiple_to_single'):

        result_network = table.copy()
        consequents = result_network['consequent']
        length_conse = [len(row) for row in consequents]
        string_conse = [row[0] for row in consequents]
        result_network['length_conse'] = length_conse
        # The consequent column now holds the single item, not a sequence.
        result_network['consequent'] = string_conse
        result_network = result_network[result_network.length_conse == 1]
        # The original index labels are kept, so rows are addressed by label.
        index_list = result_network.index.tolist()

        n_rules = len(result_network['consequent'])
        if display_rule_num:
            rownum = ['R%d' % (i + 1) for i in range(n_rules)]
        else:
            rownum = [_n_blank_strings(i + 1) for i in range(n_rules)]
        result_network['row_number'] = rownum

        edges = []
        nodes = []
        for i in index_list:
            rule_node = result_network['row_number'][i]
            for item in result_network.antecedent[i]:
                edges += [(item, rule_node)]
            edges += [(rule_node, result_network.consequent[i])]
            nodes += [rule_node]

        G = nx.DiGraph()
        # Rule nodes are added first, so they come first in G.nodes().
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        figure_side = 2 * len(nodes)**0.5 * figure_size_muliplier
        plt.figure(figsize=(figure_side, figure_side))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)
        nodes_color = []
        nodes_size = []
        # NOTE(review): scaled_lift is indexed by position below; this
        # assumes _scaling returns a positionally indexable result - confirm.
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[index_list[node]]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                # Item nodes get size 0, so only their labels are visible.
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G,
                pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)

        # NOTE(review): the legend says size=support / colour=lift, while the
        # drawing above uses colour=support / size=lift. Text left unchanged;
        # confirm which one is intended.
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))

    else:

        result_network = table.copy()
        result_network['length_conse'] = [
            len(row) for row in result_network['consequent']
        ]
        result_network = result_network.reset_index()

        n_rules = len(result_network['consequent'])
        if display_rule_num:
            # NOTE(review): labels start at R0 here but at R1 in the
            # 'multiple_to_single' branch; kept as-is, confirm the intent.
            rownum = ['R%d' % i for i in range(n_rules)]
        else:
            rownum = [_n_blank_strings(i + 1) for i in range(n_rules)]
        result_network['row_number'] = rownum

        edges = []
        nodes = []
        for i in range(len(result_network.consequent)):
            rule_node = result_network['row_number'][i]
            for item in result_network.antecedent[i]:
                edges += [(item, rule_node)]
            for item in result_network.consequent[i]:
                edges += [(rule_node, item)]
            nodes += [rule_node]

        G = nx.DiGraph()
        # Rule nodes are added first, so they come first in G.nodes().
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        figure_side = 2 * len(nodes)**0.5 * figure_size_muliplier
        plt.figure(figsize=(figure_side, figure_side))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)
        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[node]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                # Item nodes get size 0, so only their labels are visible.
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G,
                pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)

        # NOTE(review): same legend/drawing mismatch as in the
        # 'multiple_to_single' branch (colour=support, size=lift is drawn).
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))

    model = _model_dict('Association rule')
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def naive_bayes_train(table,
                      feature_cols,
                      label_col,
                      alpha=1.0,
                      fit_prior=True,
                      class_prior=None):
    """Fit a multinomial naive Bayes classifier and build a training report.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data.
    feature_cols : list
        Columns of ``table`` used as features.
    label_col : str
        Column of ``table`` holding the class labels; labels are encoded with
        a ``LabelEncoder`` before fitting.
    alpha : float
        Smoothing parameter forwarded to ``MultinomialNB``.
    fit_prior : bool
        Forwarded to ``MultinomialNB``.
    class_prior : list of str, optional
        Entries of the form ``"<label>:<prior>"``.  They are reordered to the
        encoder's class order before being passed to ``MultinomialNB``.

    Returns
    -------
    dict
        ``{'model': model}`` with the fitted estimator, the label encoder,
        the column names and the report.
    """
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            # Split on the LAST colon so labels containing ':' still parse.
            tmp = elems.rsplit(":", 1)
            class_index = label_encoder.transform([tmp[0]])[0]
            tmp_class_prior[class_index] = float(tmp[1])
        class_prior = tmp_class_prior

    # Keyword arguments: these parameters are keyword-only in current
    # scikit-learn releases, so the positional form raises TypeError there.
    nb_model = MultinomialNB(alpha=alpha,
                             fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    # Confusion matrix and accuracy are computed on the training data itself.
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuacy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
Exemplo n.º 11
0
def _mlp_classification_train(table, feature_cols, label_col, hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size_auto=True, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=200, random_state=None, tol=0.0001):
    """Fit an ``MLPClassifier`` and report its scores on the training data.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data.
    feature_cols : list
        Feature columns; validated/converted by ``check_col_type``.
    label_col : str
        Label column.  A continuous target is rejected via ``raise_error``.
    hidden_layer_sizes, activation, solver, alpha, batch_size, learning_rate,
    learning_rate_init, max_iter, random_state, tol
        Forwarded unchanged to ``MLPClassifier``.
    batch_size_auto : bool
        NOTE(review): accepted but never read in this function; presumably
        handled by a caller or UI layer - confirm.

    Returns
    -------
    dict
        ``{'model': model}`` with the fitted estimator, its learned
        attributes, the training scores, the hyper-parameters and the report.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if(sklearn_utils.multiclass.type_of_target(label) == 'continuous'):
        raise_error('0718', 'label_col')

    mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, max_iter=max_iter, shuffle=True, random_state=random_state, tol=tol)
    mlp_model.fit(features, label)

    # All scores below are measured on the training data.
    predict = mlp_model.predict(features)

    _accuracy_score = accuracy_score(label, predict)
    _f1_score = f1_score(label, predict, average='micro')
    _precision_score = precision_score(label, predict, average='micro')
    _recall_score = recall_score(label, predict, average='micro')

    # DataFrame.from_items was removed in pandas 1.0; a dict plus an explicit
    # ``columns`` list gives the same frame with a guaranteed column order.
    result_table = pd.DataFrame(
        {'Metric': ['Accuracy Score', 'F1 Score', 'Precision Score', 'Recall Score'],
         'Score': [_accuracy_score, _f1_score, _precision_score, _recall_score]},
        columns=['Metric', 'Score'])

    # Maps estimator parameter names to the labels shown in the report.
    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'}
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame(
        {'Parameter': list(label_name.values()),
         'Value': [get_param[x] for x in label_name]},
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### MLP Classification Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table)
               )))

    model = _model_dict('mlp_classification_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['class'] = mlp_model.classes_
    model['loss'] = mlp_model.loss_
    model['accuracy_score'] = _accuracy_score
    model['f1_score'] = _f1_score
    model['precision_score'] = _precision_score
    model['recall_score'] = _recall_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
Exemplo n.º 12
0
def _lda(table,
         input_col,
         num_voca=1000,
         num_topic=3,
         num_topic_word=3,
         max_iter=20,
         learning_method='online',
         learning_offset=10.,
         random_state=None):
    """Fit an LDA topic model on one text column and build a report.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    input_col : str
        Column holding the documents (raw text for ``CountVectorizer``).
    num_voca : int
        ``max_features`` of the ``CountVectorizer``.
    num_topic : int
        Number of LDA components.
    num_topic_word : int
        Number of highest-weighted terms listed per topic.
    max_iter : int
        Forwarded to ``LatentDirichletAllocation``.
    learning_method : str
        ``'online'`` or ``'batch'``; anything else is reported through
        ``raise_runtime_error``.
    learning_offset : float
        Forwarded to the estimator only when ``learning_method='online'``.
    random_state : int or None
        Forwarded to ``LatentDirichletAllocation``.

    Returns
    -------
    dict
        ``{'model': model}`` holding the parameters, the per-topic term
        table, the per-document top topic and the report.
    """
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    # get_feature_names() is gone from current scikit-learn; prefer the
    # replacement and fall back for older installations.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        # Rank terms by absolute weight, heaviest first (stable for ties).
        pairs = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(weights)),
            key=lambda x: x[0],
            reverse=True)
        voca_weights_list.append([
            "{}: {}".format(term, weight)
            for weight, term in pairs[:num_topic_word]
        ])
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    # Each document is assigned the topic with the largest transform value.
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = list(corpus)
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 13
0
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objectibe='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=0,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None):
    """Fit an ``XGBRegressor`` and build a feature-importance report.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data.
    feature_cols : list
        Feature columns of ``table``.
    label_col : str
        Target column of ``table``.
    objectibe : str
        Learning objective.  The misspelled name is kept because it is part
        of the public signature; it is forwarded as ``objective``.
    max_depth ... missing
        Forwarded by name to the ``XGBRegressor`` constructor.
    sample_weight ... sample_weight_eval_set
        Forwarded by name to ``XGBRegressor.fit``.

    Returns
    -------
    dict
        ``{'model': out_model}`` with the fitted regressor, its parameters,
        feature importances, the importance plot and the report.
    """
    # Every argument is passed by keyword: a positional call silently binds
    # values to the wrong parameters once the upstream signature order
    # changes.
    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_

    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    # Clear the figure so later plots do not draw on top of this one.
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # Report: column names first, then every estimator parameter.
    get_param_list = [['feature_cols', feature_cols],
                      ['label_col', label_col]]
    get_param_list.extend([key, value] for key, value in get_param.items())
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    # One row, one column per feature.
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
Exemplo n.º 14
0
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None):
    """Fit PCA on ``input_cols`` and append the projected columns to ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    input_cols : list
        Columns to project.
    new_column_name : str
        Prefix of the output columns; the component index is appended.
    n_components : optional
        Forwarded to ``PCA``; defaults to ``len(input_cols)`` when ``None``.
    copy, whiten, svd_solver, tol, iterated_power, random_state
        Forwarded to ``PCA``.

    Returns
    -------
    dict
        ``{'out_table': out_df, 'model': model}``: the input table (index
        reset) with the projected columns appended, and a model dict holding
        the fitted estimator, its attributes and the report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Keyword arguments: everything after n_components is keyword-only in
    # current scikit-learn releases.
    pca = PCA(n_components=n_components,
              copy=copy,
              whiten=whiten,
              svd_solver=svd_solver,
              tol=tol,
              iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])

    # Name columns from the fitted component count so that a non-integer
    # n_components (variance fraction, 'mle') works as well.
    res_n_components = pca_model.n_components_
    column_names = [
        new_column_name + str(i) for i in range(res_n_components)
    ]

    pca_result = pca_model.transform(table[input_cols])
    # Flat column lists: wrapping them in another list builds a one-level
    # MultiIndex header instead of plain column names.
    out_df = pd.DataFrame(data=pca_result, columns=column_names)

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components,
                                     columns=list(input_cols))
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # Scatter of the first two components; with a single component it is
    # plotted against itself.
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | 
    | ### Plot
    | The x-axis and y-axis of the following plot is projected0 and projected1, respectively.    
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    | 
    | ### Mean
    | {array1}
    | 
    | ### Explained Variance 
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    # Reset the index so rows line up positionally with the projection.
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
Exemplo n.º 15
0
def _collaborative_filtering_train(table,
                                   user_col,
                                   item_col,
                                   rating_col,
                                   N=10,
                                   filter=True,
                                   k=5,
                                   based='item',
                                   mode='train',
                                   method='cosine',
                                   weighted=True,
                                   centered=True,
                                   targets=None,
                                   normalize=True,
                                   workers=1,
                                   filter_minus=False,
                                   maintain_already_scored=True):
    """Build a neighbourhood collaborative-filtering model or top-N table.

    Users and items are label-encoded, a sparse rating matrix is built
    (rows are items when ``based='item'``, users otherwise), ratings are
    optionally mean-centred, and a similarity matrix is computed.

    Parameters
    ----------
    table : pandas.DataFrame
        One rating per row.
    user_col, item_col, rating_col : str
        Column names for user, item and rating.
    N, k, weighted, filter, filter_minus, maintain_already_scored
        Forwarded to the recommendation helpers in ``'Topn'`` mode.
        NOTE(review): ``filter`` is passed to ``_recommend`` only; the
        multiprocessing path does not forward it - confirm this is intended.
    based : str
        ``'item'`` or ``'user'``.  ``'item'`` forces ``normalize`` to False.
    mode : str
        ``'Topn'`` returns a recommendation table; any other value returns
        the trained model.
    method : str
        ``'cosine'``, ``'pearson'``, ``'jaccard'`` or ``'adjusted'``.
        ``'adjusted'`` subtracts a per-row average and then uses cosine.
    centered : bool
        Subtract each matrix row's mean rating from its stored entries.
    targets : list, optional
        Users to recommend for in ``'Topn'`` mode; defaults to all users.
    normalize : bool
        Compute per-row averages that are handed to the recommender.
    workers : int
        ``1`` runs in-process; otherwise the multiprocessing helper is used.

    Returns
    -------
    dict
        ``{'out_table': DataFrame}`` in ``'Topn'`` mode, otherwise
        ``{'model': model}``.
    """
    # Remember the caller's choice: ``method`` is rewritten to 'cosine' for
    # the adjusted variant, which used to make the report show "Cosine".
    requested_method = method

    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    # check_cen stores rating + 1 at every observed position; it is used
    # below to enumerate observed entries and their (shifted) values.
    if based == 'item':
        item_users = csr_matrix(
            (rating_col, (item_correspond, user_correspond)))
        check_cen = csr_matrix(
            (rating_col + 1, (item_correspond, user_correspond)))
    else:
        item_users = csr_matrix(
            (rating_col, (user_correspond, item_correspond)))
        check_cen = csr_matrix(
            (rating_col + 1, (user_correspond, item_correspond)))
    centered_ratings = item_users.copy()

    # Names follow the item-based layout: "item" = row, "user" = column.
    num_item, num_user = item_users.shape
    if centered:
        update_item = []
        update_user = []
        update_rating = []
        for item in range(num_item):
            index = 0
            total = 0
            for _, rating in _nonzeros(check_cen, item):
                index += 1
                total += rating
            # Undo the +1 shift to get the row's mean rating.
            avg = total / index - 1
            for user, rating in _nonzeros(check_cen, item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)

        centered_ratings -= csr_matrix(
            (update_rating, (update_item, update_user)))
    if (method == 'adjusted' or normalize) and based == 'item':
        check_cen = check_cen.transpose().tocsr()
    if based == 'user':
        num_user, num_item = num_item, num_user
    user_avg = []
    if normalize:
        for user in range(num_user):
            index = 0
            total = 0
            for _, rating in _nonzeros(check_cen, user):
                index += 1
                total += rating
            # NOTE(review): unlike the centring step, the +1 shift is not
            # removed here; confirm the recommender expects that.
            avg = total / index
            user_avg.append(avg)
    if method == 'adjusted':
        update_item = []
        update_user = []
        update_rating = []
        for user in range(num_user):
            total = 0
            for _, rating in _nonzeros(check_cen, user):
                total += rating
            # NOTE(review): averages the shifted values over ALL positions,
            # not only observed ones - confirm.
            avg = total / num_item
            for item in range(num_item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        if based == 'item':
            centered_ratings -= csr_matrix(
                (update_rating, (update_item, update_user)))
        else:
            centered_ratings -= csr_matrix(
                (update_rating, (update_user, update_item)))
        method = 'cosine'
    if based == 'user':
        num_user, num_item = num_item, num_user

    if method == 'cosine':
        similar_coeff = cosine_similarity(centered_ratings)
    elif method == 'pearson':
        result = []
        for i in centered_ratings.toarray():
            result.append(i - np.average(i))
        similar_coeff = cosine_similarity(result)
    elif method == 'jaccard':
        similar_coeff = 1 - pairwise_distances(centered_ratings.toarray(),
                                               metric="hamming")
    if based == 'user':
        item_users = item_users.transpose().tocsr()

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        # np.int no longer exists in current NumPy (it was an alias of the
        # builtin int) and np.floating only ever matched float64, which
        # ``float`` already covers, so this tuple is equivalent.
        if table_user_col.dtype in (float, int, np.int64):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = _recommend(user, item_users,
                                                   similar_coeff, N, k, method,
                                                   weighted, centered, based,
                                                   normalize, user_avg, filter,
                                                   filter_minus,
                                                   maintain_already_scored)
                # Flattened as [item1, rating1, item2, rating2, ...].
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [
                        item_encoder.inverse_transform([item])[0], rating
                    ]
                Topn_result += [recommendations]
        else:
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(
                targets_en,
                _recommend_multi,
                item_users=item_users,
                similar_coeff=similar_coeff,
                N=N,
                k=k,
                method=method,
                weighted=weighted,
                centered=centered,
                based=based,
                normalize=normalize,
                user_avg=user_avg,
                item_encoder=item_encoder,
                workers=workers,
                filter_minus=filter_minus,
                maintain_already_scored=maintain_already_scored)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1,
                                ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if requested_method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif requested_method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif requested_method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters} 
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    # Stored as the effective method ('cosine' for the adjusted variant).
    model['method'] = method
    model['centered_ratings'] = centered_ratings
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg
    return {'model': model}
Exemplo n.º 16
0
def _doc2vec(table,
             input_col,
             dm=1,
             vector_size=100,
             window=10,
             min_count=1,
             max_vocab_size=None,
             train_epoch=100,
             workers=4,
             alpha=0.025,
             min_alpha=0.025,
             seed=None,
             hs=1,
             negative=5,
             ns_exponent=0.75):
    """Train a Doc2Vec model on one column of ``table``.

    Every cell of ``table[input_col]`` becomes the words of a
    ``TaggedDocument`` tagged with its positional index (presumably each cell
    is a list of tokens -- confirm against callers).

    Args:
        table: DataFrame holding the documents.
        input_col: name of the column containing the documents.
        dm: ``1`` or ``"1"`` selects PV-DM; any other value selects PV-DBOW.
        seed: seed forwarded to Doc2Vec; a random one is drawn when ``None``.
        Remaining arguments are forwarded to ``Doc2Vec`` unchanged
        (``train_epoch`` is passed as ``epochs``).

    Returns:
        dict with ``model`` (model dict holding the fitted ``d2v`` object, the
        reported parameters and the rendered report), ``doc_table`` (copy of
        ``table`` with a ``document_vectors`` column) and ``word_table``
        (``words`` / ``word_vectors`` for the learned vocabulary).
    """
    # The report shows the seed exactly as the caller supplied it (None when
    # omitted); only the value handed to Doc2Vec is randomised.
    random_state = seed
    if seed is None:
        seed = randint(0, 0xffffffff)

    docs = table[input_col]
    tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]

    # Accept the integer default as well as the string form; comparing only
    # against "1" made the signature default (dm=1) fall through to PV-DBOW.
    if dm in (1, "1"):
        dm = 1
        algo = 'PV-DM'
    else:
        dm = 0
        algo = 'PV-DBOW'

    d2v = Doc2Vec(documents=tagged_docs,
                  dm=dm,
                  vector_size=vector_size,
                  window=window,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  seed=seed,
                  min_count=min_count,
                  max_vocab_size=max_vocab_size,
                  workers=workers,
                  epochs=train_epoch,
                  hs=hs,
                  negative=negative,
                  ns_exponent=ns_exponent)

    vocab = d2v.wv.vocab

    params = {
        'Input column': input_col,
        'Training algorithm': algo,
        'Dimension of Vectors': vector_size,
        'Window': window,
        'Minimum count': min_count,
        'Max vocabulary size': max_vocab_size,
        'Train epoch': train_epoch,
        'Number of workers': workers,
        'Alpha': alpha,
        'Minimum alpha': min_alpha,
        'Seed': random_state,
        'Hierarchical softmax': hs,
        'Negative': negative,
        'Negative sampling exponent': ns_exponent
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Doc2Vec Result
    |
    | ### Parameters
    | {params}
    """.format(params=dict2MD(params))))

    model = _model_dict('doc2vec_model')
    model['params'] = params
    model['d2v'] = d2v
    model['_repr_brtc_'] = rb.get()

    # Document vectors are re-inferred from the trained model for each
    # training document rather than read from the stored doc vectors.
    out_table1 = table.copy()
    out_table1['document_vectors'] = [
        d2v.infer_vector(doc.words).tolist() for doc in tagged_docs
    ]
    out_table2 = pd.DataFrame({
        'words': d2v.wv.index2word,
        'word_vectors': d2v.wv[vocab].tolist()
    })

    return {'model': model, 'doc_table': out_table1, 'word_table': out_table2}
Exemplo n.º 17
0
def _penalized_linear_regression_train(table,
                                       feature_cols,
                                       label_col,
                                       regression_type='ridge',
                                       alpha=1.0,
                                       l1_ratio=0.5,
                                       fit_intercept=True,
                                       max_iter=1000,
                                       tol=0.0001,
                                       random_state=None):
    """Fit a ridge, lasso or elastic-net regression and build its report.

    Args:
        table: DataFrame with the training data.
        feature_cols: list of feature column names.
        label_col: name of the target column.
        regression_type: ``'ridge'``, ``'lasso'`` or ``'elastic_net'``; any
            other value is reported through ``raise_runtime_error``.
        alpha: penalty weight forwarded to the estimator.
        l1_ratio: only used by elastic net.
        fit_intercept: whether the estimator fits an intercept; when true an
            ``intercept`` row is appended to the coefficient table.
        max_iter: iteration cap for lasso / elastic net.
        tol: tolerance forwarded to the estimator.
        random_state: seed forwarded to the estimator.

    Returns:
        ``{'model': model}`` where the model dict holds the fitted estimator,
        the coefficient table, a copy of ``table`` extended with ``predict``
        and ``residual`` columns, the reported parameters and the report.
    """
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        # NOTE(review): ridge is built with max_iter=None, so the max_iter
        # argument is only echoed in the report for this type -- confirm that
        # this is intended.
        regression_model = Ridge(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=None,
                                 tol=tol,
                                 solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit exactly once. Refitting for every attribute read is wasteful and,
    # with selection='random' and no seed, can make the reported coefficients,
    # intercept and predictions come from different fits.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = list(feature_cols)
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept == True:
        intercept = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        # The key spelling below is kept as-is because the dict is stored on
        # the model and rendered in the report.
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maxium Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted vs actual, with the y = x reference line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals against predictions.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Distribution of the residuals.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients

    plt.figure()
    predictors = features.columns
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1),
               score=dict2MD(score))))
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 18
0
def _gaussian_mixture_train(table, input_cols, number_of_components=1, covariance_type='full', tolerance=0.001,
                            regularize_covariance=1e-06, max_iteration=100, initial_params='kmeans', seed=None):
    """Fit a Gaussian mixture on ``table[input_cols]`` and summarise it.

    Args:
        table: DataFrame with the training data.
        input_cols: list of column names used as features.
        number_of_components: number of mixture components.
        covariance_type: forwarded to ``GaussianMixture``.
        tolerance: forwarded as ``tol``.
        regularize_covariance: forwarded as ``reg_covar``.
        max_iteration: forwarded as ``max_iter``.
        initial_params: forwarded as ``init_params``.
        seed: forwarded as ``random_state``.

    Returns:
        ``{'model': model}``; the model dict stores the arguments, the fitted
        ``gmm``, a per-component ``summary`` table (component number, weight,
        mean, covariance) and the rendered report.
    """
    gmm = GaussianMixture(n_components=number_of_components, covariance_type=covariance_type, tol=tolerance,
                          reg_covar=regularize_covariance, max_iter=max_iteration, init_params=initial_params,
                          random_state=seed)
    X_train = table[input_cols]
    gmm.fit(X_train)

    component_ids = list(range(number_of_components))
    mean_arr = [gmm.means_[i].tolist() for i in component_ids]

    if covariance_type == 'tied':
        # With 'tied' covariance the estimator stores a single
        # (n_features, n_features) matrix shared by all components, so
        # indexing it per component would return a row of that matrix (or run
        # out of bounds). Report the shared matrix for every component.
        shared_covariance = gmm.covariances_.tolist()
        covar_arr = [shared_covariance for _ in component_ids]
    else:
        covar_arr = [gmm.covariances_[i].tolist() for i in component_ids]

    out_table = pd.DataFrame()
    out_table['component_number'] = component_ids
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr

    rb = BrtcReprBuilder()
    params = {
        'Input Columns': input_cols,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Number of Iteration': max_iteration,
        'Method to Initialize': initial_params
    }

    rb.addMD(strip_margin("""
    |## Gaussian Mixture Train Result 
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model':model}
def _naive_bayes_train(table,
                       feature_cols,
                       label_col,
                       alpha=1.0,
                       fit_prior=True,
                       class_prior=None):
    """Train a multinomial naive Bayes classifier and build its report.

    Labels are integer-encoded with a ``LabelEncoder`` before fitting; the
    encoder is stored on the model so predictions can be decoded later.

    Args:
        table: DataFrame with the training data.
        feature_cols: list of feature column names.
        label_col: name of the label column.
        alpha: smoothing parameter forwarded to ``MultinomialNB``.
        fit_prior: forwarded to ``MultinomialNB``.
        class_prior: optional list of ``"label:prior"`` strings; each prior is
            placed at the encoded position of its label.

    Returns:
        ``{'model': model}`` with the feature/label column names, the label
        encoder, the fitted ``nb_model`` and the rendered report.
    """
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # Re-order the user-supplied priors to follow the encoder's class
        # order, which is the order the estimator sees.
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            encoded_label = label_encoder.transform([tmp[0]])[0]
            tmp_class_prior[encoded_label] = float(tmp[1])
        class_prior = tmp_class_prior

    # Keyword arguments: recent scikit-learn releases make the estimator's
    # constructor parameters keyword-only, so positional passing fails there.
    nb_model = MultinomialNB(alpha=alpha,
                             fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_

    # One row per class: [label, log prior, log P(feature | class) ...].
    label_and_prior = [
        list(pair) for pair in zip(label_encoder.classes_, class_log_prior)
    ]
    tmp_result = np.hstack((label_and_prior, feature_log_prob_))
    column_names = ['labels', 'pi']
    column_names += ['theta_' + feature_col for feature_col in feature_cols]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    # Confusion matrix and accuracy are computed on the training data itself.
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix,
                           classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuacy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 20
0
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              gamma_val,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None,
                              class_weight=None):
    """Train a support-vector classifier and build its parameter report.

    Args:
        table: DataFrame with the training data.
        feature_cols: feature columns, resolved through ``check_col_type``.
        label_col: name of the label column; must not be continuous.
        gamma_val: kernel coefficient used only when ``gamma == 'other'``.
        gamma: forwarded to ``SVC`` as-is unless it equals ``'other'``.
        class_weight: optional sequence of weights, matched by position to the
            sorted distinct labels.
        Remaining arguments are forwarded to ``svm.SVC`` (``c`` as ``C``).

    Returns:
        ``{'model': model}`` holding the fitted ``svc_model``, the feature
        columns and the rendered report.

    Raises:
        ValueError: if ``class_weight`` does not have one entry per label.
    """
    _table = table.copy()

    feature_names, features = check_col_type(table, feature_cols)
    target = _table[label_col]

    target_kind = sklearn_utils.multiclass.type_of_target(target)
    if target_kind == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    # Positional weights are mapped onto labels in sorted order.
    distinct_labels = sorted(set(target))
    if class_weight is not None:
        if len(class_weight) != len(distinct_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        class_weight = {
            label_value: class_weight[position]
            for position, label_value in enumerate(distinct_labels)
        }

    # 'other' means "use the explicit numeric value supplied in gamma_val".
    kernel_gamma = gamma_val if gamma == 'other' else gamma

    classifier = svm.SVC(C=c,
                         kernel=kernel,
                         degree=degree,
                         gamma=kernel_gamma,
                         coef0=coef0,
                         shrinking=shrinking,
                         probability=probability,
                         tol=tol,
                         max_iter=max_iter,
                         random_state=random_state,
                         class_weight=class_weight)
    fitted_classifier = classifier.fit(features, target)

    # Report the estimator's own parameters plus the column selection.
    reported = classifier.get_params()
    reported['feature_cols'] = feature_names
    reported['label_col'] = label_col

    report = BrtcReprBuilder()
    report.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(reported))))

    result = _model_dict('svc_model')
    result['svc_model'] = fitted_classifier
    result['features'] = feature_cols
    result['_repr_brtc_'] = report.get()

    return {'model': result}
Exemplo n.º 21
0
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Train a logistic regression and report coefficients with statistics.

    Besides fitting the estimator this computes, on the training data, the
    log-likelihood based AIC/BIC and standard errors of the coefficients; for
    two-class problems it also computes Wald statistics and p-values.

    Args:
        table: DataFrame with the training data.
        feature_cols: feature columns, resolved through ``check_col_type``.
        label_col: name of the label column; must not be continuous.
        Remaining arguments are forwarded to ``LogisticRegression`` by name.

    Returns:
        ``{'model': model}`` holding the fitted ``lr_model``, coefficients,
        intercept, classes, standard errors, AIC, BIC, the summary table, the
        rendered report and (binary case only) Wald statistics and p-values.
    """
    feature_names, features = check_col_type(table, feature_cols)
    features = pd.DataFrame(features, columns=feature_names)

    label = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(label) == 'continuous'):
        raise_error('0718', 'label_col')

    # Keyword arguments: recent scikit-learn releases make these constructor
    # parameters keyword-only, so the positional form fails there.
    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)

    # Log-likelihood of the observed labels. Summing logs instead of taking
    # the log of a running product avoids underflow to 0 (and hence infinite
    # AIC/BIC) on anything but tiny tables.
    class_position = {cls: pos for pos, cls in enumerate(classes)}
    observed = np.array([class_position[value] for value in label])
    log_likelihood = np.sum(np.log(prob[np.arange(prob.shape[0]), observed]))

    k = len(feature_cols) + 1 if fit_intercept else len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    # Design matrix, with a leading column of ones when an intercept is fit.
    if fit_intercept:
        x_design = np.hstack([np.ones((features.shape[0], 1)), features])
    else:
        x_design = features.values

    if is_binary:
        # p * (1 - p) per row; np.prod replaces np.product, which was removed
        # in NumPy 2.0.
        v = np.prod(prob, axis=1)
        x_design_modi = (x_design.T * v).T
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            # coef_ is (1, n_features); flatten so the statistics are 1-D and
            # line up with std_err as in the intercept branch.
            logit_params = np.ravel(coefficients)
        wald = (logit_params / std_err)**2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        # One-vs-rest style standard errors, one row per class.
        std_err = []
        for i in range(len(classes)):
            v = prob.T[i] * (1 - prob.T[i])
            x_design_modi = (x_design.T * v).T
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
        summary = pd.concat(
            (summary, pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        columns = ['standard_error_{}'.format(cls) for cls in classes]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        # Interleave each class's coefficient column with its standard error.
        arrange_col = ['features']
        for cls in classes:
            arrange_col.append(cls)
            arrange_col.append('standard_error_{}'.format(cls))
        summary = summary[arrange_col]

    if is_binary:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
Exemplo n.º 22
0
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    """Fit k-means for several k and keep the one with the best silhouette.

    One model is fitted per entry of ``n_clusters_list``; the model with the
    highest mean silhouette score is used to label the rows of ``table``.

    Args:
        table: DataFrame with the data to cluster.
        input_cols: list of column names used as features.
        n_clusters_list: candidate numbers of clusters; every element is
            validated to be greater than 1.
        prediction_col: name of the output column holding the cluster labels.
        seed: forwarded to the estimator as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to the
            number of rows in ``table``.
        Remaining arguments are forwarded to ``SKKMeans``.

    Returns:
        dict with ``out_table`` (copy of ``table`` plus ``prediction_col``)
        and ``model`` (best k, its centers, the fitted estimator, the input
        columns and the rendered report).
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    # 2-component PCA of the inputs, used only to draw the scatter plots.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    # NOTE: collected per k but not read again within this function.
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        # Mean silhouette for this k, plus the per-sample values for the plot.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)

        pca2_centers = pca2_model.transform(centersk)

        # Left axis: silhouette profile per cluster; right axis: PCA scatter.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            # Boolean-mask indexing yields a new array, so the in-place sort
            # does not touch ``samples``.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            # Stack each cluster's sorted silhouette values above the
            # previous cluster's band.
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        # Vertical line at the mean silhouette; cluster centers as crosses.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    # np.argmax returns the first maximum, so ties go to the earlier k.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    # NOTE: not read again within this function.
    best_labels = best_model.labels_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k. This draws through the pyplot state
    # machine, i.e. on whatever figure is current after the helper calls
    # above -- keep the statement order if this block is ever touched.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    # Append the per-k diagnostic figure for every candidate.
    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
Exemplo n.º 23
0
def _svd2(table,
          input_cols,
          new_column_name='projected_',
          full_matrices=False):
    """Compute the singular value decomposition of ``table[input_cols]``.

    Args:
        table: DataFrame holding the matrix to decompose.
        input_cols: list of column names forming the matrix.
        new_column_name: prefix of the projected columns added to the output.
        full_matrices: forwarded to ``np.linalg.svd``.

    Returns:
        dict with
        ``out_table1``: left singular vectors ``u`` (columns ``u1..``),
        ``out_table2``: two rows -- the singular values and their cumulative
        share of the total (columns ``s1..``),
        ``out_table3``: right singular vectors ``v`` (columns ``v1..``),
        ``out_table4``: ``table`` with the projection ``u * s`` appended,
        ``model``: model dict with ``v``, the input columns, the parameters
        and the rendered report.
    """
    # Hand numpy a plain ndarray so the factors come back as ndarrays
    # regardless of how the installed numpy wraps array-like inputs.
    A = np.asarray(table[input_cols])

    u, s, vh = np.linalg.svd(A, full_matrices=full_matrices)

    # Scale the first len(s) left singular vectors by their singular values.
    n_singular = len(s)
    projection = u[:, :n_singular] * s

    # Row 0: singular values; row 1: running share of their sum.
    s_normal = np.cumsum(s / s.sum())
    s = np.array([s, s_normal])
    v = vh.T

    column_name_u = ['u%d' % (i + 1) for i in range(u.shape[1])]
    column_name_s = ['s%d' % (i + 1) for i in range(s.shape[1])]
    column_name_v = ['v%d' % (i + 1) for i in range(v.shape[1])]
    column_name_projection = [
        new_column_name + '%d' % (i + 1) for i in range(s.shape[1])
    ]

    # Pass the flat list of names: wrapping it in another list would create a
    # one-level MultiIndex that only worked because the columns are
    # overwritten right after the concat.
    out_table4 = pd.DataFrame(data=projection,
                              columns=column_name_projection)
    out_table4 = pd.concat([table.reset_index(drop=True), out_table4], axis=1)
    out_table4.columns = table.columns.values.tolist() + column_name_projection

    res_param1 = {}
    res_param1['Input Columns'] = input_cols
    res_param1['full_matrices'] = full_matrices

    res_param2 = {}
    res_param2['u'] = u.shape
    res_param2['s'] = s.shape
    res_param2['v'] = v.shape
    res_param2['Projected Matrix'] = projection.shape

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVD Result
    |
    | ### Dimensions of Matrices
    | {parameter2}
    |
    | ### Parameters
    | {parameter1}
    """.format(parameter1=dict2MD(res_param1),
               parameter2=dict2MD(res_param2))))

    model = _model_dict('svd')
    model['right_singular_vectors'] = pd.DataFrame(v, columns=column_name_v)
    model['input_cols'] = input_cols
    model['parameters'] = res_param1
    model['_repr_brtc_'] = rb.get()

    return {
        'out_table1': pd.DataFrame(u, columns=column_name_u),
        'out_table2': pd.DataFrame(s, columns=column_name_s),
        'out_table3': pd.DataFrame(v, columns=column_name_v),
        'out_table4': out_table4,
        'model': model
    }
Exemplo n.º 24
0
def _kmeans_train_predict(table,
                          input_cols,
                          n_clusters=3,
                          prediction_col='prediction',
                          init='k-means++',
                          n_init=10,
                          max_iter=300,
                          tol=1e-4,
                          precompute_distances='auto',
                          seed=None,
                          n_jobs=1,
                          algorithm='auto',
                          n_samples=None):
    """Cluster ``table[input_cols]`` with k-means and label every row.

    Args:
        table: DataFrame with the data to cluster.
        input_cols: list of column names used as features.
        n_clusters: number of clusters.
        prediction_col: name of the output column holding the cluster labels.
        seed: forwarded to the estimator as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to the
            number of input rows.
        Remaining arguments are forwarded to ``SKKMeans``.

    Returns:
        dict with ``out_table`` (copy of ``table`` plus ``prediction_col``)
        and ``model`` (fitted estimator, input columns, rendered report).
    """
    inputarr = table[input_cols]
    n_samples = len(inputarr) if n_samples is None else n_samples

    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    estimator_options = {
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'verbose': 0,
        'random_state': seed,
        'copy_x': True,
        'n_jobs': n_jobs,
        'algorithm': algorithm,
    }
    k_means = SKKMeans(**estimator_options)
    k_means.fit(inputarr)

    # Parameters as shown in the report (caller-facing names).
    params = dict(input_cols=input_cols,
                  n_clusters=n_clusters,
                  init=init,
                  n_init=n_init,
                  max_iter=max_iter,
                  tol=tol,
                  precompute_distances=precompute_distances,
                  seed=seed,
                  n_jobs=n_jobs,
                  algorithm=algorithm)

    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_

    # 2-component PCA of the inputs, used for the scatter plot.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)

    report_markdown = strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_,
               fig_cluster_centers=fig_centers,
               fig_pca=fig_pca,
               fig_samples=fig_samples,
               params=dict2MD(params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_markdown)

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    labelled = table.copy()
    labelled[prediction_col] = labels
    return {'out_table': labelled, 'model': model}
Exemplo n.º 25
0
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True):
    """Fit a statsmodels GLM of ``label_col`` on ``feature_cols``.

    Parameters
    ----------
    table :
        Column-indexable data (presumably a pandas DataFrame -- confirm
        against callers).
    feature_cols :
        Column names used as regressors.
    label_col :
        Name of the response column; must not appear in ``feature_cols``.
    family : str
        One of "Gaussian", "inv_Gaussian", "binomial", "Poisson",
        "neg_binomial", "gamma", "Tweedie".
    link : str
        One of "ident", "log", "logit", "probit", "cloglog", "pow", "nbinom".
    fit_intercept : bool
        When equal to ``True`` a constant column is added to the regressors
        with ``sm.add_constant`` before fitting.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the fitted results object
        (``'glm_model'``), its coefficients, AIC/BIC, t- and p-values, the
        training options and an HTML summary report (``'report'``).

    Raises
    ------
    Exception
        If ``label_col`` is also listed in ``feature_cols``.
    ValueError
        If ``family`` or ``link`` is not one of the supported names.
    """
    # Reject an inconsistent column selection before touching the table.
    if label_col in feature_cols:
        raise Exception("%s is duplicated." % label_col)

    # Option name -> attribute name on sm.families / sm.families.links.
    # Attributes are resolved lazily with getattr so that only the requested
    # one is looked up on the installed statsmodels.
    family_attrs = {
        "Gaussian": "Gaussian",
        "inv_Gaussian": "InverseGaussian",
        "binomial": "Binomial",
        "Poisson": "Poisson",
        "neg_binomial": "NegativeBinomial",
        "gamma": "Gamma",
        "Tweedie": "Tweedie",
    }
    # NOTE(review): "cLogLog" and "binom" are kept exactly as originally
    # written; confirm these attributes exist in the statsmodels version in
    # use (the library spells them CLogLog/cloglog and nbinom in its docs).
    link_attrs = {
        "ident": "identity",
        "log": "log",
        "logit": "logit",
        "probit": "probit",
        "cloglog": "cLogLog",
        "pow": "Power",
        "nbinom": "binom",
    }

    # Fail with a clear message instead of an UnboundLocalError further down.
    if family not in family_attrs:
        raise ValueError("Unsupported family: %r. Expected one of %s."
                         % (family, sorted(family_attrs)))
    if link not in link_attrs:
        raise ValueError("Unsupported link: %r. Expected one of %s."
                         % (link, sorted(link_attrs)))

    features = table[feature_cols]
    label = table[label_col]

    sm_family = getattr(sm.families, family_attrs[family])()
    sm_link = getattr(sm.families.links, link_attrs[link])

    # Explicit comparison kept so non-bool truthy values behave as before.
    if fit_intercept == True:
        design = sm.add_constant(features)
    else:
        design = features

    # NOTE(review): the link is handed to sm.GLM as an extra keyword rather
    # than to the family constructor; verify that statsmodels actually
    # applies it instead of silently using the family's default link.
    glm_model = sm.GLM(label, design, family=sm_family, link=sm_link).fit()
    summary = glm_model.summary().as_html()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['report'] = rb.get()

    return {'model': model}
Exemplo n.º 26
0
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    """Train an ``XGBClassifier`` and build its report/model dictionary.

    ``max_depth`` through ``missing`` are forwarded to the ``XGBClassifier``
    constructor; ``sample_weight`` through ``sample_weight_eval_set`` are
    forwarded to ``fit``. ``table[feature_cols]`` is the training matrix and
    ``table[label_col]`` the target.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` contains the fitted classifier
        (``'classifier'``), its parameters, the feature importances and the
        rendered report (``'_repr_brtc_'``).
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Every option is passed by keyword: binding 20+ arguments by position
    # silently mis-assigns them if the library's parameter order differs.
    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing)
    classifier.fit(table[feature_cols],
                   table[label_col],
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_

    # Render the importance chart, then clear the figure for later plots.
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # Report: one-row table with a column per feature.
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Exemplo n.º 27
0
Arquivo: pca.py Projeto: shovsj/studio
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None,
         hue=None,
         alpha=0,
         key_col=None):
    """Run PCA on ``table[input_cols]`` and append the projected columns.

    The decomposition is always fitted with all components; ``n_components``
    only limits how many projected columns / component rows are reported.

    Parameters
    ----------
    table :
        Input data; ``table[input_cols]`` is fitted and transformed.
    input_cols :
        Names of the columns to decompose.
    new_column_name : str
        Prefix of the projected columns (``<prefix>0``, ``<prefix>1``, ...).
    n_components : int or None
        Number of projected columns to keep; defaults to ``len(input_cols)``.
    copy, whiten, svd_solver, tol, iterated_power, random_state :
        Forwarded to the ``PCA`` constructor.
    hue, alpha, key_col :
        Forwarded to the plotting helpers.

    Returns
    -------
    dict
        ``{'out_table': ..., 'model': ...}``: the input table with the
        projected columns appended, and a model dict with the fitted PCA,
        its statistics and the rendered report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    validate(greater_than_or_equal_to(n_components, 1, 'n_components'))

    # Keyword arguments: newer scikit-learn releases make everything after
    # n_components keyword-only, so positional passing raises a TypeError.
    pca = PCA(n_components=None,
              copy=copy,
              whiten=whiten,
              svd_solver=svd_solver,
              tol=tol,
              iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    # Positional concat; column labels are then set explicitly.
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    # NOTE(review): the extra list nesting is kept from the original; it
    # likely produces single-level MultiIndex headers -- confirm whether the
    # rendered report depends on it before flattening.
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization: with a single component there is no second axis for a
    # biplot, so the component is plotted against itself.
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
Exemplo n.º 28
0
def _doc2vec(table,
             input_col,
             dm=1,
             vector_size=100,
             window=10,
             min_count=1,
             max_vocab_size=None,
             train_epoch=100,
             workers=1,
             alpha=0.025,
             min_alpha=0.025,
             seed=None,
             hs=1,
             negative=5,
             ns_exponent=0.75,
             topn=30,
             hashfxn=hash):
    """Train a Doc2Vec model on ``table[input_col]`` and report the result.

    Parameters
    ----------
    table :
        Input data; each entry of ``table[input_col]`` is used as the word
        sequence of one document, tagged with its positional index.
    input_col :
        Name of the column holding the documents.
    dm : int or str
        1 for 'PV-DM', 0 for 'PV_DBOW'; a string is converted with ``int``.
    topn : int
        Number of top-ranked vocabulary words shown in the t-SNE plot (capped
        at the vocabulary size).
    seed : int or None
        Random seed; when ``None`` a random one is drawn for training while
        ``None`` is what gets reported in the parameters.
    vector_size, window, min_count, max_vocab_size, train_epoch, workers,
    alpha, min_alpha, hs, negative, ns_exponent, hashfxn :
        Forwarded to ``Doc2Vec`` (``train_epoch`` as ``epochs``).

    Returns
    -------
    dict
        ``'model'`` (model dict with the trained ``d2v`` and report),
        ``'doc_table'`` (input table plus a ``document_vectors`` column) and
        ``'word_table'`` (``words`` / ``word_vectors`` per vocabulary word).
    """
    # The reported seed stays None when the caller did not supply one.
    random_state = seed
    if seed is None:
        seed = randint(0, 0xffffffff)

    docs = table[input_col]
    tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]

    if isinstance(dm, str):
        dm = int(dm)
    algo = {1: 'PV-DM', 0: 'PV_DBOW'}[dm]

    d2v = Doc2Vec(documents=tagged_docs,
                  dm=dm,
                  vector_size=vector_size,
                  window=window,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  seed=seed,
                  min_count=min_count,
                  max_vocab_size=max_vocab_size,
                  workers=workers,
                  epochs=train_epoch,
                  hs=hs,
                  negative=negative,
                  ns_exponent=ns_exponent,
                  hashfxn=hashfxn)

    vocab = d2v.wv.vocab

    params = {
        'Input column': input_col,
        'Training algorithm': algo,
        'Dimension of Vectors': vector_size,
        'Window': window,
        'Minimum count': min_count,
        'Max vocabulary size': max_vocab_size,
        'Train epoch': train_epoch,
        'Number of workers': workers,
        'Alpha': alpha,
        'Minimum alpha': min_alpha,
        'Seed': random_state,
        'Hierarchical softmax': hs,
        'Negative': negative,
        'Negative sampling exponent': ns_exponent
    }

    # tsne visualization of the top-ranked vocabulary entries
    length = len(vocab)
    if length < topn:
        topn = length
    topn_words = sorted(vocab, key=vocab.get, reverse=True)[:topn]

    X = d2v[topn_words]
    tsne = TSNE(n_components=min(2, topn), random_state=seed)
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=topn_words, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(50, 40)
    ax = fig.add_subplot(1, 1, 1)

    ax.scatter(df['x'], df['y'], s=1000)
    ax.tick_params(axis='both', which='major', labelsize=50)

    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=80)
    # The figure is captured directly; displaying it first would block on
    # interactive backends and can leave nothing to capture afterwards.
    fig = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Doc2Vec Result
    |
    | ### Total Number of words
    | {length}
    |
    | ### Top {topn} Words
    | {topn_words}
    | {fig}
    |
    | ### Parameters
    | {params}
    """.format(length=length,
               topn=topn,
               topn_words=topn_words,
               fig=fig,
               params=dict2MD(params))))

    model = _model_dict('doc2vec_model')
    model['params'] = params
    model['d2v'] = d2v
    model['_repr_brtc_'] = rb.get()
    model['hash_val(Brightics)'] = hashfxn('Brightics')

    out_table1 = table.copy()
    out_table1['document_vectors'] = [
        d2v.infer_vector(doc.words).tolist() for doc in tagged_docs
    ]

    # Look the vectors up in index2word order so each row pairs a word with
    # its own vector; iterating the vocab dict is not guaranteed to follow
    # the same order as index2word.
    words = d2v.wv.index2word
    out_table2 = pd.DataFrame({
        'words': words,
        'word_vectors': d2v.wv[words].tolist()
    })

    return {'model': model, 'doc_table': out_table1, 'word_table': out_table2}
Exemplo n.º 29
0
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Fit a ``LogisticRegression`` and summarise its coefficients.

    Parameters
    ----------
    table :
        Input data; features are extracted with ``check_col_type`` and the
        target is ``table[label_col]``.
    feature_cols :
        Names of the feature columns.
    label_col :
        Name of the target column; a continuous target is rejected via
        ``raise_error('0718', 'label_col')``.
    penalty ... n_jobs :
        Forwarded unchanged to the ``LogisticRegression`` constructor.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the fitted estimator
        (``'lr_model'``), its intercept, coefficients and classes, a
        ``'summary'`` DataFrame (one row per feature, plus an ``intercept``
        row when ``fit_intercept`` is True) and the rendered report.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(label) == 'continuous'):
        raise_error('0718', 'label_col')

    # Keyword arguments: positional passing breaks on scikit-learn releases
    # where estimator parameters are keyword-only.
    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # Rows of the summary: optional intercept first, then one per feature.
    # Explicit comparison kept so non-bool truthy values behave as before.
    if fit_intercept == True:
        row_labels = ['intercept'] + feature_names
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        row_labels = feature_names
        coef_trans = np.transpose(coefficients)

    # A binary fit yields a single coefficient column, multiclass one per
    # class.
    # NOTE(review): for binary problems scikit-learn documents coef_ as
    # describing classes_[1]; the column is still labelled classes[0] as
    # before -- confirm which label downstream consumers expect.
    coef_columns = [classes[0]] if is_binary else classes

    summary = pd.concat(
        (pd.DataFrame({'features': row_labels}),
         pd.DataFrame(coef_trans, columns=coef_columns)),
        axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary

    return {'model': model}
Exemplo n.º 30
0
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True):
    """Fit a linear regression and build a diagnostic report.

    A scikit-learn ``LinearRegression`` is fitted for prediction and a
    statsmodels OLS on the same data supplies the summary statistics.

    Parameters
    ----------
    table :
        Input data; ``table[feature_cols]`` are the regressors and
        ``table[label_col]`` the response.
    feature_cols :
        Names of the feature columns.
    label_col : str
        Name of the response column (also used in the axis labels).
    fit_intercept : bool
        Passed to ``LinearRegression``; when equal to ``True`` the OLS design
        additionally gets a constant column via ``sm.add_constant``.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the fitted estimator
        (``'lr_model'``), OLS coefficients and fit statistics, and the
        report (``'report'``).
    """
    features = table[feature_cols]
    label = table[label_col]
    # Keyword argument: positional passing breaks on scikit-learn releases
    # where estimator parameters are keyword-only.
    lr_model = LinearRegression(fit_intercept=fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    # Explicit comparison kept so non-bool truthy values behave as before.
    if fit_intercept == True:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    summary = lr_model_fit.summary().as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)

    # Least-squares line of actual on predicted values, from the normal
    # equations. Every sum runs over ALL samples (the cross term must not
    # drop the last observation, or the line is skewed).
    x = predict
    y = np.array(label)
    n_obs = x.size
    sum_x = np.sum(x)
    sum_xx = np.sum(x * x)
    sum_y = np.sum(y)
    sum_xy = np.sum(x * y)
    det = n_obs * sum_xx - sum_x * sum_x
    # det is zero when all predictions coincide; no line is defined then.
    if det != 0:
        line_intercept = (sum_xx * sum_y - sum_x * sum_xy) / det
        line_slope = (n_obs * sum_xy - sum_x * sum_y) / det
        p1x = np.min(x)
        p2x = np.max(x)
        p1y = line_intercept + line_slope * p1x
        p2y = line_intercept + line_slope * p2x
        plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()

    return {'model': model}