Example #1
def _ftest_for_stacked_data(table,
                            response_cols,
                            factor_col,
                            alternatives,
                            first=None,
                            second=None,
                            confi_level=0.95):

    # Coerce the user-supplied factor levels to the dtype of factor_col
    # (bool or numeric) so the equality comparisons below behave correctly.
    if first is not None or second is not None:
        check_table = np.array(table[factor_col])
        for element in check_table:
            if element is not None:
                if type(element) != str:
                    if type(element) == bool:
                        if first is not None and second is not None:
                            first = bool(first)
                            second = bool(second)
                            break
                        if first is not None:
                            first = bool(first)
                            break
                        second = bool(second)
                        break
                    else:
                        if first is not None and second is not None:
                            first = float(first)
                            second = float(second)
                            break
                        if first is not None:
                            first = float(first)
                            break
                        second = float(second)
                        break
                else:
                    break
    if first is None or second is None:
        tmp_factors = np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
    if first is None:
        if tmp_factors[0] != second:
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if second is None:
        if tmp_factors[0] != first:
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []
    number1 = len(table_first[factor_col])
    number2 = len(table_second[factor_col])
    d_num = number1 - 1
    d_denum = number2 - 1
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    ## F Test for Stacked Data Result
    | - Confidence level = {confi_level}
    | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} denominator degrees of freedom under the null hypothesis
    """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))

    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1**2) / (std2**2)

        if 'larger' in alternatives:
            # upper-tail p-value: P(F > f) under F(d_num, d_denum)
            p_value = scipy.stats.f.sf(f_value, d_num, d_denum)
            tmp_model += [
                ['true ratio > 1'] + [p_value] +
                [(f_value /
                  (scipy.stats.f.ppf(confi_level, d_num, d_denum)), math.inf)]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances > 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(confi_level, d_num, d_denum))
            ] + [math.inf]]

        if 'smaller' in alternatives:
            p_value = scipy.stats.f.cdf(f_value, d_num, d_denum)
            tmp_model += [['true ratio < 1'] + [p_value] +
                          [(0.0, f_value *
                            (scipy.stats.f.ppf(confi_level, d_denum, d_num)))]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances < 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [0.0] + [
                f_value * (scipy.stats.f.ppf(confi_level, d_denum, d_num))
            ]]

        if 'two-sided' in alternatives:
            # two-sided p-value: double the smaller tail probability
            p_value_tmp = scipy.stats.f.cdf(f_value, d_num, d_denum)
            p_value = 2 * min(p_value_tmp, 1 - p_value_tmp)
            tmp_model += [
                ['true ratio != 1'] + [p_value] +
                [(f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum)), f_value *
                  (scipy.stats.f.ppf((1 + confi_level) / 2, d_denum, d_num)))]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances != 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum))
            ] + [
                f_value * (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_denum, d_num))
            ]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternative_hypothesis', 'p-value',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - F-value = {f_value}
        |
        | {result_model}
        |
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   f_value=f_value,
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
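
A minimal usage sketch for _ftest_for_stacked_data, assuming the Brightics helpers it references (BrtcReprBuilder, strip_margin, pandasDF2MD, raise_error) and numpy/scipy/pandas are importable; the data and column names are illustrative:

import pandas as pd

df = pd.DataFrame({
    'score': [4.1, 3.8, 5.0, 4.4, 3.2, 2.9, 3.5, 3.1],
    'group': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
})
res = _ftest_for_stacked_data(df, response_cols=['score'], factor_col='group',
                              alternatives=['two-sided'])
print(res['out_table'])  # one row per response column and alternative
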
Example #2
def _one_hot_encoder2(table,
                      input_cols,
                      prefix='list',
                      prefix_list=None,
                      suffix='index',
                      n_values='auto',
                      categorical_features='all',
                      sparse=True,
                      handle_unknown='error',
                      drop_last=False):
    out_table = table.copy()
    sparse = False
    enc_list = []
    le_list = []
    if drop_last:
        new_col_names_list_with_true_drop_last = []
    new_col_names_list = []
    prefix_list_index = 0
    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )
    number_distinct_classes = []
    for col_name in input_cols:
        # n_values/categorical_features are legacy parameters that require
        # scikit-learn < 0.22, where OneHotEncoder still accepted them
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse,
                            handle_unknown=handle_unknown)
        le = LabelEncoder()
        distinct_classes = np.unique(out_table[col_name].values)
        number_distinct_classes.append(len(distinct_classes))
        new_col_names = []
        if suffix == 'index':
            if prefix == 'list':
                for i in range(0, len(distinct_classes)):
                    new_col_names.append(prefix_list[prefix_list_index] + '_' +
                                         str(i))
            else:
                for i in range(0, len(distinct_classes)):
                    new_col_names.append(col_name + '_' + str(i))
        else:
            pattern = re.compile(r"\W")  # non-word characters in class names
            for i in distinct_classes:
                i = re.sub(pattern, "_", str(i))
                if prefix == 'list':
                    new_col_names.append(prefix_list[prefix_list_index] + '_' +
                                         i)
                else:
                    new_col_names.append(col_name + '_' + i)

        transformed_table = pd.DataFrame(enc.fit_transform(
            le.fit_transform(out_table[col_name]).reshape(-1, 1)),
                                         columns=new_col_names)
        new_col_names_list.append(new_col_names)
        if drop_last:
            new_col_names = new_col_names[:-1]
            new_col_names_list_with_true_drop_last.append(new_col_names)
        for new_col_name in new_col_names:
            out_table[new_col_name] = transformed_table[new_col_name]

        enc_list.append(enc)
        le_list.append(le)
        prefix_list_index = prefix_list_index + 1
    rb = BrtcReprBuilder()
    params = {
        'Input Columns': input_cols,
        "Prefix Type": prefix,
        "Suffix Type": suffix,
        "Drop Last": drop_last,
        "Number of values per feature": n_values,
        "Categorical features": categorical_features,
        "Error handling": handle_unknown
    }
    summary_table = pd.DataFrame()
    summary_table['Input Columns'] = input_cols
    summary_table['No. distinct classes'] = number_distinct_classes
    if drop_last:
        summary_table[
            'New encoded columns'] = new_col_names_list_with_true_drop_last
    else:
        summary_table['New encoded columns'] = new_col_names_list
    rb.addMD(
        strip_margin("""
    | ## One Hot Encoder Model
    | ### Parameters
    | {params}
    |
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    out_model['drop_last'] = drop_last
    out_model['_repr_brtc_'] = rb.get()
    if drop_last:
        out_model[
            'new_col_names_list_with_true_drop_last'] = new_col_names_list_with_true_drop_last
    out_model['new_col_names_list'] = new_col_names_list
    return {'out_table': out_table, 'model': out_model}
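
A short usage sketch, assuming a scikit-learn version old enough to accept the legacy OneHotEncoder parameters used above; the column name is illustrative:

import pandas as pd

df = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})
res = _one_hot_encoder2(df, input_cols=['color'], prefix='column', suffix='label')
print(res['out_table'].columns.tolist())
print(res['model']['new_col_names_list'])
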
Example #3
def _collaborative_filtering_train(table,
                                   user_col,
                                   item_col,
                                   rating_col,
                                   N=10,
                                   k=5,
                                   based='item',
                                   mode='train',
                                   method='cosine',
                                   weighted=True,
                                   centered=True,
                                   targets=None):
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    # Dense item-user rating matrix; ratings are stored with a +1 offset so a
    # genuine rating of 0 is distinguishable from a missing entry (the offset
    # is removed again after centering).
    item_users = np.zeros(
        (len(item_encoder.classes_), len(user_encoder.classes_)))
    for i in range(len(table_user_col)):
        item_users[item_correspond[i]][user_correspond[i]] = rating_col[i] + 1
    centered_ratings = item_users.copy()
    if centered:
        check_cen = csr_matrix(centered_ratings)
    num_item, num_user = item_users.shape
    if centered:
        for item in range(num_item):
            index = 0
            sum = 0
            for user, rating in _nonzeros(check_cen, item):
                index += 1
                sum += rating
            avg = sum / index
            for user, rating in _nonzeros(check_cen, item):
                centered_ratings[item][user] -= avg
    for i in range(len(table_user_col)):
        item_users[item_correspond[i]][user_correspond[i]] -= 1
    if method == 'adjusted':
        check_cen = csr_matrix(np.transpose(centered_ratings))
    if method == 'adjusted':
        for user in range(num_user):
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                sum += rating
            avg = sum / num_item
            for item in range(num_item):
                centered_ratings[item][user] -= avg
        method = 'cosine'

    if based == 'item':
        similar_coeff = np.zeros((num_item, num_item))
        for item in range(num_item):
            similar_coeff[item][item] = -1
            for diff_item in range(item + 1, num_item):
                similar_coeff[item][diff_item] = _similar_coeff(
                    centered_ratings, item, diff_item, method)
                similar_coeff[diff_item][item] = similar_coeff[item][diff_item]

    else:
        similar_coeff = np.zeros((num_user, num_user))
        for user in range(num_user):
            similar_coeff[user][user] = -1
            for diff_user in range(user + 1, num_user):
                similar_coeff[user][diff_user] = _similar_coeff(
                    np.transpose(centered_ratings), user, diff_user, method)
                similar_coeff[diff_user][user] = similar_coeff[user][diff_user]

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        for user in targets_en:
            recommendations_corre = _recommend(user, item_users, similar_coeff,
                                               N, k, method, weighted,
                                               centered, based)
            recommendations = []
            for (item, rating) in recommendations_corre:
                recommendations += [
                    item_encoder.inverse_transform([item])[0], rating
                ]
            Topn_result += [recommendations]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1,
                                ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters} 
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['similar_coeff'] = similar_coeff
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
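
A usage sketch for the Top-N recommendation path, assuming the module-level helpers _nonzeros, _similar_coeff and _recommend that the function calls; the ratings are illustrative:

import pandas as pd

ratings = pd.DataFrame({
    'user': ['u1', 'u1', 'u2', 'u2', 'u3'],
    'item': ['i1', 'i2', 'i1', 'i3', 'i2'],
    'rating': [5, 3, 4, 2, 4],
})
res = _collaborative_filtering_train(ratings, 'user', 'item', 'rating',
                                     N=2, k=2, mode='Topn')
print(res['out_table'])  # one row per target user with top-N items and ratings
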
Example #4
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):

    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))

    validate(*param_validation_check)

    classifier = DecisionTreeClassifier(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split, class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report

    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
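
A minimal training sketch, assuming an older scikit-learn (this code still passes presort and min_impurity_split) plus the Brightics validation and plotting helpers; the toy data is illustrative:

import pandas as pd

df = pd.DataFrame({
    'x1': [1.0, 1.2, 3.1, 3.3],
    'x2': [0.5, 0.4, 2.0, 2.2],
    'label': ['a', 'a', 'b', 'b'],
})
res = _decision_tree_classification_train(df, feature_cols=['x1', 'x2'],
                                          label_col='label', max_depth=2)
print(res['model']['feature_importance'])
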
Example #5
def _outlier_detection_tukey_carling(table,
                                     input_cols,
                                     outlier_method='tukey',
                                     multiplier=None,
                                     number_of_removal=1,
                                     result_type='add_prediction',
                                     new_column_prefix='is_outlier_'):
    out_table = table.copy()
    median = out_table.median()
    q1s = out_table.quantile(0.25)
    q3s = out_table.quantile(0.75)
    iqrs = q3s - q1s
    output_col_names = []

    if outlier_method == 'tukey':
        if multiplier is None:
            multiplier = 1.5
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda x: _tukey(x, q1s[col], q3s[col], iqrs[col], multiplier))
    elif outlier_method == 'carling':
        if multiplier is None:
            multiplier = 2.3
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda x: _carling(x, median[col], iqrs[col], multiplier))
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
        out_table = out_table.drop(output_col_names, axis=1)
    elif result_type == 'both':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
    else:
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Result Type': result_type,
        'New Column Prefix': new_column_prefix
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['input_cols'] = input_cols
    model['outlier_method'] = outlier_method
    model['multiplier'] = multiplier
    model['number_of_removal'] = number_of_removal
    model['result_type'] = result_type
    model['median'] = median
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
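
A usage sketch, assuming the _tukey and _carling helpers from the same module label each value 'in' or 'out' (the 'out' label is what the removal logic above checks); the data is illustrative:

import pandas as pd

df = pd.DataFrame({'value': [1.0, 1.1, 0.9, 1.2, 15.0]})
res = _outlier_detection_tukey_carling(df, input_cols=['value'],
                                       outlier_method='tukey')
print(res['out_table']['is_outlier_value'])
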
Example #6
def _random_forest_regression_train(table,
                                    feature_cols,
                                    label_col,
                                    n_estimators=10,
                                    criterion="mse",
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0,
                                    max_features="None",
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0,
                                    random_state=None):

    X_train = table[feature_cols]
    y_train = table[label_col]

    if max_features == "None":
        max_features = None

    regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state)
    regressor.fit(X_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'random_state': random_state
    }

    model = dict()
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(
        feature_cols, regressor)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Random Forest Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=fig_feature_importances)))

    model['_repr_brtc_'] = rb.get()

    return {'model': model}
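
A minimal usage sketch, assuming the module's _plot_feature_importances helper and the Brightics report utilities; the regression data is illustrative:

import pandas as pd

df = pd.DataFrame({
    'x1': [1, 2, 3, 4, 5],
    'x2': [2, 1, 4, 3, 5],
    'y': [1.1, 1.9, 3.2, 3.9, 5.1],
})
res = _random_forest_regression_train(df, feature_cols=['x1', 'x2'],
                                      label_col='y', n_estimators=20,
                                      random_state=42)
print(res['model']['regressor'].predict(df[['x1', 'x2']]))
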
Example #7
def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):

    df1 = [table[col] for col in input_cols_1]
    df2 = [table[col] for col in input_cols_2]

    # cross table
    if result == 'N':
        result_table = pd.crosstab(df1, df2, margins=margins)
    elif result == 'N / Row Total':
        result_table = pd.crosstab(df1,
                                   df2,
                                   margins=margins,
                                   normalize='index')
    elif result == 'N / Column Total':
        result_table = pd.crosstab(df1,
                                   df2,
                                   margins=margins,
                                   normalize='columns')
    elif result == 'N / Total':
        result_table = pd.crosstab(df1, df2, margins=margins, normalize='all')
    else:
        raise_runtime_error("Please check 'result'.")

    # each row and column name
    row_names = list(result_table.index)[:]
    if len(input_cols_1) == 1:
        joined_row_name = [str(i) for i in row_names]
    else:
        if not margins:
            joined_row_name = [
                '_'.join(str(s) for s in row_names[i])
                for i in range(len(row_names))
            ]
        else:
            joined_row_name = [
                '_'.join(str(s) for s in row_names[i])
                for i in range(len(row_names) - 1)
            ] + [row_names[-1][0]]

    column_names = list(result_table.columns)[:]
    if len(input_cols_2) == 1:
        joined_column_name = [str(i) for i in column_names]
    else:
        if not margins:
            joined_column_name = [
                '_'.join(str(s) for s in column_names[i])
                for i in range(len(column_names))
            ]
        else:
            joined_column_name = [
                '_'.join(str(s) for s in column_names[i])
                for i in range(len(column_names) - 1)
            ] + [column_names[-1][0]]

    # cross table
    if result == 'N':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N', joined_column_name)
    # cross table normalize by row
    elif result == 'N / Row Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Row Total', joined_column_name)
    # cross table normalize by column
    elif result == 'N / Column Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Column Total',
                                         joined_column_name)
    # cross table normalize by all values
    elif result == 'N / Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Total', joined_column_name)
    else:
        raise_runtime_error("Please check 'result'.")

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table,
                                        num_rows=len(result_table.index) +
                                        1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
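
A usage sketch, assuming the Brightics report helpers; the input columns are illustrative:

import pandas as pd

df = pd.DataFrame({
    'gender': ['m', 'f', 'f', 'm', 'f'],
    'bought': ['yes', 'no', 'yes', 'yes', 'no'],
})
res = _cross_table(df, input_cols_1=['gender'], input_cols_2=['bought'], result='N')
print(res['model']['result_table'])
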
Example #8
def _correlation(table,
                 vars,
                 method='pearson',
                 display_plt=True,
                 height=2.5,
                 corr_prec=2):

    size = len(vars)
    result_arr = []
    cov_xy = table[vars].cov()

    for i in range(size):
        for j in range(i):
            cov_temp = cov_xy[vars[i]][vars[j]]
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            else:
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])

            result_arr.append([vars[i], vars[j], r, p, cov_temp])

    df_result = pd.DataFrame(result_arr,
                             columns=['x', 'y', 'corr', 'p_value', 'cov'])

    rb = BrtcReprBuilder()
    if display_plt:
        s_default = plt.rcParams['lines.markersize']**2.
        scatter_kws = {"s": s_default * height / 6.4}

        def corr(x, y, **kwargs):
            if kwargs['method'] == 'pearson':
                r, p = stats.pearsonr(x, y)
            elif kwargs['method'] == 'spearman':
                r, p = stats.spearmanr(x, y)
            else:
                r, p = stats.kendalltau(x, y)

            p_stars = ''
            if p <= 0.05:
                p_stars = '*'
            if p <= 0.01:
                p_stars = '**'
            if p <= 0.001:
                p_stars = '***'

            corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
            font_size = abs(r) * 15 * 2 / corr_prec + 5
            ax = plt.gca()
            ax.annotate(corr_text, [
                .5,
                .5,
            ],
                        xycoords="axes fraction",
                        ha='center',
                        va='center',
                        fontsize=font_size * height)
            ax.annotate(p_stars,
                        xy=(0.65, 0.6),
                        xycoords=ax.transAxes,
                        color='red',
                        fontsize=17 * height)

        g = sns.PairGrid(table, vars=vars, height=height)
        g.map_diag(sns.distplot)
        if method == 'pearson':
            g.map_lower(sns.regplot, scatter_kws=scatter_kws)
        else:
            g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
        g.map_upper(corr, method=method)

        fig_corr = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin(""" ## Correlation Results
            | ### Correlation Matrix
            | {fig_corr}
            |
            | ### Correlation Table
            | {table}
            """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

        params = {'vars': vars, 'method': method, 'height': height}

    else:
        rb.addMD(
            strip_margin(""" ## Correlation Results
            | ### Correlation Table
            | {table}
            """.format(table=pandasDF2MD(df_result))))

        params = {'vars': vars, 'method': method}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()

    return {'result': res}
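
A usage sketch with plotting disabled, so only scipy.stats and the report helpers are exercised; the columns are illustrative:

import pandas as pd

df = pd.DataFrame({
    'a': [1.0, 2.0, 3.0, 4.0, 5.0],
    'b': [2.1, 3.9, 6.2, 8.1, 9.8],
    'c': [5.0, 3.0, 4.0, 1.0, 2.0],
})
res = _correlation(df, vars=['a', 'b', 'c'], method='pearson', display_plt=False)
print(res['result']['corr_table'])
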
Example #9
def _doc2vec(table,
             input_col,
             dm=1,
             vector_size=100,
             window=10,
             min_count=1,
             max_vocab_size=None,
             train_epoch=100,
             workers=1,
             alpha=0.025,
             min_alpha=0.025,
             seed=None,
             hs=1,
             negative=5,
             ns_exponent=0.75,
             topn=30,
             hashfxn=hash):
    if seed is None:
        random_state = seed
        seed = randint(0, 0xffffffff)
    else:
        random_state = seed

    docs = table[input_col]
    tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]

    # hs = 1 if hs is True else 0
    if isinstance(dm, str):
        dm = int(dm)
    algo = {1: 'PV-DM', 0: 'PV_DBOW'}[dm]

    d2v = Doc2Vec(documents=tagged_docs,
                  dm=dm,
                  vector_size=vector_size,
                  window=window,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  seed=seed,
                  min_count=min_count,
                  max_vocab_size=max_vocab_size,
                  workers=workers,
                  epochs=train_epoch,
                  hs=hs,
                  negative=negative,
                  ns_exponent=ns_exponent,
                  hashfxn=hashfxn)

    vocab = d2v.wv.vocab

    params = {
        'Input column': input_col,
        'Training algorithm': algo,
        'Dimension of Vectors': vector_size,
        'Window': window,
        'Minimum count': min_count,
        'Max vocabulary size': max_vocab_size,
        'Train epoch': train_epoch,
        'Number of workers': workers,
        'Alpha': alpha,
        'Minimum alpha': min_alpha,
        'Seed': random_state,
        'Hierarchical softmax': hs,
        'Negative': negative,
        'Negative sampling exponent': ns_exponent
    }

    # tsne visualization
    length = len(vocab)
    if length < topn:
        topn = length
    topn_words = sorted(vocab, key=vocab.get, reverse=True)[:topn]

    X = d2v.wv[topn_words]  # look up word vectors via the KeyedVectors API
    tsne = TSNE(n_components=min(2, topn), random_state=seed)
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=topn_words, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(50, 40)
    ax = fig.add_subplot(1, 1, 1)

    ax.scatter(df['x'], df['y'], s=1000)
    ax.tick_params(axis='both', which='major', labelsize=50)

    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=80)
    plt.show()
    fig = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Doc2Vec Result
    |
    | ### Total Number of words
    | {length}
    |
    | ### Top {topn} Words
    | {topn_words}
    | {fig}
    |
    | ### Parameters
    | {params}
    """.format(length=length,
               topn=topn,
               topn_words=topn_words,
               fig=fig,
               params=dict2MD(params))))

    model = _model_dict('doc2vec_model')
    model['params'] = params
    model['d2v'] = d2v
    model['_repr_brtc_'] = rb.get()
    model['hash_val(Brightics)'] = hashfxn('Brightics')

    out_table1 = table.copy()
    out_table1['document_vectors'] = [
        d2v.infer_vector(doc.words).tolist() for doc in tagged_docs
    ]
    out_table2 = pd.DataFrame({
        'words': d2v.wv.index2word,
        'word_vectors': d2v.wv[vocab].tolist()
    })

    return {'model': model, 'doc_table': out_table1, 'word_table': out_table2}
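
A minimal usage sketch, assuming gensim 3.x (the code reads d2v.wv.vocab and wv.index2word, which were removed in gensim 4) and a scikit-learn TSNE contemporary with it; the token lists are illustrative:

import pandas as pd

docs = pd.DataFrame({'tokens': [['hello', 'world'],
                                ['machine', 'learning'],
                                ['hello', 'machine']]})
res = _doc2vec(docs, input_col='tokens', vector_size=20, train_epoch=10, topn=4)
print(res['doc_table']['document_vectors'].head())
print(res['word_table'].head())
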
Example #10
def _glm_train(table,
               feature_cols,
               label_col,
               family="Gaussian",
               link="auto",
               fit_intercept=True):
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if label_col in feature_cols:
        raise_runtime_error("%s is duplicated." % label_col)

    if link == "auto":
        sm_link = None
    elif link == "ident":
        sm_link = sm.families.links.identity
    elif link == "log":
        sm_link = sm.families.links.log
    elif link == "logit":
        sm_link = sm.families.links.logit
    elif link == "probit":
        sm_link = sm.families.links.probit
    elif link == "cloglog":
        sm_link = sm.families.links.CLogLog
    elif link == "pow":
        sm_link = sm.families.links.Power
    elif link == "nbinom":
        sm_link = sm.families.links.nbinom  # negative-binomial link

    if family == "Gaussian":
        sm_family = sm.families.Gaussian(sm_link)
    elif family == "inv_Gaussian":
        sm_family = sm.families.InverseGaussian(sm_link)
    elif family == "binomial":
        sm_family = sm.families.Binomial(sm_link)
    elif family == "Poisson":
        sm_family = sm.families.Poisson(sm_link)
    elif family == "neg_binomial":
        sm_family = sm.families.NegativeBinomial(sm_link)
    elif family == "gamma":
        sm_family = sm.families.Gamma(sm_link)
    elif family == "Tweedie":
        sm_family = sm.families.Tweedie(sm_link)

    if fit_intercept:
        glm_model = sm.GLM(label, sm.add_constant(features),
                           family=sm_family).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family).fit()
    summary = glm_model.summary().as_html()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    glm_model.remove_data()
    model['glm_model'] = glm_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
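
A usage sketch for a Gaussian GLM with the automatic link, assuming statsmodels is imported as sm and the check_col_type helper is available; the data is illustrative:

import pandas as pd

df = pd.DataFrame({
    'x1': [1.0, 2.0, 3.0, 4.0, 5.0],
    'y': [1.2, 1.9, 3.1, 4.2, 4.8],
})
res = _glm_train(df, feature_cols=['x1'], label_col='y', family='Gaussian')
print(res['model']['coefficients'])
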
Example #11
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    classifier = XGBClassifier(
        max_depth=max_depth, learning_rate=learning_rate,
        n_estimators=n_estimators, silent=silent, objective=objective,
        booster=booster, n_jobs=n_jobs, nthread=nthread, gamma=gamma,
        min_child_weight=min_child_weight, max_delta_step=max_delta_step,
        subsample=subsample, colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha,
        reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
        base_score=base_score, random_state=random_state, seed=seed,
        missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds, verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    #     plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    #     get_param_list = []
    #     get_param_list.append(['feature_cols', feature_cols])
    #     get_param_list.append(['label_col', label_col])

    params = dict2MD(get_param)
    #     for key, value in get_param.items():
    #         temp = [key, value]
    #         get_param_list.append(temp)
    #     get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
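
A minimal usage sketch, assuming the xgboost scikit-learn wrapper (XGBClassifier, plot_importance) in a version that still accepts silent and nthread; the binary labels are illustrative:

import pandas as pd

df = pd.DataFrame({
    'x1': [1, 2, 3, 4, 5, 6],
    'x2': [6, 5, 4, 3, 2, 1],
    'label': [0, 0, 0, 1, 1, 1],
})
res = _xgb_classification_train(df, feature_cols=['x1', 'x2'], label_col='label',
                                n_estimators=10, max_depth=2)
print(res['model']['classifier'].predict(df[['x1', 'x2']]))
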
Example #12
def _kmeans_train_predict(table,
                          input_cols,
                          n_clusters=3,
                          prediction_col='prediction',
                          init='k-means++',
                          n_init=10,
                          max_iter=300,
                          tol=1e-4,
                          precompute_distances='auto',
                          seed=None,
                          n_jobs=1,
                          algorithm='auto',
                          n_samples=None):
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)

    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    k_means = SKKMeans(n_clusters=n_clusters,
                       init=init,
                       n_init=n_init,
                       max_iter=max_iter,
                       tol=tol,
                       precompute_distances=precompute_distances,
                       verbose=0,
                       random_state=seed,
                       copy_x=True,
                       n_jobs=n_jobs,
                       algorithm=algorithm)

    k_means.fit(inputarr)

    params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm
    }

    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_,
               fig_cluster_centers=fig_centers,
               fig_pca=fig_pca,
               fig_samples=fig_samples,
               params=dict2MD(params))))

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
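
A usage sketch, assuming a scikit-learn old enough to accept precompute_distances/n_jobs on KMeans and the module's _kmeans_*_plot helpers; two well-separated blobs are used for illustration:

import pandas as pd

df = pd.DataFrame({
    'x': [1.0, 1.1, 0.9, 5.0, 5.1, 4.9],
    'y': [1.0, 0.9, 1.1, 5.0, 4.9, 5.1],
})
res = _kmeans_train_predict(df, input_cols=['x', 'y'], n_clusters=2, seed=0)
print(res['out_table']['prediction'].tolist())
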
Example #13
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    silhouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silhouette_samples_list.append(samples)

        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhouette metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
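
A usage sketch that scans k = 2..4 on three separated blobs, under the same version assumptions as the previous example:

import pandas as pd

df = pd.DataFrame({
    'x': [1.0, 1.2, 0.8, 5.0, 5.2, 4.8, 9.0, 9.2, 8.8],
    'y': [1.0, 0.8, 1.2, 5.0, 4.8, 5.2, 9.0, 8.8, 9.2],
})
res = _kmeans_silhouette_train_predict(df, input_cols=['x', 'y'],
                                       n_clusters_list=range(2, 5), seed=0)
print(res['model']['best_k'])  # likely 3 for this layout
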
Example #14
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              gamma_val,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None,
                              class_weight=None):
    _table = table.copy()

    feature_names, features = check_col_type(table, feature_cols)
    _label_col = _table[label_col]

    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    class_labels = sorted(set(_label_col))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        else:
            class_weight = {
                class_labels[i]: class_weight[i]
                for i in range(len(class_labels))
            }

    if gamma == 'other':
        _gamma = gamma_val
    else:
        _gamma = gamma
    _svc = svm.SVC(C=c,
                   kernel=kernel,
                   degree=degree,
                   gamma=_gamma,
                   coef0=coef0,
                   shrinking=shrinking,
                   probability=probability,
                   tol=tol,
                   max_iter=max_iter,
                   random_state=random_state,
                   class_weight=class_weight)
    _svc_model = _svc.fit(features, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_names
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
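
A usage sketch; note that gamma_val is a required positional argument which is only consulted when gamma == 'other':

import pandas as pd

df = pd.DataFrame({
    'x1': [1.0, 1.1, 0.9, 3.0, 3.1, 2.9],
    'x2': [1.0, 0.9, 1.1, 3.0, 2.9, 3.1],
    'label': ['a', 'a', 'a', 'b', 'b', 'b'],
})
res = _svm_classification_train(df, feature_cols=['x1', 'x2'], label_col='label',
                                gamma_val=0.1, gamma='auto')
print(res['model']['svc_model'].predict(df[['x1', 'x2']]))
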
Example #15
def _svd2(table,
          input_cols,
          new_column_name='projected_',
          full_matrices=False):
    A = table[input_cols]

    u, s, vh = np.linalg.svd(A, full_matrices=full_matrices)
    projection = []
    for i in range(len(s)):
        projection += [(u.T[i] * s[i])]
    projection = np.array(projection).T
    s_normal = []
    for i in range(len(s)):
        if i == 0:
            s_normal += [s[i] / s.sum()]
        else:
            s_normal += [s[i] / s.sum() + s_normal[i - 1]]
    s = [s] + [s_normal]
    s = np.array(s)
    v = vh.T
    column_name_u = []
    column_name_s = []
    column_name_v = []
    column_name_projection = []
    for i in range(u.shape[1]):
        column_name_u += ['u%d' % (i + 1)]
    for i in range(s.shape[1]):
        column_name_s += ['s%d' % (i + 1)]
    for i in range(v.shape[1]):
        column_name_v += ['v%d' % (i + 1)]
    for i in range(s.shape[1]):
        column_name_projection += [new_column_name + '%d' % (i + 1)]

    out_table4 = pd.DataFrame(data=projection,
                              columns=column_name_projection)
    out_table4 = pd.concat([table.reset_index(drop=True), out_table4], axis=1)
    out_table4.columns = table.columns.values.tolist() + column_name_projection

    res_param1 = {}
    res_param1['Input Columns'] = input_cols
    res_param1['full_matrices'] = full_matrices

    res_param2 = {}
    res_param2['u'] = u.shape
    res_param2['s'] = s.shape
    res_param2['v'] = v.shape
    res_param2['Projected Matrix'] = projection.shape

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVD Result
    |
    | ### Dimensions of Matrices
    | {parameter2}
    |
    | ### Parameters
    | {parameter1}
    """.format(parameter1=dict2MD(res_param1),
               parameter2=dict2MD(res_param2))))

    model = _model_dict('svd')
    model['right_singular_vectors'] = pd.DataFrame(v, columns=column_name_v)
    model['input_cols'] = input_cols
    model['parameters'] = res_param1
    model['_repr_brtc_'] = rb.get()

    return {
        'out_table1': pd.DataFrame(u, columns=column_name_u),
        'out_table2': pd.DataFrame(s, columns=column_name_s),
        'out_table3': pd.DataFrame(v, columns=column_name_v),
        'out_table4': out_table4,
        'model': model
    }
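
A usage sketch; out_table2 holds the singular values in its first row and their cumulative shares in the second, per the s_normal computation above:

import pandas as pd

df = pd.DataFrame({
    'a': [1.0, 2.0, 3.0, 4.0],
    'b': [2.0, 1.0, 4.0, 3.0],
    'c': [0.5, 1.5, 2.5, 3.5],
})
res = _svd2(df, input_cols=['a', 'b', 'c'])
print(res['out_table2'])
print(res['out_table4'])  # original columns plus projected_1..projected_3
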
Example #16
def _collaborative_filtering_train(table,
                                   user_col,
                                   item_col,
                                   rating_col,
                                   N=10,
                                   filter=True,
                                   k=5,
                                   based='item',
                                   mode='train',
                                   method='cosine',
                                   weighted=True,
                                   centered=True,
                                   targets=None,
                                   normalize=True,
                                   workers=1,
                                   filter_minus=False,
                                   maintain_already_scored=True):
    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    if based == 'item':
        item_users = csr_matrix(
            (table_rating_col, (item_correspond, user_correspond)))
        # ratings shifted by +1 so zero ratings stay distinct from empty cells
        check_cen = csr_matrix(
            (table_rating_col + 1, (item_correspond, user_correspond)))
    else:
        item_users = csr_matrix(
            (table_rating_col, (user_correspond, item_correspond)))
        check_cen = csr_matrix(
            (table_rating_col + 1, (user_correspond, item_correspond)))
    centered_ratings = item_users.copy()

    num_item, num_user = item_users.shape
    if centered:
        update_item = []
        update_user = []
        update_rating = []
        for item in range(num_item):
            count = 0
            total = 0
            for user, rating in _nonzeros(check_cen, item):
                count += 1
                total += rating
            # ratings were stored shifted by +1, so subtract 1 to recover the mean
            avg = total / count - 1
            for user, rating in _nonzeros(check_cen, item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)

        centered_ratings -= csr_matrix(
            (update_rating, (update_item, update_user)))
    if (method == 'adjusted' or normalize) and based == 'item':
        check_cen = check_cen.transpose().tocsr()
    if based == 'user':
        num_user, num_item = num_item, num_user
    user_avg = []
    if normalize:
        for user in range(num_user):
            count = 0
            total = 0
            for _, rating in _nonzeros(check_cen, user):
                count += 1
                total += rating
            avg = total / count
            user_avg.append(avg)
    if method == 'adjusted':
        update_item = []
        update_user = []
        update_rating = []
        for user in range(num_user):
            total = 0
            for item, rating in _nonzeros(check_cen, user):
                total += rating
            avg = total / num_item
            for item in range(num_item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        if based == 'item':
            centered_ratings -= csr_matrix(
                (update_rating, (update_item, update_user)))
        else:
            centered_ratings -= csr_matrix(
                (update_rating, (update_user, update_item)))
        method = 'cosine'
    if based == 'user':
        num_user, num_item = num_item, num_user

    if method == 'cosine':
        similar_coeff = cosine_similarity(centered_ratings)
    elif method == 'pearson':
        result = []
        for i in centered_ratings.toarray():
            result.append(i - np.average(i))
        similar_coeff = cosine_similarity(result)
    elif method == 'jaccard':
        similar_coeff = 1 - pairwise_distances(centered_ratings.toarray(),
                                               metric="hamming")
    if based == 'user':
        item_users = item_users.transpose().tocsr()

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        if pd.api.types.is_numeric_dtype(table_user_col):  # np.int was removed from NumPy
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = _recommend(user, item_users,
                                                   similar_coeff, N, k, method,
                                                   weighted, centered, based,
                                                   normalize, user_avg, filter,
                                                   filter_minus,
                                                   maintain_already_scored)
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [
                        item_encoder.inverse_transform([item])[0], rating
                    ]
                Topn_result += [recommendations]
        else:
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(
                targets_en,
                _recommend_multi,
                item_users=item_users,
                similar_coeff=similar_coeff,
                N=N,
                k=k,
                method=method,
                weighted=weighted,
                centered=centered,
                based=based,
                normalize=normalize,
                user_avg=user_avg,
                item_encoder=item_encoder,
                workers=workers,
                filter_minus=filter_minus,
                maintain_already_scored=maintain_already_scored)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1,
                                ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters} 
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['method'] = method
    model['centered_ratings'] = centered_ratings
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg
    return {'model': model}
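
The heart of the function is building a sparse ratings matrix and taking pairwise similarities over its rows. A minimal sketch of that step with hypothetical (item, user, rating) triples, assuming SciPy and scikit-learn:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

ratings = np.array([5.0, 3.0, 4.0, 1.0])
items = np.array([0, 0, 1, 1])
users = np.array([0, 1, 0, 1])
item_users = csr_matrix((ratings, (items, users)))  # rows: items, cols: users
similar_coeff = cosine_similarity(item_users)       # item-item similarity matrix
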
Example #17
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True,
                             is_vif=False,
                             vif_threshold=10):
    features = table[feature_cols]
    label = table[label_col]

    if fit_intercept:
        features = sm.add_constant(features, has_constant='add')
    lr_model_fit = sm.OLS(label, features).fit()

    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    if is_vif:
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
    summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.histplot(residual, kde=True)  # distplot was removed from recent seaborn
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
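
A minimal sketch of the statsmodels fit this function wraps, on hypothetical data; the summary tables and diagnostics above are all derived from the same fitted object:

import numpy as np
import pandas as pd
import statsmodels.api as sm

df = pd.DataFrame({'x': np.arange(10.0)})
df['y'] = 2.0 * df['x'] + 1.0
X = sm.add_constant(df[['x']], has_constant='add')  # adds the intercept column
fit = sm.OLS(df['y'], X).fit()
print(fit.params)    # const ~ 1.0, x ~ 2.0
print(fit.rsquared)
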
Example #18
def _penalized_linear_regression_train(table,
                                       feature_cols,
                                       label_col,
                                       regression_type='ridge',
                                       alpha=1.0,
                                       l1_ratio=0.5,
                                       fit_intercept=True,
                                       max_iter=1000,
                                       tol=0.0001,
                                       random_state=None):
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = list(feature_cols)
    out_table1['coefficient'] = regression_model.coef_  # model is already fitted above
    if fit_intercept:
        intercept = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed from pandas; concat instead
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.histplot(residual, kde=True)  # distplot was removed from recent seaborn
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients

    plt.figure()
    predictors = features.columns
    coef = pd.Series(regression_model.coef_, index=predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1),
               score=dict2MD(score))))
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
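
A minimal sketch of the three estimators the function dispatches between, on synthetic data with hypothetical hyperparameters:

from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet, Lasso, Ridge

X, y = make_regression(n_samples=50, n_features=3, noise=0.1, random_state=0)
for model in (Ridge(alpha=1.0), Lasso(alpha=0.1),
              ElasticNet(alpha=0.1, l1_ratio=0.5)):
    model.fit(X, y)
    print(type(model).__name__, model.coef_, model.intercept_)
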
Example #19
def _two_sample_ttest_for_stacked_data(table,
                                       response_cols,
                                       factor_col,
                                       alternatives,
                                       first=None,
                                       second=None,
                                       hypo_diff=0,
                                       equal_vari='pooled',
                                       confi_level=0.95):

    if type(table[factor_col][0]) != str:
        if type(table[factor_col][0]) == bool:
            if first is not None:
                first = bool(first)
            if second is not None:
                second = bool(second)
        else:
            if first is not None:
                first = float(first)
            if second is not None:
                second = float(second)
    if first is None or second is None:
        tmp_factors = []
        if first is not None:
            tmp_factors += [first]
        if second is not None:
            tmp_factors += [second]
        for i in range(len(table[factor_col])):
            if table[factor_col][i] is not None and table[factor_col][i] not in tmp_factors:
                if len(tmp_factors) == 2:
                    raise Exception("There are more than 2 factors.")
                else:
                    tmp_factors += [table[factor_col][i]]
    if first is None:
        if tmp_factors[0] != second:
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if second is None:
        if tmp_factors[0] != first:
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))
    
    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        if equal_vari == 'auto':
            start_auto = 1
            f_value = (std1 ** 2) / (std2 ** 2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1)
            if f_test_p_value_tmp > 0.5:
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            # a two-sided F test for equal variances decides which t test to use
            if f_test_p_value < 0.05:
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        # default result; each requested alternative recomputes it below
        ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
        
        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2) / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means > 0.0'] +
                          [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] +
                          ['true difference in means > 0.0'] +
                          ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] +
                          [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2) / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means < 0.0'] +
                          [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] +
                          ['true difference in means < 0.0'] +
                          ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] +
                          [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]]

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level + 1) / 2, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level + 1) / 2, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means != 0.0'] +
                          [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] +
                          ['true difference in means != 0.0'] +
                          ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] +
                          [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = ['alternative hypothesis', 'p-value', '%g%% confidence interval' % (confi_level * 100)]
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model))))
        if start_auto == 1:
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()    
    return {'out_table' : result, 'model' : model}
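
A minimal sketch of the statsmodels call at the core of each branch above, on hypothetical samples; it returns the t statistic, the p-value, and the degrees of freedom used for the confidence margins:

import numpy as np
from statsmodels.stats.weightstats import ttest_ind

a = np.array([5.1, 4.9, 5.3, 5.0])
b = np.array([4.2, 4.4, 4.1, 4.3])
tstat, pvalue, dof = ttest_ind(a, b, alternative='two-sided',
                               usevar='pooled', value=0)
print(tstat, pvalue, dof)
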
Example #20
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    data = table[input_col]

    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    acf_ret = acf(data, nlags=nlags, alpha=1 - conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1 - conf_level)

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]

    if conf_level is not None:
        result_table1['%g%% Confidence Interval' % (conf_level * 100)] = [
            str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in range(nlags + 1)
        ]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]

    if conf_level is not None:
        result_table2['%g%% Confidence Interval' % (conf_level * 100)] = [
            str((pacf_ret[1][i][0], pacf_ret[1][i][1]))
            for i in range(nlags + 1)
        ]

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(
        strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
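
A minimal sketch of the two statsmodels calls used above, on a hypothetical series; with alpha set, each returns the estimates together with their confidence intervals:

import numpy as np
from statsmodels.tsa.stattools import acf, pacf

rng = np.random.RandomState(0)
x = np.sin(np.linspace(0, 20, 200)) + rng.normal(0, 0.1, 200)
acf_vals, acf_confint = acf(x, nlags=10, alpha=0.05)
pacf_vals, pacf_confint = pacf(x, nlags=10, alpha=0.05)
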
Example #21
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None,
         hue=None,
         alpha=0,
         key_col=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    validate(greater_than_or_equal_to(n_components, 1, 'n_components'))

    # fit with all components; the projection is sliced to n_components below
    pca = PCA(n_components=None, copy=copy, whiten=whiten,
              svd_solver=svd_solver, tol=tol, iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(n_components):
        column_names.append(new_column_name + str(i))

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(x=column_names[0], y=column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
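
A minimal sketch of the scikit-learn model the function wraps, on hypothetical data; the scree plot above is built from the same explained-variance ratios:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).normal(size=(100, 4))
pca = PCA(n_components=2)
projected = pca.fit_transform(X)              # rows projected onto 2 components
print(pca.explained_variance_ratio_.cumsum())
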
Example #22
def _naive_bayes_train(table,
                       feature_cols,
                       label_col,
                       alpha=1.0,
                       fit_prior=True,
                       class_prior=None):

    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]
                                                     ])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack(
        (list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
         (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix,
                           classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
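
A minimal sketch of the encoder-plus-MultinomialNB pattern above, with hypothetical count features:

import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 1], [1, 3], [4, 0], [0, 5]])  # non-negative counts
y = ['spam', 'ham', 'spam', 'ham']
encoder = preprocessing.LabelEncoder()
nb = MultinomialNB(alpha=1.0).fit(X, encoder.fit_transform(y))
print(encoder.inverse_transform(nb.predict([[3, 1]])))
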
Example #23
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):

    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # recent scikit-learn rejects these arguments when passed positionally
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        else:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        else:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary

    return {'model': model}
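
A minimal sketch of the underlying scikit-learn fit on a standard dataset; the summary table above is assembled from the same intercept_ and coef_ attributes:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
lr = LogisticRegression(C=1.0, solver='liblinear')  # liblinear fits one-vs-rest
lr.fit(X, y)
print(lr.intercept_, lr.coef_.shape)  # one coefficient row per class
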
Example #24
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):

    feature_names, features = check_col_type(table, feature_cols)
    features = pd.DataFrame(features, columns=feature_names)

    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # recent scikit-learn rejects these arguments when passed positionally
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)
    prob_trans = prob.T
    classes_dict = dict()
    for i in range(len(classes)):
        classes_dict[classes[i]] = i
    tmp_label = np.array([classes_dict[i] for i in label])
    # sum log-probabilities; multiplying raw probabilities underflows to 0.0
    # on large tables, which would make the log-likelihood -inf
    log_likelihood = 0.0
    for i in range(len(table)):
        log_likelihood += np.log(prob_trans[tmp_label[i]][i])
    if fit_intercept:
        k = len(feature_cols) + 1
    else:
        k = len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood
    if is_binary:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        v = np.prod(prob, axis=1)  # p * (1 - p) in the binary case; np.product is deprecated
        x_design_modi = (x_design.T * v).T
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            logit_params = coefficients
        wald = (logit_params / std_err)**2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        std_err = []
        for i in range(len(classes)):
            v = prob.T[i] * (1 - prob.T[i])
            x_design_modi = (x_design.T * v).T
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)

    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if not is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        columns = [
            'standard_error_{}'.format(classes[i]) for i in range(len(classes))
        ]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        arrange_col = ['features']
        for i in range(len(classes)):
            arrange_col.append(classes[i])
            arrange_col.append('standard_error_{}'.format(classes[i]))
        summary = summary[arrange_col]
    if is_binary:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
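
The hand-rolled standard errors, Wald statistics, AIC and BIC above can be cross-checked against statsmodels, which computes analogous quantities (note scikit-learn's fit is L2-penalized by default, so the numbers will differ slightly); a sketch on hypothetical data:

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 2))
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(size=200) > 0).astype(int)
res = sm.Logit(y, sm.add_constant(X)).fit(disp=0)
print(res.bse)       # standard errors
print(res.pvalues)   # Wald p-values
print(res.aic, res.bic)
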
Example #25
def _gsdmm(table,
           input_col,
           topic_name='topic',
           K=10,
           alpha=0.1,
           beta=0.1,
           max_iter=50,
           num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K,
                                        alpha=alpha,
                                        beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {
        old_ind: (new_ind + 1)
        for new_ind, old_ind in enumerate(nonempty_topic_indices)
    }
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains the topic column name. Please choose another name."
        }])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[
            topic_words_dict.get(word, 0) for word in vocab_set
        ] for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {
                word: dict_1.get(word, 0) + dict_2.get(word, 0)
                for word in set(dict_1).union(dict_2)
            }, topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'K': K,
        'Alpha': alpha,
        'Beta': beta,
        'Maximum number of iterations': max_iter,
        'Number of words for each topic': num_topic_words
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(
        strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
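
A minimal usage sketch of the MovieGroupProcess API exercised above, on a tiny hypothetical corpus; it assumes the same gsdmm_rwalk module imported by this code is available:

docs = [['apple', 'banana'], ['banana', 'cherry'], ['dog', 'cat']]
vocab_size = len({word for doc in docs for word in doc})
mgp = gsdmm_rwalk.MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=20)
topics = mgp.fit(docs, vocab_size)          # one topic id per document
word_dists = mgp.cluster_word_distribution  # per-topic {word: count} dicts
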
Example #26
def _normality_test(table,
                    input_cols,
                    sig_level=0,
                    method=['kstest', 'jarque_bera', 'anderson']):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Normality Test Result""")

    test_name = {'kstest': "Kolmogorov-Smirnov test",
                 'jarque_bera': "Jarque-Bera test",
                 'anderson': "Anderson-Darling test"}
    stats_name = {'kstest': "KS statistic, asymptotically Kolmogorov distribution under the null hypothesis.",
                  'jarque_bera': "JB statistic, asymptotically chi-square distribution with 2 degrees of freedom under the null hypothesis.",
                  'anderson': "A^2 statistic. The p-value is computed from the adjusted statistic."}
    
    if 'kstest' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['p_value'] = []
        result['kstest'] = dict()
        for input_col in input_cols:
            stats, pval = kstest(table[input_col], 'norm', mode='asymp')
            stats_res['data'].append(input_col)
            stats_res['estimates'].append(stats)
            stats_res['p_value'].append(pval)
            result['kstest'][input_col] = {'estimates':stats, 'p_value':pval}
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['kstest'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))
    if 'jarque_bera' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['p_value'] = []
        result['jarque_bera'] = dict()
        for input_col in input_cols:
            stats, pval = jarque_bera(table[input_col])
            stats_res['data'].append(input_col)
            stats_res['estimates'].append(stats)
            stats_res['p_value'].append(pval)
            result['jarque_bera'][input_col] = {'estimates':stats, 'p_value':pval}
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['jarque_bera'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))
    if 'anderson' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['critical value'] = []
        stats_res['significance level'] = []
        result['anderson'] = dict()
        for input_col in input_cols:
            stats, critical_val, significance_lvl = anderson(table[input_col], dist='norm')
            stats_res['data'] += [input_col]
            stats_res['estimates'] += [stats]
            stats_res['critical value'] += [critical_val[sig_level]]
            stats_res['significance level'] += [significance_lvl[sig_level]]
            result['anderson'][input_col] = {'estimates':[stats] * len(critical_val), 'critical value':list(critical_val), 'significance level':list(significance_lvl)}
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['anderson'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))
        
    result['_repr_brtc_'] = rb.get()
        
    return {'result': result}
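
A minimal sketch of the three tests on a hypothetical sample, here taken from scipy.stats (the module above may import them from elsewhere):

import numpy as np
from scipy.stats import anderson, jarque_bera, kstest

x = np.random.RandomState(0).normal(size=500)
print(kstest(x, 'norm'))       # KS statistic and p-value
print(jarque_bera(x))          # JB statistic and p-value
res = anderson(x, dist='norm')
print(res.statistic, res.critical_values, res.significance_level)
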
Example #27
def _evaluate_classification(table, label_col, prediction_col):

    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{
        'f1': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
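
A minimal sketch of the four scikit-learn metrics computed above, on hypothetical labels:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_true = ['a', 'a', 'b', 'b', 'b']
y_pred = ['a', 'b', 'b', 'b', 'a']
print(accuracy_score(y_true, y_pred))
print(f1_score(y_true, y_pred, average='weighted'))
print(precision_score(y_true, y_pred, average='weighted'))
print(recall_score(y_true, y_pred, average='weighted'))
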
Example #28
def _mlp_regression_train(table,
                          feature_cols,
                          label_col,
                          hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size_auto=True,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          max_iter=200,
                          random_state=None,
                          tol=0.0001):
    _, features = check_col_type(table, feature_cols)
    label = table[label_col]

    mlp_model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                             activation=activation,
                             solver=solver,
                             alpha=alpha,
                             batch_size=batch_size,
                             learning_rate=learning_rate,
                             learning_rate_init=learning_rate_init,
                             max_iter=max_iter,
                             shuffle=True,
                             random_state=random_state,
                             tol=tol)
    mlp_model.fit(features, label)

    predict = mlp_model.predict(features)

    intercepts = mlp_model.intercepts_
    coefficients = mlp_model.coefs_
    loss = mlp_model.loss_

    _mean_absolute_error = mean_absolute_error(label, predict)
    _mean_squared_error = mean_squared_error(label, predict)
    _r2_score = r2_score(label, predict)

    # pandas.DataFrame.from_items was removed in pandas 1.0
    result_table = pd.DataFrame(
        {'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
         'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]})

    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame(
        {'Parameter': list(label_name.values()),
         'Value': [get_param[x] for x in label_name.keys()]})

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ### MLP Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['loss'] = mlp_model.loss_
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
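
A minimal sketch of the wrapped regressor on synthetic data with hypothetical hyperparameters:

from sklearn.datasets import make_regression
from sklearn.neural_network import MLPRegressor

X, y = make_regression(n_samples=100, n_features=4, noise=0.5, random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(100,), solver='adam',
                   max_iter=500, random_state=0)
mlp.fit(X, y)
print(mlp.loss_, mlp.score(X, y))  # training loss and R2
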
Example #29
def _dim(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dim_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'fixed'}
    if seed is not None:
        dim_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dim_params)

    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=t, topn=num_topic_word) for topic_id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    # original influence table: influences_time[time_slice][document_no][topic_no]
    influence_arr = np.vstack(dtm_model.influences_time)
    influence_table = pd.DataFrame(influence_arr, columns=columns)
    time_id = np.concatenate([t * np.ones(duration) for t, duration in enumerate(time_slice)])
    influence_table['time'] = time_id
    influence_table = influence_table[['time'] + columns]

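    # Per-period topic coherence: 'u_mass' needs only the corpus, while 'c_v'
    # also requires the tokenized texts.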
    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

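    # Build an interactive pyLDAvis view of the topics at the requested time slice.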
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Document Influence Model Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'influence_table': influence_table, 'model': model}
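
A usage sketch for _dim (illustrative data, not from the source; it assumes
the platform-specific DTM binary resolved above actually exists at dtm_path).
Six toy documents are split into two time slices of three documents each:

import pandas as pd

docs = pd.DataFrame({'tokens': [
    ['economy', 'growth', 'rate'], ['economy', 'policy', 'bank'],
    ['growth', 'market', 'trade'], ['election', 'policy', 'vote'],
    ['vote', 'campaign', 'party'], ['party', 'election', 'debate'],
]})
# time_slice must sum to the number of documents (here 3 + 3 = 6).
res = _dim(docs, input_col='tokens', num_topic=2, time_slice=[3, 3], seed=7)
res['topic_table']      # top words per topic, one row per time slice
res['out_table']        # input rows plus dominant topic and its distribution
res['influence_table']  # per-document influence on each topic
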
Example #30
def _gaussian_mixture_train(table, input_cols, number_of_components=1, covariance_type='full', tolerance=0.001, \
                            regularize_covariance=1e-06, max_iteration=100, initial_params='kmeans', seed=None):

    gmm = GaussianMixture(n_components=number_of_components, covariance_type=covariance_type, tol=tolerance, \
                          reg_covar=regularize_covariance, max_iter=max_iteration, init_params=initial_params, random_state=seed)
    X_train = table[input_cols]
    gmm.fit(X_train)
    
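    # Summarize the fitted mixture: one row per component with its weight,
    # mean vector, and covariance matrix.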
    out_table = pd.DataFrame()
    comp_num_arr = list(range(number_of_components))
    mean_arr = [gmm.means_[i].tolist() for i in range(number_of_components)]
    covar_arr = [gmm.covariances_[i].tolist() for i in range(number_of_components)]

    out_table['component_number'] = comp_num_arr
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr
    
    rb = BrtcReprBuilder()
    params = {
        'Input Columns': input_cols,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Maximum Number of Iterations': max_iteration,
        'Method to Initialize': initial_params
    }

    rb.addMD(strip_margin("""
    |## Gaussian Mixture Train Result 
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
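
A usage sketch (synthetic data, not from the source): two well-separated 2-D
Gaussian clusters, so the fitted summary should recover means near (0, 0) and
(5, 5):

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
data = pd.DataFrame(np.vstack([rng.normal(0, 1, size=(50, 2)),
                               rng.normal(5, 1, size=(50, 2))]),
                    columns=['x', 'y'])
res = _gaussian_mixture_train(data, input_cols=['x', 'y'],
                              number_of_components=2, seed=42)
res['model']['summary']  # one row per component: weight, mean, covariance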