Example #1
def _oneway_anova(table, response_cols, factor_col):
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([ len(str(group)) for group in groups ])
    
    result = dict()
    result['_grouped_data'] = dict()
    
    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()
        
        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
            
        fig_box = plt2MD(plt)
        plt.clf()
        
        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(response_col=response_col, factor_col=factor_col), table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)
        
        anova_df = pandasDF2MD(anova)
        
        p_value = anova["""PR(>F)"""][0]
        
        residual = model.resid
        
        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()
        
        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()
            
        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        | 
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box, anova_df=anova_df, distplot=distplot, qqplot=qqplot)))
        
        result['_grouped_data'][response_col]['p_value'] = p_value
        
    result['report'] = rb.get()
    return {'result': result}
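A minimal usage sketch for the function above, assuming _oneway_anova and the Brightics report helpers it relies on (ReportBuilder, strip_margin, plt2MD, pandasDF2MD) are in scope; the data frame and column names below are invented for illustration.

import pandas as pd

# toy data: one numeric response observed under three treatment levels
df = pd.DataFrame({
    'yield': [4.2, 4.8, 5.1, 6.0, 5.5, 6.3],
    'treatment': ['a', 'a', 'b', 'b', 'c', 'c'],
})
res = _oneway_anova(df, response_cols=['yield'], factor_col='treatment')
print(res['result']['_grouped_data']['yield']['p_value'])  # ANOVA p-value for the response column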
Example #2
def generate_wordcloud(table,
                       input_col,
                       width=640,
                       height=480,
                       background_color="white",
                       max_font_size=None):
    font_path = './brightics/function/text_analytics/fonts/NanumGothic.ttf'  # todo

    counter = Counter()

    table[input_col].apply(counter.update)

    wordcloud = WordCloud(font_path=font_path,
                          width=width,
                          height=height,
                          background_color=background_color)
    wordcloud.generate_from_frequencies(dict(counter), max_font_size)

    img_bytes = io.BytesIO()
    wordcloud.to_image().save(img_bytes, format='PNG')
    fig_wordcloud = png2MD(img_bytes.getvalue())

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Word Cloud Result
    | {fig}
    """.format(fig=fig_wordcloud)))

    result = dict()
    result['report'] = rb.get()

    return {'result': result}
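A hypothetical call of the function above, assuming it is in scope together with the Brightics helpers it uses (ReportBuilder, strip_margin, png2MD) and that the bundled NanumGothic font file exists at the hard-coded path; each cell of the input column is expected to hold an iterable of tokens.

import pandas as pd

df = pd.DataFrame({'tokens': [['data', 'cloud', 'data'], ['word', 'cloud', 'data']]})
res = generate_wordcloud(df, input_col='tokens', width=400, height=300)
report_md = res['result']['report']  # report markdown containing the embedded PNG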
Example #3
def doctovec_similar_sentence(table, model, text_col, label_col):

    df = table.copy()
    result_sim = {}

    for i in range(10):
        temp = {}
        temp['sentence'] = []
        temp['label'] = []
        for id, vec in model.docvecs.most_similar(i):
            temp['sentence'].append(df.at[id, text_col])
            temp['label'].append(df.at[id, label_col])
        result_sim[i] = pd.DataFrame(temp)

    str_MD = '## Most similar sentences \n'

    for i in range(10):
        str_MD += '|' + df.at[i, text_col] + '\n'
        str_MD += '|' + pandasDF2MD(result_sim[i]) + '\n'
    rb = ReportBuilder()
    rb.addMD(strip_margin(str_MD))

    _model = _model_dict('doc2vec')
    _model['report'] = rb.get()

    return {'model': _model}
Example #4
def wordcloud(table, input_col, font_path='/fonts/NanumGothic.ttf', width=800, height=800, background_color="white"):
    # concatenate every token of the input column into one text blob
    texts = ''
    for tokens in table[input_col]:
        for token in tokens:
            texts += ' ' + token

    wordcloud = WordCloud(font_path=font_path,
                          width=width,
                          height=height,
                          background_color=background_color)
    wordcloud.generate_from_text(texts)

    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation="bilinear")
    plt.axis('off')

    fig_image = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Word Cloud Result
    | {fig}
    """.format(fig=fig_image)))

    model = _model_dict('wordcloud')
    model['plt'] = fig_image
    model['report']=rb.get()

    return {'model': model}
Example #5
def bartletts_test(table, response_cols, factor_col):
    groups = table[factor_col].unique()
    
    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(*[response[table[factor_col] == group] for group in groups])
        data = '{response_col} by {factor_col}'.format(response_col=response_col, factor_col=factor_col)
        data_list.append(data)
        stat_list.append(stat_bart)
        p_list.append(p_bart)
        
    result_table = pd.DataFrame.from_items([ 
        ['data', data_list],
        ['estimate', stat_list],
        ['p_value', p_list] 
    ])
    
    result = dict()
    result['result_table'] = result_table
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## Bartlett's Test Result
    | - H0: k population variances are equal.
    | - H1: at least two variances are different.
    |
    | {result_table}
    """.format(result_table=pandasDF2MD(result_table))))
    
    result['report'] = rb.get()
        
    return {'result': result}
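A small usage sketch, assuming bartletts_test and the report helpers are in scope and that the installed pandas still provides DataFrame.from_items (older releases); the columns are invented for illustration.

import pandas as pd

df = pd.DataFrame({
    'measure': [1.1, 0.9, 1.4, 1.2, 2.0, 2.2, 1.8, 2.4],
    'group': ['x', 'x', 'x', 'x', 'y', 'y', 'y', 'y'],
})
res = bartletts_test(df, response_cols=['measure'], factor_col='group')
print(res['result']['result_table'])  # Bartlett statistic and p-value per response column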
Example #6
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict
    
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])
    
    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    # data = {'cluster_name': ['prediction' + str(i) for i in range(1, num_clusters + 1)]}
    out_table3['num_of_entities'] = list(cluster_count)
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()
    
    return {'out_table2' : out_table2, 'model': model}
Example #7
def kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)
    
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
             max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
             verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    
    k_means.fit(inputarr)
    
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm}
    
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['report'] = rb.get()
    
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
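A usage sketch under the assumption that kmeans_train_predict and the Brightics plotting helpers it calls (_kmeans_centers_plot, _kmeans_samples_plot, _kmeans_pca_plot, dict2MD) are importable; the random toy data is only illustrative.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(30, 2), columns=['x1', 'x2'])
res = kmeans_train_predict(df, input_cols=['x1', 'x2'], n_clusters=3, seed=0)
out = res['out_table']          # original columns plus the 'prediction' cluster label
kmeans = res['model']['model']  # the fitted scikit-learn KMeans estimator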
Example #8
def agglomerative_clustering_train_predict(input_table,
                                           input_cols,
                                           n_clusters=3,
                                           affinity='euclidean',
                                           compute_full_tree=True,
                                           linkage='ward',
                                           prediction_col='prediction',
                                           figw=6.4,
                                           figh=4.8):
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(
        n_clusters=n_clusters,
        affinity=affinity,
        memory=None,
        connectivity=None,
        compute_full_tree=compute_full_tree,
        linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    input_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance,
                                      no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        'report': rb.get()
    }

    return {
        'out_table': input_table,
        'agglomerative_result': agglomerative_clustering_result
    }
Example #9
def _svc_train(table,
               feature_cols,
               label_col,
               c=1.0,
               kernel='rbf',
               degree=3,
               gamma='auto',
               coef0=0.0,
               shrinking=True,
               probability=True,
               tol=1e-3,
               max_iter=-1,
               random_state=None):
    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    _svc = svm.SVC(C=c,
                   kernel=kernel,
                   degree=degree,
                   gamma=gamma,
                   coef0=coef0,
                   shrinking=shrinking,
                   probability=probability,
                   tol=tol,
                   max_iter=max_iter,
                   random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['report'] = rb.get()

    return {'model': _model}
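A hedged usage sketch; it assumes _svc_train and the report helpers are in scope and simply feeds a small random two-class table (names are made up).

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'f1': rng.rand(20), 'f2': rng.rand(20),
                   'label': ['a'] * 10 + ['b'] * 10})
res = _svc_train(df, feature_cols=['f1', 'f2'], label_col='label')
clf = res['model']['svc_model']   # fitted sklearn SVC with probability estimates enabled
print(clf.predict([[0.5, 0.5]]))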
Example #10
def _evaluate_regression(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    evs = explained_variance_score(label, predict)
    mae = mean_absolute_error(label, predict)
    mse = mean_squared_error(label, predict)
    mdae = median_absolute_error(label, predict)
    r2 = r2_score(label, predict)

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['r2_score'] = r2
    summary['mean_squared_error'] = mse
    summary['mean_absolute_error'] = mae
    summary['median_absolute_error'] = mdae
    summary['explained_variance_score'] = evs

    # report
    all_dict_list = [{
        'r2_score': r2,
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'median_absolute_error': mdae,
        'explained_variance_score': evs
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[[
        'r2_score', 'mean_squared_error', 'mean_absolute_error',
        'median_absolute_error', 'explained_variance_score'
    ]]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Regression Result
    | ### Metrics
    | {table1}
    |
    |
    """.format(table1=pandasDF2MD(all_df))))
    summary['report'] = rb.get()

    return {'result': summary}
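A minimal sketch of calling the evaluator above on a table that already contains both the label and a prediction column; column names are illustrative and the report helpers are assumed importable.

import pandas as pd

df = pd.DataFrame({'y': [1.0, 2.0, 3.0, 4.0], 'y_pred': [1.1, 1.9, 3.2, 3.8]})
res = _evaluate_regression(df, label_col='y', prediction_col='y_pred')
print(res['result']['r2_score'], res['result']['mean_squared_error'])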
Example #11
def _plot_roc_pr_curve(table,
                       label_col,
                       probability_col,
                       fig_size=[6.4, 4.8],
                       pos_label=None):
    label = table[label_col]
    probability = table[probability_col]

    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = _plot_binary(
        label,
        probability,
        fig_size=(fig_size[0], fig_size[1]),
        pos_label=pos_label)

    summary = dict()
    summary['threshold'] = threshold
    summary['label_col'] = label_col
    summary['probability_col'] = probability_col

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Plot ROC Curve and PR Curve Result
    |
    | ### ROC Curve
    | {fig_tpr_fpr}
    | {fig_roc}
    |
    | ### PR Curve
    | {fig_precision_recall}
    | {fig_pr}
    |
    | ### Confusion Matrix
    | {fig_confusion}
    """.format(fig_roc=fig_roc,
               fig_tpr_fpr=fig_tpr_fpr,
               fig_pr=fig_pr,
               fig_precision_recall=fig_precision_recall,
               fig_confusion=fig_confusion)))
    summary['report'] = rb.get()

    return {'result': summary}
Example #12
def _outlier_detection_lof(table, input_cols, choice='add_prediction', n_neighbors=20, new_column_name='is_outlier'):  # algorithm='auto', leaf_size=30,
                          # metric='minkowski', p=2, contamination=0.1, 
    out_table = table.copy()
    lof_model = LocalOutlierFactor(n_neighbors, algorithm='auto', leaf_size=30, metric='minkowski', p=2, contamination=0.1)
    # fit once and keep the raw predictions (1 = inlier, -1 = outlier)
    lof_predictions = lof_model.fit_predict(out_table[input_cols])
    
    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [isinlier(lof_predict) for lof_predict in lof_predictions]
    
    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif choice == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    
    params = {
        'Input Columns': input_cols,
        'Result Type': choice,
        'Number of Neighbors': n_neighbors,
    #    'Algorithm': algorithm,
    #    'Metric': metric,
    #    'Contamination': contamination
    }
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))
    
    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['report'] = rb.get()
    
    return {'out_table':out_table, 'model':model}
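A usage sketch, assuming the function above and the report helpers are in scope; n_neighbors must stay below the number of rows, so the toy table has 51 rows with one obvious outlier appended.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x': np.append(rng.normal(0.0, 1.0, 50), 8.0)})
res = _outlier_detection_lof(df, input_cols=['x'], choice='remove_outliers', n_neighbors=10)
cleaned = res['out_table']  # rows flagged as 'out' are dropped and the marker column removed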
Example #13
def tfidf_train(table,
                tokens_col,
                tf_weighing='n',
                df_weighing='t',
                document_normalization='c'):

    out_table = table.copy()
    _corpus = out_table[tokens_col]
    _smartirs = tf_weighing + df_weighing + document_normalization

    _dictionary = Dictionary(_corpus)
    _corpus = [_dictionary.doc2bow(text) for text in _corpus]

    _model = TfidfModel(_corpus, smartirs=_smartirs)
    _corpus = [text for text in _model[_corpus]]

    _sparse_matrix = corpus2csc(_corpus, num_terms=len(_dictionary.token2id)).T

    _values = [value for value in _dictionary.values()]
    _keys = [key for key in _dictionary.keys()]
    _dic = pd.DataFrame({'indice': _keys, 'word': _values})
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(_dic))))

    out_table['sparse_vectors'] = sparse_encode(
        _sparse_matrix)['sparse_vectors']

    fit_model = dict()
    fit_model['dictionary'] = _dictionary
    fit_model['model'] = _model
    fit_model['report'] = rb.get()
    return {'out_table': out_table, 'fit_model': fit_model}
Example #14
def two_sample_ttest_for_stacked_data(table,
                                      response_cols,
                                      factor_col,
                                      alternatives,
                                      first,
                                      second,
                                      hypo_diff=0,
                                      equal_vari='pooled',
                                      confi_level=0.95):

    if (type(table[factor_col][0]) == str):
        table_first = table[table[factor_col] == first]
        table_second = table[table[factor_col] == second]
    elif (type(table[factor_col][0]) == bool):
        table_first = table[table[factor_col] == bool(first)]
        table_second = table[table[factor_col] == bool(second)]
    else:
        table_first = table[table[factor_col] == float(first)]
        table_second = table[table[factor_col] == float(second)]

    tmp_table = []

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        if (equal_vari == 'auto'):
            start_auto = 1
            f_value = (std1**2) / (std2**2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1,
                                             number2 - 1)
            if (f_test_p_value_tmp > 0.5):
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            if (f_test_p_value < 0.05):
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        ttestresult = ttest_ind(table_first[response_col],
                                table_second[response_col],
                                'larger',
                                usevar=equal_vari,
                                value=hypo_diff)

        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'larger',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means > 0.0'] +
                          [ttestresult[1]] +
                          [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means > 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'smaller',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means < 0.0'] +
                          [ttestresult[1]] +
                          [(-math.inf, mean1 - mean2 + margin)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means < 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [-math.inf] + [mean1 - mean2 + margin]]

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'two-sided',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means != 0.0'] +
                          [ttestresult[1]] +
                          [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means != 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternatives', 'p values',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - Estimates= {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2],
                   response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   ttestresult0=ttestresult[0],
                   result_model=pandasDF2MD(result_model))))
        if (start_auto == 1):
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['report'] = rb.get()
    return {'out_table': result, 'model': model}
Example #15
def one_sample_ttest(table,
                     input_cols,
                     alternatives,
                     hypothesized_mean=0,
                     conf_level=0.95):

    n = len(table)
    degree = n - 1
    alpha = 1.0 - conf_level
    out_table = pd.DataFrame()

    # statistics
    statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % degree

    # Print model
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h} 
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:
        # model
        alter_list = []
        p_list = []
        CI_list = []

        # data
        data = input_col

        # estimates
        result = stats.ttest_1samp(table[input_col], hypothesized_mean)
        estimates = result[0]

        cols = [
            'data', 'alternative_hypothesis', 'statistics', 'estimates',
            'p_value', 'confidence_level', 'lower_confidence_interval',
            'upper_confidence_interval'
        ]

        for i in alternatives:
            if (i == 'Greater'):
                # alternative hypothesis
                alternative_hypothesis = "true mean >" + str(hypothesized_mean)
                # p-values
                p_value = 1.0 - t.cdf(estimates, degree)
                # confidence interval - greater
                critical_val = t.ppf(1.0 - alpha, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = np.mean(table[input_col]) - width
                upper_conf_interval = math.inf

                # model
                alter = 'true mean > {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                list = []
                list.append([
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ])
                out_table = out_table.append(pd.DataFrame(list, columns=cols))

            if (i == 'Less'):
                # alternative hypothesis
                alternative_hypothesis = "true mean <" + str(hypothesized_mean)
                p_value = t.cdf(estimates, degree)
                # confidence interval - less
                critical_val = t.ppf(1.0 - alpha, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = -math.inf
                upper_conf_interval = np.mean(table[input_col]) + width

                # model
                alter = 'true mean < {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                list = []
                list.append([
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ])
                out_table = out_table.append(pd.DataFrame(list, columns=cols))

            if (i == 'Two Sided'):
                # alternative hypothesis
                alternative_hypothesis = "true mean !=" + str(
                    hypothesized_mean)
                # p_value = (1.0 - t.cdf(abs(estimates), degree)) * 2.0
                if (estimates >= 0):
                    p_value = 2.0 * t.cdf(-estimates, degree)
                else:
                    p_value = 2.0 * t.cdf(estimates, degree)
                # confidence interval - two-sided
                critical_val = t.ppf(1.0 - alpha / 2, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = np.mean(table[input_col]) - width
                upper_conf_interval = np.mean(table[input_col]) + width

                # model
                alter = 'true mean != {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                list = []
                list.append([
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ])
                out_table = out_table.append(pd.DataFrame(list, columns=cols))

        # Print model
        conf_level_percent = conf_level * 100
        result_table = pd.DataFrame.from_items(
            [['alternative hypothesis', alter_list], ['p-value', p_list],
             ['%g%% confidence Interval' % conf_level_percent, CI_list]])

        result = dict()
        result['result_table'] = result_table
        rb.addMD(
            strip_margin("""
        ### Data = {input_col}
        | - Estimates = {estimates} 
        |
        | {result_table}
        """.format(input_col=input_col,
                   estimates=estimates,
                   result_table=pandasDF2MD(result_table))))

    # print model
    result['report'] = rb.get()

    return {'out_table': out_table, 'model': result}
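A sketch of one call, assuming one_sample_ttest and the report helpers are available and an older pandas that still supports DataFrame.from_items and DataFrame.append; the alternatives list accepts the strings 'Greater', 'Less' and 'Two Sided'.

import pandas as pd

df = pd.DataFrame({'weight': [9.8, 10.2, 10.1, 9.9, 10.4, 9.7]})
res = one_sample_ttest(df, input_cols=['weight'], alternatives=['Two Sided'],
                       hypothesized_mean=10, conf_level=0.95)
print(res['out_table'][['alternative_hypothesis', 'p_value']])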
Example #16
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objective='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=0,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None):

    regressor = XGBRegressor(max_depth, learning_rate, n_estimators, silent,
                             objective, booster, n_jobs, nthread, gamma,
                             min_child_weight, max_delta_step, subsample,
                             colsample_bytree, colsample_bylevel, reg_alpha,
                             reg_lambda, scale_pos_weight, base_score,
                             random_state, seed, missing)
    regressor.fit(table[feature_cols], table[label_col], sample_weight,
                  eval_set, eval_metric, early_stopping_rounds, verbose,
                  xgb_model, sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    #     plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(regressor)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(regressor, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    #     out_model['plot_tree_UT'] = fig_plot_tree_UT
    #     out_model['plot_tree_LR'] = fig_plot_tree_LR
    #         out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        temp = [key, value]
        get_param_list.append(temp)
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
Example #17
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    size = len(vars)

    s_default = plt.rcParams['lines.markersize']**2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []

    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])

            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)

        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [
            .5,
            .5,
        ],
                    xycoords="axes fraction",
                    ha='center',
                    va='center',
                    fontsize=font_size * height)
        ax.annotate(p_stars,
                    xy=(0.65, 0.6),
                    xycoords=ax.transAxes,
                    color='red',
                    fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin(""" ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['report'] = rb.get()

    return {'result': res}
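A hedged usage sketch for the pairwise-correlation report above; the variable names are illustrative and the Brightics helpers (ReportBuilder, plt2MD, pandasDF2MD) are assumed importable.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'a': rng.rand(50), 'c': rng.rand(50)})
df['b'] = 2.0 * df['a'] + rng.normal(0.0, 0.1, 50)
res = _correlation(df, vars=['a', 'b', 'c'], method='pearson')
print(res['result']['corr_table'])  # pairwise r and p-value for every variable pair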
Example #18
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    silhouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silhouette_samples_list.append(samples)

        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhouette metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['report'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
Example #19
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LinearRegression(fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    if fit_intercept == True:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    
    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    summary2 = summary_tables[2]
    
    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    # least-squares fit of the reference line y = aa + bb * x over all points
    x = predict
    y = np.array(label)
    a = x.size
    b = np.sum(x)
    c = b
    d = np.sum(x * x)
    e = np.sum(y)
    f = np.sum(x * y)
    det = a * d - b * c
    aa = (d * e - b * f) / det
    bb = (a * f - c * e) / det
    p1x = np.min(x)
    p1y = aa + bb * p1x
    p2x = np.max(x)
    p2y = aa + bb * p2x
    plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3
               )))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()
    
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    
    return {'model' : model}
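A small usage sketch, assuming _linear_regression_train plus the Brightics helpers it calls (ReportBuilder, plt2MD, simple_tables2df_list) are in scope; the data and names are invented.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.rand(30), 'x2': rng.rand(30)})
df['y'] = 3.0 * df['x1'] - 2.0 * df['x2'] + rng.normal(0.0, 0.1, 30)
res = _linear_regression_train(df, feature_cols=['x1', 'x2'], label_col='y')
print(res['model']['coefficients'])  # statsmodels OLS coefficients (const, x1, x2)
print(res['model']['r2'])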
Example #20
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result, columns=[column_names])

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components, columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | 
    | ### Plot
    | The x-axis and y-axis of the following plot are projected_0 and projected_1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    | 
    | ### Mean
    | {array1}
    | 
    | ### Explained Variance 
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
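A minimal call sketch for the PCA wrapper above; it assumes the function and the report helpers are importable and uses random illustrative data.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(40, 3), columns=['a', 'b', 'c'])
res = _pca(df, input_cols=['a', 'b', 'c'], n_components=2)
projected = res['out_table']   # original columns plus projected_0 and projected_1
print(res['model']['explained_variance_ratio'])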
Example #21
def _evaluate_classification(table, label_col, prediction_col):

    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{
        'f1': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['report'] = rb.get()

    return {'result': summary}
Example #22
def _outlier_detection_tukey_carling(table, input_cols, outlier_method="tukey", multiplier=None, number_of_removal=1,
                                    choice='add_prediction', new_column_prefix='is_outlier_'):
    out_table = table.copy()

    if multiplier is None and outlier_method == "tukey":
        multiplier = 1.5
    elif multiplier is None and outlier_method == "carling":
        multiplier = 2.3
    
    mean = table.mean()
    q1s = table.quantile(0.25)
    q3s = table.quantile(0.75)
    iqrs = q3s - q1s
    
    new_column_names = ['{prefix}{col}'.format(prefix=new_column_prefix, col=col) for col in input_cols]

    def _tukey(x, q1, q3, iqr, multiplier):
        return 'out' if x < q1 - multiplier * iqr or x > q3 + multiplier * iqr else 'in' 

    def _carling(x, mean, iqr, multiplier):
        return 'out' if x < mean - multiplier * iqr or x > mean + multiplier * iqr else 'in'
    
    if outlier_method == "tukey":
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(lambda _: _tukey(_, q1s[col], q3s[col], iqrs[col], multiplier))
            
    elif outlier_method == "carling":
        if multiplier is None:
            multiplier = 2.3
            
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(lambda _: _carling(_, mean[col], iqrs[col], multiplier))
        
    prediction = out_table[new_column_names].apply(lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
    
    rb = ReportBuilder()
    params = { 
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': choice,
        'New Column Prefix': new_column_prefix
    }
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))
    
    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table.drop(new_column_names, axis=1)
        out_table = out_table[prediction.values]
    elif choice == 'both':
        out_table = out_table[prediction.values]
    
    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['mean'] = mean
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['multiplier'] = multiplier
    model['report'] = rb.get()
    
    return {'out_table': out_table, 'model' : model}
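A usage sketch for the Tukey/Carling detector above, with a single column containing one clear outlier; names are illustrative and the report helpers are assumed importable.

import pandas as pd

df = pd.DataFrame({'x': [1.0, 1.2, 0.9, 1.1, 1.3, 9.0]})
res = _outlier_detection_tukey_carling(df, input_cols=['x'], outlier_method='tukey',
                                        choice='both')
print(res['out_table'])            # outlier rows removed, is_outlier_x column kept
print(res['model']['multiplier'])  # 1.5 for the tukey method when not given explicitly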
Example #23
def _chi_square_test_of_independence(table,
                                     response_cols,
                                     factor_col,
                                     correction=False):
    label_list = []
    feature_list = []
    alternative_hypothesis_list = []
    dof_list = []
    stat_chi_list = []
    p_chi_list = []
    for response_col in response_cols:
        response = table[response_col]
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col],
                                        margins=True)
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:response_index, 0:factor_index]
        f_object = np.array(temporary)
        test = stats.chi2_contingency(f_object, correction, 1)[0:3]
        label = '{factor_col}'.format(factor_col=factor_col)
        feature = '{response_col}'.format(response_col=response_col)
        if test[1] < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif test[1] >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(test[1]):
            dependence = 'Independence of two categorical variables cannot be decided.'
        conclusion = '{dependence}'.format(dependence=dependence)
        alternative_hypothesis = 'Two categorical variables are dependent.'
        dof = 'chi-square distribution with {dof} degrees of freedom'.format(
            dof=test[2])
        stat_chi = '{stat_chi}'.format(stat_chi=test[0])
        p_chi = '{p_chi}'.format(p_chi=test[1])
        label_list.append(label)
        feature_list.append(feature)
        alternative_hypothesis_list.append(alternative_hypothesis)
        dof_list.append(dof)
        stat_chi_list.append(stat_chi)
        p_chi_list.append(p_chi)

    # pd.DataFrame.from_items was removed from pandas; build the frame from an
    # ordered dict of columns instead.
    result_table = pd.DataFrame(
        {'label': label_list, 'feature': feature_list,
         'alternative_hypothesis': alternative_hypothesis_list,
         'df': dof_list, 'estimate': stat_chi_list,
         'p_value': p_chi_list})

    result = dict()
    result['result_table'] = result_table

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Chi-square Test of Independence Result
    |  - H0: the two categorical variables are independent.
    |  - H1: the two categorical variables are dependent.
    """))
    for response_col in response_cols:
        response = table[response_col]
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col],
                                        margins=True)
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:response_index, 0:factor_index]
        f_object = np.array(temporary)
        test = stats.chi2_contingency(f_object, correction, 1)[0:3]
        label = '{factor_col}'.format(factor_col=factor_col)
        feature = '{response_col}'.format(response_col=response_col)
        if test[1] < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif test[1] >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(test[1]):
            dependence = 'Independence of two categorical variables cannot be decided.'
        dof_simplelist = []
        stat_chi_simplelist = []
        p_chi_simplelist = []
        dof = '{dof}'.format(dof=test[2])
        stat_chi = '{stat_chi}'.format(stat_chi=test[0])
        p_chi = '{p_chi}'.format(p_chi=test[1])
        stat_chi_simplelist.append(stat_chi)
        dof_simplelist.append(dof)
        p_chi_simplelist.append(p_chi)
        result_table_simple = pd.DataFrame(
            {'estimate': stat_chi_simplelist, 'df': dof_simplelist,
             'p_value': p_chi_simplelist})

        # test statistic = {test_statistic}, df = {dof}, p_value = {p_value}
        # test_statistic = stats.chi2_contingency(f_object,correction,lambda_)[0], dof=stats.chi2_contingency(f_object,correction,lambda_)[2], p_value=stats.chi2_contingency(f_object,correction,lambda_)[1]
        rb.addMD(
            strip_margin("""
        |### Label: {label}, Feature: {feature}
        |  
        |{result_table_simple}
        |
        |{dependence}
        |
        |
        """.format(label=factor_col,
                   feature=response_col,
                   result_table_simple=pandasDF2MD(result_table_simple),
                   dependence=dependence)))

    model = _model_dict('Chi-square test of independence')

    model['report'] = rb.get()

    result_table = result_table.copy()

    return {'model': model}
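
# Usage sketch (not in the original source): a small categorical frame with
# invented column names; assumes pandas and the brightics helpers imported at
# the top of the module are available.
toy = pd.DataFrame({
    'smoker':  ['yes', 'no', 'yes', 'no', 'yes', 'no', 'no', 'yes'],
    'disease': ['pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'pos']})
chi_res = _chi_square_test_of_independence(toy, response_cols=['disease'],
                                           factor_col='smoker')
chi_res['model']['report']  # markdown report with the statistic and p-value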
Exemplo n.º 24
0
def paired_ttest(table,
                 first_column,
                 second_column,
                 alternative,
                 hypothesized_difference=0,
                 confidence_level=0.95):
    df = len(table) - 1
    abs_diff = abs(table[first_column] - table[second_column])
    diff_mean = abs_diff.mean()
    std_dev = np.sqrt(((diff_mean - abs_diff) ** 2).mean())
    ans = stats.ttest_rel(table[first_column],
                          table[second_column] + hypothesized_difference)
    t_value = ans[0]
    p_value_ul = ans[1]
    # use the paired-sample degrees of freedom rather than a hard-coded 149
    p_value_u = stats.t.sf(t_value, df)
    p_value_l = stats.t.cdf(t_value, df)

    left_u = diff_mean - std_dev * stats.t.isf(
        (1 - confidence_level), df) / np.sqrt(df)
    right_u = np.inf
    left_l = -np.inf
    right_l = diff_mean + std_dev * stats.t.isf(
        (1 - confidence_level), df) / np.sqrt(df)
    left_ul = diff_mean - std_dev * stats.t.isf(
        (1 - confidence_level) / 2, df) / np.sqrt(df)
    right_ul = diff_mean + std_dev * stats.t.isf(
        (1 - confidence_level) / 2, df) / np.sqrt(df)

    result_value_u = [{
        'data':
        first_column + " , " + second_column,
        'alternative_hypothesis':
        "true difference in means > " + str(hypothesized_difference),
        'statistics':
        "t statistics, t distribution with " + str(df) +
        " degrees of freedom under the null hypothesis",
        'estimates':
        t_value,
        'p_value':
        p_value_u,
        'confidence_level':
        confidence_level,
        'low_confidence_interval':
        left_u,
        'upper_confidence_interval':
        right_u
    }]
    result_value_l = [{
        'data':
        first_column + " , " + second_column,
        'alternative_hypothesis':
        "true difference in means < " + str(hypothesized_difference),
        'statistics':
        "t statistics, t distribution with " + str(df) +
        " degrees of freedom under the null hypothesis",
        'estimates':
        t_value,
        'p_value':
        p_value_l,
        'confidence_level':
        confidence_level,
        'low_confidence_interval':
        left_l,
        'upper_confidence_interval':
        right_l
    }]
    result_value_ul = [{
        'data':
        first_column + " , " + second_column,
        'alternative_hypothesis':
        "true difference in means != " + str(hypothesized_difference),
        'statistics':
        "t statistics, t distribution with " + str(df) +
        " degrees of freedom under the null hypothesis",
        'estimates':
        t_value,
        'p_value':
        p_value_ul,
        'confidence_level':
        confidence_level,
        'low_confidence_interval':
        left_ul,
        'upper_confidence_interval':
        right_ul
    }]

    df_result = pd.DataFrame()
    df_u = pd.DataFrame(result_value_u,
                        columns=[
                            'data', 'alternative_hypothesis', 'statistics',
                            'estimates', 'p_value', 'confidence_level',
                            'low_confidence_interval',
                            'upper_confidence_interval'
                        ])
    df_l = pd.DataFrame(result_value_l,
                        columns=[
                            'data', 'alternative_hypothesis', 'statistics',
                            'estimates', 'p_value', 'confidence_level',
                            'low_confidence_interval',
                            'upper_confidence_interval'
                        ])
    df_ul = pd.DataFrame(result_value_ul,
                         columns=[
                             'data', 'alternative_hypothesis', 'statistics',
                             'estimates', 'p_value', 'confidence_level',
                             'low_confidence_interval',
                             'upper_confidence_interval'
                         ])

    # DataFrame.append has been removed from pandas; use pd.concat instead.
    if 'greater' in alternative:
        df_result = pd.concat([df_result, df_u], ignore_index=True)
    if 'less' in alternative:
        df_result = pd.concat([df_result, df_l], ignore_index=True)
    if 'twosided' in alternative:
        df_result = pd.concat([df_result, df_ul], ignore_index=True)

    params = {
        'Input columns': first_column + ", " + second_column,
        'Hypothesized difference': str(hypothesized_difference),
        'Confidence level': str(confidence_level)
    }

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Paired T Test Result
    |
    |df|mean_difference|standard_deviation|t_value
    |--|--|--|--
    |{deg_f}|{dm}|{sd}|{tv}
    |
    | ### Parameters
    | {params}
    """.format(deg_f=df,
               dm=diff_mean,
               sd=std_dev,
               tv=t_value,
               params=dict2MD(params))))

    if 'greater' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is less than or equal to {hd}.
        | - H1 : true difference in means is larger than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvu}|{con_lv}|({l_u}, {r_u})
        |
        """.format(pvu=p_value_u,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_u=left_u,
                   r_u=right_u)))

    if 'less' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is larger than or equal to {hd}.
        | - H1 : true difference in means is less than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvl}|{con_lv}|({l_l}, {r_l})
        |
        """.format(pvl=p_value_l,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_l=left_l,
                   r_l=right_l)))

    if 'twosided' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is equal to {hd}.
        | - H1 : true difference in means is not equal to {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvul}|{con_lv}|({l_ul}, {r_ul})
        |
        """.format(pvul=p_value_ul,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_ul=left_ul,
                   r_ul=right_ul)))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': df_result, 'model': model}
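
# Usage sketch (illustrative, not from the source): paired before/after
# measurements on the same subjects; column names and values are invented.
paired = pd.DataFrame({'before': [12.1, 11.8, 13.0, 12.6, 12.9],
                       'after':  [11.4, 11.9, 12.2, 12.0, 12.3]})
ttest_out = paired_ttest(paired, 'before', 'after',
                         alternative=['twosided', 'greater'],
                         hypothesized_difference=0,
                         confidence_level=0.95)
ttest_out['out_table']  # one row per requested alternative hypothesis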
Exemplo n.º 25
0
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    features = table[feature_cols]
    label = table[label_col]
    # pass estimator options as keyword arguments (scikit-learn treats these
    # parameters as keyword-only in recent releases)
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    featureNames = np.append("Intercept", feature_cols)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})

        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)

        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    prob = lr_model.predict_proba(features)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = dict()
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['report'] = rb.get()

    return {'model': model}
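
# Usage sketch (not part of the original example): two numeric features and a
# binary string label; names and values are invented.
lr_toy = pd.DataFrame({'x1': [0.2, 1.5, 3.1, 4.0, 0.8, 3.6],
                       'x2': [1.0, 0.7, 2.2, 2.9, 0.4, 3.3],
                       'y':  ['neg', 'neg', 'pos', 'pos', 'neg', 'pos']})
lr_out = _logistic_regression_train(lr_toy, feature_cols=['x1', 'x2'],
                                    label_col='y')
lr_out['model']['coefficients']  # a single coefficient row in the binary case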
Exemplo n.º 26
0
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]

    if label_col in feature_cols:
        raise Exception("%s is duplicated." % label_col)

    if family == "Gaussian": 
        sm_family = sm.families.Gaussian()
    elif family == "inv_Gaussian":
        sm_family = sm.families.InverseGaussian()
    elif family == "binomial":
        sm_family = sm.families.Binomial()
    elif family == "Poisson":
        sm_family = sm.families.Poisson()
    elif family == "neg_binomial":
        sm_family = sm.families.NegativeBinomial()
    elif family == "gamma":
        sm_family = sm.families.Gamma()
    elif family == "Tweedie":
        sm_family = sm.families.Tweedie()

    if link == "ident":
        sm_link = sm.families.links.identity
    elif link == "log":
        sm_link = sm.families.links.log
    elif link == "logit":
        sm_link = sm.families.links.logit
    elif link == "probit":
        sm_link = sm.families.links.probit
    elif link == "cloglog":
        sm_link = sm.families.links.cLogLog
    elif link == "pow":
        sm_link = sm.families.links.Power
    elif link == "nbinom":
        sm_link = sm.families.links.binom

    if fit_intercept == True:
        glm_model = sm.GLM(label, sm.add_constant(features), family=sm_family, link=sm_link).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family, link=sm_link).fit()
    summary = glm_model.summary().as_html()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['report'] = rb.get()

    return {'model' : model}
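
# Usage sketch (illustrative): a Gaussian family with the identity link is the
# ordinary least-squares model; column names and values are invented.
glm_toy = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0, 5.0],
                        'x2': [2.1, 3.9, 6.2, 7.8, 10.1],
                        'y':  [3.0, 6.1, 9.2, 11.9, 15.2]})
glm_out = _glm_train(glm_toy, feature_cols=['x1', 'x2'], label_col='y',
                     family='Gaussian', link='ident')
glm_out['model']['coefficients'], glm_out['model']['aic']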
Exemplo n.º 27
0
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None,
         hue=None,
         alpha=0,
         key_col=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # fit with every component (keyword arguments keep this valid for current
    # scikit-learn); the projection is sliced to n_components below
    pca = PCA(n_components=None, copy=copy, whiten=whiten,
              svd_solver=svd_solver, tol=tol, iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(x=column_names[0], y=column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
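
# Usage sketch (not from the source): project three correlated numeric columns
# onto two principal components; 'projected_0' / 'projected_1' follow the
# default new_column_name prefix. Values are invented.
pca_toy = pd.DataFrame({'x1': [2.5, 0.5, 2.2, 1.9, 3.1, 2.3],
                        'x2': [2.4, 0.7, 2.9, 2.2, 3.0, 2.7],
                        'x3': [1.2, 0.3, 1.4, 1.0, 1.6, 1.1]})
pca_out = _pca(pca_toy, input_cols=['x1', 'x2', 'x3'], n_components=2)
pca_out['out_table'][['projected_0', 'projected_1']]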
Exemplo n.º 28
0
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    classifier = DecisionTreeClassifier(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split, class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from io import StringIO  # sklearn.externals.six has been removed from scikit-learn
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.report import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a decision tree graph. Please download it from http://graphviz.org/download/ and install it on your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report

    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
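
# Usage sketch (illustrative): a shallow tree on a small labelled frame; when
# pydotplus / Graphviz are missing, the report falls back to the text hint
# handled in the try/except above. Names and values are invented.
dt_toy = pd.DataFrame({'x1': [0.2, 1.5, 3.1, 4.0, 0.8, 3.6],
                       'x2': [1.0, 0.7, 2.2, 2.9, 0.4, 3.3],
                       'y':  ['neg', 'neg', 'pos', 'pos', 'neg', 'pos']})
dt_out = _decision_tree_classification_train(dt_toy, feature_cols=['x1', 'x2'],
                                              label_col='y', max_depth=3)
dt_out['model']['feature_importance']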
Exemplo n.º 29
0
def paired_ttest(table, first_column, second_column, alternative, hypothesized_difference=0, confidence_level=0.95):
    df = len(table) - 1
    diff_mean = (table[first_column] - table[second_column]).mean()
    std_dev = np.std(table[first_column] - table[second_column])
    t_value, p_value_ul = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)
    p_value_u = stats.t.sf(t_value, df)
    p_value_l = stats.t.cdf(t_value, df)

    left_u = diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    right_l = diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    left_ul = diff_mean - std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
    right_ul = diff_mean + std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)

    result_value_u = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means > " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_u,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' : left_u,
                 'upper_confidence_interval' : np.inf}]
    result_value_l = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means < " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_l,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' : -np.inf,
                 'upper_confidence_interval' : right_l}]
    result_value_ul = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means != " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_ul,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' : left_ul,
                 'upper_confidence_interval' : right_ul}]

    df_result = pd.DataFrame()
    df_u = pd.DataFrame(result_value_u, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])
    df_l = pd.DataFrame(result_value_l, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])
    df_ul = pd.DataFrame(result_value_ul, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])

    if 'greater' in alternative:
        df_result = pd.concat([df_result, df_u], ignore_index=True)
    if 'less' in alternative:
        df_result = pd.concat([df_result, df_l], ignore_index=True)
    if 'twosided' in alternative:
        df_result = pd.concat([df_result, df_ul], ignore_index=True)

    result_table_ul = pd.DataFrame([{'Alternative': 'Two Sided', 'H1': 'true difference in means != ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_ul, str(confidence_level * 100) + '% confidence interval': '(' + str(left_ul) + ', ' + str(right_ul) + ')'}])
    result_table_u = pd.DataFrame([{'Alternative': 'Greater', 'H1': 'true difference in means > ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_u, str(confidence_level * 100) + '% confidence interval': '(' + str(left_u) + ', ' + str(np.inf) + ')'}])
    result_table_l = pd.DataFrame([{'Alternative': 'Less', 'H1': 'true difference in means < ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_l, str(confidence_level * 100) + '% confidence interval': '(' + str(-np.inf) + ', ' + str(right_l) + ')'}])
    result_table = pd.DataFrame()

    if 'greater' in alternative:
        result_table = pd.concat([result_table, result_table_u], ignore_index=True)
    if 'less' in alternative:
        result_table = pd.concat([result_table, result_table_l], ignore_index=True)
    if 'twosided' in alternative:
        result_table = pd.concat([result_table, result_table_ul], ignore_index=True)

    ordered_result_table = pd.DataFrame(result_table, columns=['Alternative', 'H1', 't_value', 'p_value', str(confidence_level * 100) + '% confidence interval'])

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation : {sd}
    |
    |{result_table}
    |
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, result_table=pandasDF2MD(ordered_result_table))))

    model = dict()
    model['report'] = rb.get()

    return{'out_table':df_result, 'model':model}
Exemplo n.º 30
0
def naive_bayes_train(table,
                      feature_cols,
                      label_col,
                      alpha=1.0,
                      fit_prior=True,
                      class_prior=None):

    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]
                                                     ])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    # get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
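
# Usage sketch (not from the source): MultinomialNB expects non-negative,
# count-like features; class_prior entries use the "label:prior" string form
# parsed above. All names and numbers are invented.
nb_toy = pd.DataFrame({'w1': [3, 0, 1, 4, 0, 2],
                       'w2': [0, 2, 3, 0, 4, 1],
                       'topic': ['sports', 'politics', 'politics',
                                 'sports', 'politics', 'sports']})
nb_out = naive_bayes_train(nb_toy, feature_cols=['w1', 'w2'],
                           label_col='topic', alpha=0.5,
                           class_prior=['sports:0.4', 'politics:0.6'])
nb_out['model']['nb_model']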