示例#1
0
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """Build an autocorrelation / partial autocorrelation report for one column.

    Plots the ACF and PACF of ``table[input_col]``, tabulates both for lags
    ``0..nlags`` and renders everything into a markdown report.

    Args:
        table: Table indexed by column name; ``table[input_col]`` is analysed.
        input_col: Name of the column holding the series.
        nlags: Largest lag that is plotted and tabulated.
        conf_level: Confidence level of the intervals (e.g. ``0.95``).
            ``None`` disables the intervals in the plots and in the tables.

    Returns:
        ``{'model': model}`` where ``model`` carries the two result tables
        (``autocorrelation_table`` and ``partial_autocorrelation_table``) and
        the rendered report under ``_repr_brtc_``.
    """
    data = table[input_col]

    # The tables below are guarded by ``conf_level is not None``, so None is a
    # supported value; computing ``1 - conf_level`` unconditionally used to
    # raise TypeError before those guards were ever reached.
    alpha = None if conf_level is None else 1 - conf_level

    plt.figure()
    plot_acf(data, lags=nlags, alpha=alpha)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=alpha)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    acf_ret = acf(data, nlags=nlags, alpha=alpha)
    pacf_ret = pacf(data, nlags=nlags, alpha=alpha)
    if alpha is None:
        # NOTE(review): assumes acf/pacf return only the coefficient array
        # when alpha is None (statsmodels behaviour) - confirm for the
        # installed version.
        acf_values, acf_confint = acf_ret, None
        pacf_values, pacf_confint = pacf_ret, None
    else:
        acf_values, acf_confint = acf_ret[0], acf_ret[1]
        pacf_values, pacf_confint = pacf_ret[0], pacf_ret[1]

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_values

    if conf_level is not None:
        # Intervals are stored as the string form of a (lower, upper) tuple.
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = [str((acf_confint[i][0], acf_confint[i][1])) for i in range(nlags + 1)]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_values

    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = [str((pacf_confint[i][0], pacf_confint[i][1])) for i in range(nlags + 1)]

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf, result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1), image2=fig_plt_pacf, result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
示例#2
0
def _unit_root_test(table,
                    input_col,
                    maxlag=None,
                    regression='c',
                    autolag='AIC'):
    """Run ``adfuller`` on one column and package the outcome with a report.

    Args:
        table: Table indexed by column name.
        input_col: Column whose values are tested.
        maxlag: Forwarded to ``adfuller`` as its second positional argument.
        regression: Forwarded to ``adfuller`` as its third positional argument.
        autolag: Forwarded to ``adfuller`` as its fourth positional argument;
            the literal string ``'None'`` is translated to ``None`` first.

    Returns:
        ``{'model': model}`` with keys ``adf``, ``p_value``, ``usedlag``,
        ``nobs``, ``critical_values``, ``icbest`` (only when ``autolag`` is
        not None) and ``_repr_brtc_``.
    """
    autolag = None if autolag == 'None' else autolag
    outcome = adfuller(table[input_col], maxlag, regression, autolag)
    # The sixth result element is only read when a lag-selection criterion
    # was requested.
    with_icbest = autolag is not None

    model = dict()
    rb = BrtcReprBuilder()
    if with_icbest:
        template = """
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        | - The maximized information criterion if autolag is not None : {icbest}
        |
        """
        rendered = template.format(adf=outcome[0],
                                   p_value=outcome[1],
                                   usedlag=outcome[2],
                                   nobs=outcome[3],
                                   critical_values=outcome[4],
                                   icbest=outcome[5])
    else:
        template = """
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        |
        """
        rendered = template.format(adf=outcome[0],
                                   p_value=outcome[1],
                                   usedlag=outcome[2],
                                   nobs=outcome[3],
                                   critical_values=outcome[4])
    rb.addMD(strip_margin(rendered))

    for position, key in enumerate(
            ('adf', 'p_value', 'usedlag', 'nobs', 'critical_values')):
        model[key] = outcome[position]
    if with_icbest:
        model['icbest'] = outcome[5]
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
示例#3
0
def _hierarchical_clustering_post(table,
                                  model,
                                  num_clusters,
                                  cluster_col='prediction'):
    """Cut a fitted hierarchical clustering into flat clusters and report them.

    Args:
        table: Original input table; copied and labelled when the model's
            ``input_mode`` is ``'original'``.
        model: Trained model dict. The keys read here are ``model``,
            ``input_mode``, ``linkage_matrix``, ``parameters`` and, in
            ``'matrix'`` mode, ``dist_matrix``.
        num_clusters: Passed to ``fcluster`` as ``t`` with
            ``criterion='maxclust'``.
        cluster_col: Name of the column that receives the cluster labels.

    Returns:
        ``{'out_table': prediction_table, 'model': out_model}`` where
        ``out_model`` holds ``clusters_info`` and ``_repr_brtc_``.

    Raises:
        ValueError: If ``input_mode`` is neither ``'original'`` nor
            ``'matrix'``.
    """
    Z = model['model']
    mode = model['input_mode']
    out_table = model['linkage_matrix']

    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        # Work on a copy: the original wrote the prediction column straight
        # into model['dist_matrix'], silently mutating the caller's model.
        # NOTE(review): assumes dist_matrix is a DataFrame-like object with
        # ``.copy()`` - confirm against the training function.
        prediction_table = model['dist_matrix'].copy()
    else:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(
            "Unsupported input_mode {!r}; expected 'original' or 'matrix'."
            .format(mode))
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        # Look the leader id up in the first two columns of Z and take the
        # matching entry of the linkage table's joined_column1/2.
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    # Non-zero bincount entries come out in ascending label order, which
    # matches the table that was just sorted by label.
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    # Separate name so the input model is not shadowed by the output model.
    out_model = _model_dict('hierarchical_clustering_post')
    out_model['clusters_info'] = clusters_info_table
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': out_model}
示例#4
0
def _chi_square_test_of_independence(table, feature_cols, label_col, correction=False):
    """Run a chi-square test of independence of each feature against the label.

    For every column in ``feature_cols`` a contingency table against
    ``label_col`` is built and handed to ``stats.chi2_contingency``. The
    statistic, degrees of freedom and p-value are stored per feature and
    rendered into the report, together with a verdict at the 5% level.

    Args:
        table: Table indexed by column name.
        feature_cols: Iterable of feature column names.
        label_col: Name of the label column.
        correction: Forwarded to ``chi2_contingency``.

    Returns:
        ``{'model': model}`` with one ``result<idx>`` table per feature and
        the report under ``_repr_brtc_``.
    """
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Chi-square Test of Independence Result
    |  - H0: the two categorical variables are independent.
    |  - H1: the two categorical variables are dependent.
    """))

    model = _model_dict('chi_square_test_of_independence')

    for idx, feature_col in enumerate(feature_cols):
        with_margins = pd.crosstab(table[feature_col], table[label_col], margins=True)
        # Drop the last row and column (the margins added above) before
        # testing.
        n_rows = len(with_margins) - 1
        n_cols = len(with_margins.columns) - 1
        observed = np.array(with_margins.iloc[0:n_rows, 0:n_cols])

        outcome = stats.chi2_contingency(observed, correction=correction, lambda_=1)
        stat_chi, p_chi, dof = outcome[0], outcome[1], outcome[2]

        if math.isnan(p_chi):
            dependence = 'Independence of two categorical variables cannot be decided.'
        elif p_chi < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        else:
            dependence = 'No association was found between two categorical variables at 5% significance level.'

        result_table = pd.DataFrame(
            [dict(estimate=stat_chi, df=dof, p_value=p_chi)],
            columns=['estimate', 'df', 'p_value'])

        model['result{}'.format(idx)] = result_table

        rb.addMD(strip_margin("""
        |### Label: {label}, Feature: {feature}
        |###### Result Table {idx}
        |  
        |{result_table}
        |
        |{dependence}
        |
        |
        """.format(label=label_col, feature=feature_col, idx=idx, result_table=pandasDF2MD(result_table), dependence=dependence)))

    model['_repr_brtc_'] = rb.get()

    return {'model': model}
示例#5
0
文件: ancova.py 项目: yemode2k/studio
def _ancova(table, response_cols, factor_col, between_col):
    """Produce a box plot and an ANCOVA table for every response column.

    Args:
        table: Input table.
        response_cols: Iterable of dependent-variable column names.
        factor_col: Column passed to ``pg_ancova`` as ``covar``.
        between_col: Grouping column; used as the box-plot x axis and passed
            to ``pg_ancova`` as ``between``.

    Returns:
        ``{'result': result}`` where ``result`` has an (empty) entry per
        response column under ``_grouped_data`` and the report under
        ``_repr_brtc_``.
    """
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Analysis of Covariance Result
    """))

    groups = table[between_col].unique()
    groups.sort()

    # The combined length of all group labels decides how far the x tick
    # labels are rotated.
    sum_len = np.sum([len(str(group)) for group in groups])
    if sum_len > 512:
        tick_rotation = 90
    elif sum_len > 64:
        tick_rotation = 45
    else:
        tick_rotation = None

    result = {'_grouped_data': dict()}

    for response_col in response_cols:
        # Column lookup kept from the original even though the value itself
        # is not used afterwards.
        table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=between_col,
                         y=response_col,
                         data=table,
                         order=groups)
        if tick_rotation is not None:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=tick_rotation)

        fig_box = plt2MD(plt)
        plt.clf()

        ancova_df = pandasDF2MD(
            pg_ancova(data=table,
                      dv=response_col,
                      covar=factor_col,
                      between=between_col))

        section = """
        | ## {response_col} by {between_col}
        | {fig_box}
        |
        | ### ANCOVA
        | {ancova_df}
        """.format(response_col=response_col,
                   between_col=between_col,
                   fig_box=fig_box,
                   ancova_df=ancova_df)
        rb.addMD(strip_margin(section))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
示例#6
0
def _kruskal_wallis_test(table,
                         response_cols,
                         factor_col,
                         nan_policy='propagate'):
    """Run a Kruskal-Wallis test of each response column across factor groups.

    Args:
        table: Input table; grouped by ``factor_col``.
        response_cols: Iterable of response column names (strings).
        factor_col: Name of the grouping column (string).
        nan_policy: Forwarded to ``kruskal``. The parameter was previously
            accepted but never used.

    Returns:
        ``{'result': result}`` where ``result`` maps
        ``'<response>_<factor>'`` to a dict with ``'Statistics'`` and
        ``'P value'``, plus the report under ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    for response_col in response_cols:
        samples = [group[response_col] for group in groups.values()]
        # Honour the caller's nan_policy instead of silently dropping it.
        statistic, pval = kruskal(*samples, nan_policy=nan_policy)
        rb.addMD(
            strip_margin("""
        | ## {response_col} by {factor_col}
        |
        | ### Statistics value: {stats}
        |
        | ### P value: {pval}
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   stats=statistic,
                   pval=pval)))

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = statistic
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#7
0
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    """Run ``mannwhitneyu`` for every pair of factor levels and tabulate it.

    Args:
        table: Input table.
        response_col: Column holding the measured values.
        factor_col: Column whose distinct values define the groups.
        use_continuity: Forwarded to ``mannwhitneyu``.

    Returns:
        ``{'result': result}`` where ``result`` maps ``'<a> vs <b>'`` to a
        dict with ``'Statistics'`` and ``'P value'``, plus the report under
        ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")

    levels = table[factor_col].unique()
    # One array of response values per factor level.
    samples = {
        level: np.array(table[response_col])[np.where(table[factor_col] == level)]
        for level in levels
    }

    rows = []
    for first, second in itertools.combinations(levels, 2):
        label = str(first) + ' vs ' + str(second)
        u_stat, p_val = mannwhitneyu(samples[first], samples[second], use_continuity=use_continuity)
        rows.append((label, u_stat, p_val))
        result[label] = {'Statistics': u_stat, 'P value': p_val}

    summary = pd.DataFrame({
        '': [row[0] for row in rows],
        'Test Statistics': [row[1] for row in rows],
        'P Value': [row[2] for row in rows],
    })
    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(summary))))
    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#8
0
def _plot_roc_pr_curve(table, label_col, probability_col, fig_w=6.4, fig_h=4.8, pos_label=None):
    """Render the figures returned by ``_plot_binary`` into one report.

    Args:
        table: Input table.
        label_col: Column passed to ``_plot_binary`` as the labels.
        probability_col: Column passed to ``_plot_binary`` as the
            probabilities.
        fig_w: Figure width, forwarded as part of ``fig_size``.
        fig_h: Figure height, forwarded as part of ``fig_size``.
        pos_label: Forwarded to ``_plot_binary``.

    Returns:
        ``{'result': summary}`` with ``threshold``, ``label_col``,
        ``probability_col`` and the report under ``_repr_brtc_``.
    """
    plots = _plot_binary(table[label_col],
                         table[probability_col],
                         fig_size=(fig_w, fig_h),
                         pos_label=pos_label)
    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = plots

    summary = {
        'threshold': threshold,
        'label_col': label_col,
        'probability_col': probability_col,
    }

    figures = dict(fig_roc=fig_roc,
                   fig_tpr_fpr=fig_tpr_fpr,
                   fig_pr=fig_pr,
                   fig_precision_recall=fig_precision_recall,
                   fig_confusion=fig_confusion)
    report = """
    | ## Plot ROC Curve and PR Curve Result
    |
    | ### ROC Curve
    | {fig_tpr_fpr}
    | {fig_roc}
    |
    | ### PR Curve
    | {fig_precision_recall}
    | {fig_pr}
    |
    | ### Confusion Matrix
    | {fig_confusion}
    """.format(**figures)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
示例#9
0
def _ada_boost_classification_train(table,
                                    feature_cols,
                                    label_col,
                                    max_depth=1,
                                    n_estimators=50,
                                    learning_rate=1.0,
                                    algorithm='SAMME.R',
                                    random_state=None):
    """Train an AdaBoost classifier over decision trees and report on it.

    Args:
        table: Training table.
        feature_cols: List of feature column names.
        label_col: Name of the label column.
        max_depth: Depth limit of the ``DecisionTreeClassifier`` base
            estimator.
        n_estimators: Forwarded to ``AdaBoostClassifier``.
        learning_rate: Forwarded to ``AdaBoostClassifier``.
        algorithm: Forwarded to ``AdaBoostClassifier``.
        random_state: Forwarded to ``AdaBoostClassifier``.

    Returns:
        ``{'model': model}`` holding the fitted ``classifier``, its
        ``parameters`` (from ``get_params()``), the user-facing ``params``,
        a ``feature_importance_table`` and the report under ``_repr_brtc_``.
    """
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)

    # Only the base estimator is passed positionally; recent scikit-learn
    # releases reject the remaining hyper-parameters unless they are given
    # by keyword.
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)

    classifier.fit(x_train, y_train)
    feature_importance = classifier.feature_importances_

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_classification_model')
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    # Kept under its own name instead of rebinding ``params`` to a string.
    params_md = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params_md)))

    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]]
         for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
示例#10
0
def _ljung_box_test(table, input_cols, lags=None):
    """Run ``acorr_ljungbox`` on each input column and tabulate the outcome.

    Args:
        table: Input table.
        input_cols: Iterable of column names to test.
        lags: Forwarded to ``acorr_ljungbox``. When it is an iterable of
            specific lags, those lags label the result rows; otherwise
            (``None`` or a single number) rows are labelled ``1..n``.

    Returns:
        ``{'result': result}`` mapping every input column to its result
        DataFrame, plus the report under ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Ljung Box test Result""")

    for input_col in input_cols:
        # NOTE(review): unpacking assumes acorr_ljungbox returns a
        # (statistics, p-values) pair - confirm for the installed
        # statsmodels version.
        lbvalue, pvalue = acorr_ljungbox(x=table[input_col], lags=lags)

        # An explicit list of lags yields one statistic per requested lag,
        # so labelling the rows 1..n (as before) reported the wrong lags.
        try:
            lag_labels = list(lags)
        except TypeError:
            # None or a scalar: not iterable.
            lag_labels = None
        if lag_labels is None or len(lag_labels) != len(lbvalue):
            lag_labels = list(range(1, len(lbvalue) + 1))

        lb_res = dict()
        lb_res['lags'] = lag_labels
        lb_res['test statistic'] = lbvalue
        lb_res['p-value based on chi-square distribution'] = pvalue
        lb_res = pd.DataFrame(lb_res)

        rb.addMD(
            strip_margin("""
        | ## {input_col} test result
        |
        | {lb_res}
        """.format(input_col=input_col,
                   lb_res=pandasDF2MD(lb_res, num_rows=lb_res.shape[0]))))

        result[input_col] = lb_res

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#11
0
def _wilcoxon_test(table,
                   response_col,
                   factor_col,
                   zero_method='wilcox',
                   correction=False):
    """Run ``wilcoxon`` on the response values of every pair of factor groups.

    Args:
        table: Input table; grouped by ``factor_col``.
        response_col: Column holding the measured values.
        factor_col: Grouping column.
        zero_method: Forwarded to ``wilcoxon``.
        correction: Forwarded to ``wilcoxon``.

    Returns:
        ``{'result': result}`` mapping ``'<a>_<b>'`` to a dict with
        ``'Statistics'`` and ``'P value'``, plus the report under
        ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    groups = {key: frame for key, frame in table.groupby(factor_col)}

    for name1, name2 in itertools.combinations(groups.keys(), 2):
        rank_sum, p_val = wilcoxon(x=groups[name1][response_col],
                                   y=groups[name2][response_col],
                                   zero_method=zero_method,
                                   correction=correction)
        section = """
        | ## {name1} vs {name2}
        |
        | ### The sum of the ranks of the differences: {stats}
        |
        | ### The two-sided p-value for the test: {pval}
        """.format(name1=name1, name2=name2, stats=rank_sum, pval=p_val)
        rb.addMD(strip_margin(section))

        pair_key = str(name1) + '_' + str(name2)
        result[pair_key] = {'Statistics': rank_sum, 'P value': p_val}

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#12
0
def _timeseries_decomposition(table, input_col, frequency, model_type='additive', filteration=None, two_sided=True, extrapolate_trend=0):
    """Decompose a series with ``sm.tsa.seasonal_decompose`` and report it.

    Args:
        table: Input table; a copy of it is returned with extra columns.
        input_col: Column holding the series to decompose.
        frequency: Forwarded to ``seasonal_decompose`` as ``freq``.
        model_type: Forwarded as ``model``.
        filteration: Forwarded as ``filt``.
        two_sided: Forwarded as ``two_sided``.
        extrapolate_trend: Forwarded as ``extrapolate_trend``.

    Returns:
        ``{'out_table': out_table, 'model': model}`` where ``out_table``
        gains ``trend``, ``seasonal`` and ``residual`` columns and ``model``
        holds ``model_type`` and the report under ``_repr_brtc_``.
    """
    out_table = table.copy()
    decompose_options = dict(model=model_type,
                             filt=filteration,
                             freq=frequency,
                             two_sided=two_sided,
                             extrapolate_trend=extrapolate_trend)
    decomposition = sm.tsa.seasonal_decompose(out_table[input_col], **decompose_options)

    decomposition.plot()
    figure_md = plt2MD(plt)
    plt.clf()

    components = (('trend', decomposition.trend),
                  ('seasonal', decomposition.seasonal),
                  ('residual', decomposition.resid))
    for column_name, values in components:
        out_table[column_name] = values

    report = """
    | ## Time Series Decomposition Result
    | Model Type : {model_type}
    |
    | {image2}
    |
    """.format(model_type=model_type, image2=figure_md)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))

    model = _model_dict('timeseries_decomposition')
    model['model_type'] = model_type
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
示例#13
0
def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3, affinity='euclidean', compute_full_tree=True, linkage='ward', prediction_col='prediction', figw=6.4, figh=4.8):
    """Fit agglomerative clustering, label the rows and draw a dendrogram.

    Args:
        input_table: Input table. It is no longer modified; the labels are
            written to a copy.
        input_cols: Feature columns used for clustering.
        n_clusters: Forwarded to ``SKAgglomerativeClustering``.
        affinity: Forwarded to ``SKAgglomerativeClustering``.
        compute_full_tree: Forwarded to ``SKAgglomerativeClustering``.
        linkage: Forwarded to ``SKAgglomerativeClustering``.
        prediction_col: Name of the column that receives the cluster labels.
        figw: Dendrogram figure width.
        figh: Dendrogram figure height.

    Returns:
        ``{'out_table': out_table, 'agglomerative_result': result}`` where
        ``result`` holds the fitted ``model``, ``input_cols`` and the report
        under ``_repr_brtc_``.
    """
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, memory=None, connectivity=None, compute_full_tree=compute_full_tree, linkage=linkage)
    agglomerative_clustering.fit(inputarr)

    # Label a copy so the caller's table is left untouched, as the other
    # functions returning an ``out_table`` do.
    out_table = input_table.copy()
    out_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    # The dendrogram input uses the merge order (0, 1, 2, ...) as its
    # distance column and 2, 3, 4, ... as its observation-count column.
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {'model': agglomerative_clustering, 'input_cols': input_cols, '_repr_brtc_': rb.get()}

    return {'out_table': out_table, 'agglomerative_result': agglomerative_clustering_result}
示例#14
0
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    """Fit a ``LogisticRegression`` model and summarise its coefficients.

    Args:
        table: Training table.
        feature_cols: List of feature column names.
        label_col: Name of the label column; rejected through
            ``raise_error('0718', ...)`` when its target type is
            ``'continuous'``.
        penalty, dual, tol, C, fit_intercept, intercept_scaling,
        class_weight, random_state, solver, max_iter, multi_class, verbose,
        warm_start, n_jobs: Forwarded unchanged to ``LogisticRegression``.

    Returns:
        ``{'model': model}`` with the fitted estimator (``lr_model``), its
        ``intercept``, ``coefficients`` and ``class`` arrays, the chosen
        ``penalty`` and ``solver``, the ``features`` and ``label`` names and
        the report under ``_repr_brtc_``.
    """
    features = table[feature_cols]
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments: recent scikit-learn releases reject the positional
    # form for these constructor parameters.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling, class_weight=class_weight,
                                  random_state=random_state, solver=solver, max_iter=max_iter,
                                  multi_class=multi_class, verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # One summary row per feature, preceded by the intercept when one was
    # fitted. The leftover debug print() calls were removed.
    if fit_intercept:
        row_labels = ['intercept'] + feature_cols
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
    else:
        row_labels = feature_cols
        coef_trans = np.transpose(coefficients)

    # NOTE(review): in the binary case the single coefficient column is
    # labelled classes[0]; scikit-learn documents coef_ as belonging to the
    # second class - confirm which label is intended.
    column_labels = [classes[0]] if is_binary else classes
    summary = pd.concat((pd.DataFrame({'features': row_labels}),
                         pd.DataFrame(coef_trans, columns=column_labels)), axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary)
               )))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
示例#15
0
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    """Run ``mannwhitneyu`` for every pair of factor levels, one section each.

    Args:
        table: Input table.
        response_col: Column holding the measured values.
        factor_col: Column whose distinct values define the groups.
        use_continuity: Forwarded to ``mannwhitneyu``.

    Returns:
        ``{'result': result}`` mapping ``'<a>_<b>'`` to a dict with
        ``'Statistics'`` and ``'P value'``, plus the report under
        ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")

    levels = table[factor_col].unique()
    # One array of response values per factor level.
    samples = {
        level: np.array(table[response_col])[np.where(table[factor_col] == level)]
        for level in levels
    }

    for name1, name2 in itertools.combinations(levels, 2):
        u_stat, p_val = mannwhitneyu(samples[name1],
                                     samples[name2],
                                     use_continuity=use_continuity)
        section = """
        | ## {name1} vs {name2}
        |
        | ### Statistics U value: {stats}
        |
        | ### P value: {pval}
        """.format(name1=name1, name2=name2, stats=u_stat, pval=p_val)
        rb.addMD(strip_margin(section))

        pair_key = str(name1) + '_' + str(name2)
        result[pair_key] = {'Statistics': u_stat, 'P value': p_val}

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#16
0
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
              probability=True, tol=1e-3, max_iter=-1, random_state=None):
    """Fit an ``svm.SVC`` classifier and report its parameters.

    Args:
        table: Training table; a copy is used for fitting.
        feature_cols: Feature column names.
        label_col: Label column name; a ``'continuous'`` target type is
            rejected through ``raise_runtime_error``.
        c: Forwarded to ``SVC`` as ``C``; validated with
            ``greater_than(c, 0.0, 'c')``.
        kernel, degree, gamma, coef0, shrinking, probability, tol, max_iter,
        random_state: Forwarded unchanged to ``SVC``.

    Returns:
        ``{'model': model}`` holding the fitted estimator under
        ``svc_model``, the ``features`` names and the report under
        ``_repr_brtc_``.
    """
    validate(greater_than(c, 0.0, 'c'))

    working = table.copy()
    x_values = working[feature_cols]
    y_values = working[label_col]

    if sklearn_utils.multiclass.type_of_target(y_values) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    svc_options = dict(C=c, kernel=kernel, degree=degree, gamma=gamma,
                       coef0=coef0, shrinking=shrinking,
                       probability=probability, tol=tol, max_iter=max_iter,
                       random_state=random_state)
    estimator = svm.SVC(**svc_options)
    fitted = estimator.fit(x_values, y_values)

    # The report lists the estimator parameters plus the column names used.
    report_params = estimator.get_params()
    report_params['feature_cols'] = feature_cols
    report_params['label_col'] = label_col

    report = """
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(report_params))
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))

    _model = _model_dict('svc_model')
    _model['svc_model'] = fitted
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
示例#17
0
def _label_encoder2(table, input_cols, suffix='_index'):
    """Label-encode several columns, adding one encoded column per input.

    Args:
        table: Input table; a copy of it receives the encoded columns.
        input_cols: Names of the columns to encode.
        suffix: Appended to each input column name to name the new column.

    Returns:
        ``{'out_table': out_table, 'model': out_model}`` where ``out_model``
        holds the fitted ``label_encoders`` (one per input column, in
        order), ``input_cols`` and the report under ``_repr_brtc_``.
    """
    out_table = table.copy()
    encoders = []
    encoded_names = []
    class_counts = []
    for col in input_cols:
        encoder = LabelEncoder().fit(table[col])
        encoders.append(encoder)
        target_name = col + suffix
        encoded_names.append(target_name)
        class_counts.append(len(encoder.classes_))
        out_table[target_name] = encoder.transform(table[col])

    out_model = _model_dict('label_encoders')
    out_model['label_encoders'] = encoders
    out_model['input_cols'] = input_cols

    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['No. distinct classes'] = class_counts
    summary_table['New column names'] = encoded_names

    report = """
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD({"Input columns": input_cols, "Suffix": suffix}),
               summary_table=pandasDF2MD(summary_table))
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))
    out_model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': out_model}
示例#18
0
def _wilcoxon_test2(table,
                    first_col,
                    second_col,
                    zero_method='wilcox',
                    correction=False):
    """Run ``wilcoxon`` on two columns and render a one-row result table.

    Args:
        table: Input table.
        first_col: Column passed to ``wilcoxon`` as ``x``.
        second_col: Column passed to ``wilcoxon`` as ``y``.
        zero_method: Forwarded to ``wilcoxon``.
        correction: Forwarded to ``wilcoxon``.

    Returns:
        ``{'result': result}`` where ``result`` contains only the report
        under ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    rank_sum, p_val = wilcoxon(x=table[first_col],
                               y=table[second_col],
                               zero_method=zero_method,
                               correction=correction)

    result_table = pd.DataFrame({
        'Alternative hypothesis': ['Median of the differences != 0'],
        'Sum of differences ranks': [rank_sum],
        'P-value': [p_val]
    })

    report = """
    | {table}
    """.format(table=pandasDF2MD(result_table))
    rb.addMD(strip_margin(report))
    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#19
0
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    """Run a Kruskal-Wallis test per response column and tabulate the results.

    Args:
        table: Input table; grouped by ``factor_col``.
        response_cols: Iterable of response column names (strings).
        factor_col: Name of the grouping column (string).
        nan_policy: Forwarded to ``kruskal``. The parameter was previously
            accepted but never used.

    Returns:
        ``{'result': result}`` where ``result`` maps
        ``'<response>_<factor>'`` to a dict with ``'Statistics'`` and
        ``'P value'``, plus the report under ``_repr_brtc_``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    group_name = []
    # Degrees of freedom reported per row: number of groups minus one.
    df = [len(groups) - 1] * len(response_cols)
    stats = []
    pvals = []
    for response_col in response_cols:
        samples = [group[response_col] for group in groups.values()]
        # Honour the caller's nan_policy instead of silently dropping it.
        stat, pval = kruskal(*samples, nan_policy=nan_policy)
        group_name.append(response_col + ' by ' + factor_col)
        stats.append(stat)
        pvals.append(pval)

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name,
                                                'Degree of Freedom': df,
                                                'Test Statistics': stats,
                                                'P value': pvals})))))
    result['_repr_brtc_'] = rb.get()

    return {'result': result}
示例#20
0
def _paired_ttest(table, first_column, second_column, alternative, hypothesized_difference=0, confidence_level=0.95):
    """Run a paired t-test between two columns for the requested alternatives.

    Args:
        table: Input table.
        first_column: Name of the first sample column.
        second_column: Name of the second sample column.
        alternative: Container (or string) checked for the entries
            ``'greater'``, ``'less'`` and ``'twosided'``; one result row is
            produced for each entry found.
        hypothesized_difference: Added to the second column before calling
            ``ttest_rel``.
        confidence_level: Confidence level of the reported intervals.

    Returns:
        ``{'model': model}`` with ``degree_of_freedom``,
        ``mean_of_differences``, ``standard_deviation``, ``t_value``, the
        ``summary`` table and the report under ``_repr_brtc_``.
    """
    df = len(table) - 1
    first_col = table[first_column]
    second_col = table[second_column]

    differences = first_col - second_col
    diff_mean = differences.mean()
    std_dev = np.std(differences)
    # One call yields both the t statistic and the two-sided p-value.
    ttest_outcome = stats.ttest_rel(first_col, second_col + hypothesized_difference)
    t_value = ttest_outcome[0]

    alternative_hypothesis = []
    p_value = []
    confidence_interval = []

    if 'greater' in alternative:
        alternative_hypothesis.append('true difference in means > ' + str(hypothesized_difference))
        p_value.append(stats.t.sf(t_value, df))
        # np.inf replaces the np.Infinity alias, which NumPy 2.0 removed.
        confidence_interval.append((diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df), np.inf))

    if 'less' in alternative:
        alternative_hypothesis.append('true difference in means < ' + str(hypothesized_difference))
        p_value.append(stats.t.cdf(t_value, df))
        confidence_interval.append((-np.inf, diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)))

    if 'twosided' in alternative:
        alternative_hypothesis.append('true difference in means != ' + str(hypothesized_difference))
        p_value.append(ttest_outcome[1])
        other_term = std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
        confidence_interval.append((diff_mean - other_term, diff_mean + other_term))

    # DataFrame.from_items was removed in pandas 1.0; an explicit column list
    # keeps the same column order with the regular constructor.
    interval_label = '%g%% confidence Interval' % (confidence_level * 100)
    result_table = pd.DataFrame(
        {
            'alternative hypothesis': alternative_hypothesis,
            'p-value': p_value,
            interval_label: confidence_interval,
        },
        columns=['alternative hypothesis', 'p-value', interval_label])

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation : {sd}
    |##### t-value : {tv}
    |
    |#### Summary
    |
    |{result_table}
    |
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, tv=t_value, result_table=pandasDF2MD(result_table))))

    model = dict()
    model['_repr_brtc_'] = rb.get()
    model['degree_of_freedom'] = df
    model['mean_of_differences'] = diff_mean
    model['standard_deviation'] = std_dev
    model['t_value'] = t_value
    model['summary'] = result_table

    return {'model': model}
示例#21
0
def _ada_boost_regression_train(table,
                                feature_cols,
                                label_col,
                                max_depth=3,
                                n_estimators=50,
                                learning_rate=1.0,
                                loss='linear',
                                random_state=None):
    """Train an AdaBoost regressor over decision trees and report on it.

    Args:
        table: Training table.
        feature_cols: Feature columns; resolved through ``check_col_type``
            into feature names and the training matrix.
        label_col: Name of the label column.
        max_depth: Depth limit of the ``DecisionTreeRegressor`` base
            estimator.
        n_estimators: Forwarded to ``AdaBoostRegressor``.
        learning_rate: Forwarded to ``AdaBoostRegressor``.
        loss: Forwarded to ``AdaBoostRegressor``.
        random_state: Forwarded to ``AdaBoostRegressor``.

    Returns:
        ``{'model': model}`` holding the fitted ``regressor``, its
        ``parameters`` (from ``get_params()``), the user-facing ``params``,
        a ``feature_importance_table`` and the report under ``_repr_brtc_``.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    # Only the base estimator is passed positionally; recent scikit-learn
    # releases reject the remaining hyper-parameters unless they are given
    # by keyword.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)

    regressor.fit(x_train, y_train)
    feature_importance = regressor.feature_importances_

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    # Kept under its own name instead of rebinding ``params`` to a string.
    params_md = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params_md)))

    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[feature_names[i], feature_importance[i]]
         for i in range(len(feature_names))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
示例#22
0
文件: bow.py 项目: steelblu/studio
def _bow(table,
         input_col,
         add_words=None,
         no_below=1,
         no_above=0.8,
         keep_n=10000):
    """Build a bag-of-words dictionary from one column and tabulate it.

    Args:
        table: Table whose ``input_col`` column is converted with
            ``.tolist()`` and handed to ``Dictionary`` as the documents
            (presumably each cell is a list of tokens -- confirm with
            callers).
        input_col: Name of the column holding the documents.
        add_words: Optional extra tokens; when given they are registered
            as one additional document before filtering.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.
        keep_n: Forwarded to ``Dictionary.filter_extremes``.

    Returns:
        ``{'model': model, 'out_table': out_table}`` where ``out_table``
        has the columns ``token`` and ``document_frequency`` and ``model``
        additionally carries the fitted dictionary, ``add_words`` and the
        markdown report.
    """
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words is not None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n,
                               keep_tokens=None)

    params = {
        'Input Column': input_col,
        'Minimum Number of Occurrence': no_below,
        'Maximum Fraction of Occurrence': no_above,
        'Keep N most Frequent': keep_n
    }

    empty_description = ''
    if not dictionary.dfs:
        # Every token was filtered out: return an empty, correctly shaped table.
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = 'Out table is empty since parameter \"Minimum Number of Occurrence\" is greater than the maximum of document frequency.'
    else:
        token2id = dictionary.token2id
        tokens = list(token2id)
        # Look each frequency up through the token's own id instead of
        # pairing two independently ordered sequences by position, so a
        # token can never be matched with another token's count.
        frequencies = [dictionary.dfs[token2id[token]] for token in tokens]
        # The index stays keyed by token, as in the previous implementation.
        out_table = pd.DataFrame(
            {'token': tokens, 'document_frequency': frequencies},
            index=tokens,
            columns=['token', 'document_frequency'])

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        |# Bag of Words Result
        |### Parameters
        |
        | {display_params}
        |
        | {description}
        |
        """.format(display_params=dict2MD(params),
                   description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
示例#23
0
def _holt_winters_train(table, input_cols, period, model_type='additive'):
    """Fit one ``ExponentialSmoothing`` model per input column.

    Args:
        table: Table providing the series; also stored in the model under
            ``'origin_table'``.
        input_cols: Iterable of column names; each column is fitted
            independently.
        period: Passed as ``seasonal_periods`` to ``ExponentialSmoothing``.
        model_type: Used for both the ``trend`` and the ``seasonal``
            argument of ``ExponentialSmoothing``.

    Returns:
        ``{'model': model}``. For every column ``c`` the model holds the
        fitted result under ``'hw_' + str(c)`` and its SSE/AIC/BIC under
        ``'sse_'``/``'aic_'``/``'bic_'`` + ``str(c)``, plus the input
        columns, model type, period and the markdown report.
    """
    rb = BrtcReprBuilder()
    model = _model_dict('holt_winters_train')
    # Stored once, outside the loop: the value does not depend on the
    # column, and this way it is present even when input_cols is empty.
    model['origin_table'] = table
    rb.addMD(
        strip_margin("""
        |
        |## Holt-Winters Train Result
        |
        """))

    for column in input_cols:
        hw = ExponentialSmoothing(table[column],
                                  trend=model_type,
                                  seasonal=model_type,
                                  seasonal_periods=period).fit()
        model['hw_' + str(column)] = hw

        rb.addMD(
            strip_margin("""
        |
        |### Column : {col}
        |
        | - Model Type : {mt}
        | - Period : {pd}
        | - SSE : {sse}
        | - AIC : {aic}
        | - BIC : {bic}
        |
        """.format(col=column,
                   mt=model_type,
                   pd=period,
                   sse=hw.sse,
                   aic=hw.aic,
                   bic=hw.bic)))
        model['sse_' + str(column)] = hw.sse
        model['aic_' + str(column)] = hw.aic
        model['bic_' + str(column)] = hw.bic

    model['input_columns'] = input_cols
    model['_repr_brtc_'] = rb.get()
    model['model_type'] = model_type
    model['period'] = period

    return {'model': model}
示例#24
0
 def test_var_pop(self):
     # var_pop over sepal_length must match the fixed reference value
     # to 10 decimal places.
     sql = strip_margin('''
     | select var_pop(sepal_length) as var_pop_sepal_length from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 0.6811222222222235
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='var_pop gives a wrong result.')
示例#25
0
 def test_split(self):
     # Splitting the first row's species on 't' must yield ['se', 'osa'].
     sql = strip_margin('''
     | select split(species, 't') from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = ['se', 'osa']
     self.assertEqual(expected, out.values[0][0],
                      msg='split gives a wrong result.')
示例#26
0
 def test_exp2(self):
     # exp2 of the first row's sepal_length must match the fixed reference
     # value to 10 decimal places.
     sql = strip_margin('''
     | select exp2(sepal_length), exp2(10) from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 34.29675080116137
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='exp2 gives a wrong result.')
示例#27
0
 def test_exp(self):
     # exp of the first row's sepal_length must match the fixed reference
     # value to 10 decimal places.
     sql = strip_margin('''
     | select exp(sepal_length), exp(10) from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 164.0219072999017
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='exp gives a wrong result.')
示例#28
0
 def test_log2(self):
     # log2 of the first row's sepal_length must match the fixed reference
     # value to 10 decimal places.
     sql = strip_margin('''
     | select log2(sepal_length), log2(10) from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 2.350497247084133
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='log2 gives a wrong result.')
示例#29
0
 def test_log10(self):
     # log10 of the first row's sepal_length must match the fixed reference
     # value to 10 decimal places.
     sql = strip_margin('''
     | select log10(sepal_length), log10(10) from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 0.7075701760979364
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='log10 gives a wrong result.')
示例#30
0
 def test_pi(self):
     # sepal_length + pi() for the first row must match the fixed reference
     # value to 10 decimal places.
     sql = strip_margin('''
     | select sepal_length + pi() from #{DF(0)}
     ''')
     out = sql_execute(df_iris, sql)['out_table']
     print(out)
     expected = 8.241592653589793
     self.assertAlmostEqual(expected, out.values[0][0], places=10,
                            msg='pi gives a wrong result.')