Example #1
def default(self, obj):
    # TODO add more support types
    if isinstance(obj, set):
        return list(obj)
    elif isinstance(obj, numpy.ndarray):
        return _to_default_list(obj)
    else:
        rb = BrtcReprBuilder()
        rb.addRawTextMD(str(obj))
        return {'type': 'python object', '_repr_brtc_': rb.get()}
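A minimal sketch of how a default hook like the one above is typically plugged into the standard json module via a JSONEncoder subclass. The class name BrtcJsonEncoder and the sample data are assumptions for illustration only; the ndarray branch uses tolist() as in Example #6, since _to_default_list is not shown here.

import json
import numpy

class BrtcJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        # json.dumps calls default() only for objects it cannot serialize itself.
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return super().default(obj)

print(json.dumps({'values': numpy.arange(3), 'tags': {'a', 'b'}}, cls=BrtcJsonEncoder))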
Example #2
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    
    rb = BrtcReprBuilder()
    
    profile = pd_profiling.ProfileReport(table, bins=bins, check_correlation=check_correlation, correlation_threshold=correlation_threshold, correlation_overrides=correlation_overrides)
    rb.addHTML(profile.html)
    summary = dict()
    summary['_repr_brtc_'] = rb.get()
    
    return {'result': summary}
Example #3
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)
        
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
        
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
             max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
             verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    
    k_means.fit(inputarr)
    
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm}
    
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
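The report templates in these examples are written with a | margin at the start of each line and passed through strip_margin, a brightics helper whose implementation is not shown here. A rough sketch of what it presumably does, modeled on Scala's stripMargin and offered only as an assumption, is:

import re

def strip_margin(text):
    # Assumed behaviour: remove leading whitespace up to and including the
    # '|' margin character on each line; the real brightics helper may differ.
    return re.sub(r'^[ \t]*\|', '', text, flags=re.MULTILINE)

print(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: 7.
    """))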
Example #4
def _discretize_quantile(table,
                         input_col,
                         num_of_buckets=2,
                         out_col_name='bucket_number'):
    out_table = table.copy()
    out_table[out_col_name], buckets = pd.qcut(table[input_col],
                                               num_of_buckets,
                                               labels=False,
                                               retbins=True,
                                               precision=10,
                                               duplicates='drop')

    params = {
        'input_col': input_col,
        'num_of_buckets': num_of_buckets,
        'out_col_name': out_col_name
    }

    cnt = Counter(out_table[out_col_name].values)

    # index_list, bucket_list
    index_list = []
    bucket_list = []
    cnt_list = []
    for i in range(len(buckets) - 1):
        left = '[' if i == 0 else '('
        index_list.append(i)
        cnt_list.append(cnt[i])
        bucket_list.append("{left}{lower}, {upper}]".format(
            left=left, lower=buckets[i],
            upper=buckets[i + 1]))  # 'buckets' is tuple type data.

    # Build result table
    result = pd.DataFrame.from_items([['bucket number', index_list],
                                      ['buckets', bucket_list],
                                      ['count', cnt_list]])

    # Build model
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Quantile-based Discretization Result
    | ### Result
    | {result}
    |
    | ### Parameters
    | {params} 
    """.format(result=pandasDF2MD(result), params=dict2MD(params))))

    model = _model_dict('discretize_quantile')
    model['result'] = result
    model['params'] = params
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
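For reference, a standalone sketch of the pd.qcut call this function builds on, with made-up values; labels=False together with retbins=True returns both the bucket code for each row and the quantile edges used to describe the buckets.

import pandas as pd

values = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
codes, edges = pd.qcut(values, 4, labels=False, retbins=True, duplicates='drop')
# codes holds the bucket number (0..3) per value; edges holds the 5 bin boundaries.
print(codes.tolist(), edges)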
Example #5
def _evaluate_regression(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    evs = explained_variance_score(label, predict)
    mse = mean_squared_error(label, predict)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(label, predict)
    mape = _mean_absolute_percentage_error(label, predict)
    mdae = median_absolute_error(label, predict)
    r2 = r2_score(label, predict)

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['r2_score'] = r2
    summary['mean_squared_error'] = mse
    summary['root_mean_squared_error'] = rmse
    summary['mean_absolute_error'] = mae
    summary['median_absolute_error'] = mdae
    summary['explained_variance_score'] = evs

    # report
    all_dict_list = [{
        'r2_score': r2,
        'mean_squared_error': mse,
        'root_mean_squared_error': rmse,
        'mean_absolute_error': mae,
        'mean_absolute_percentage_error': mape,
        'median_absolute_error': mdae,
        'explained_variance_score': evs
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[[
        'r2_score', 'mean_squared_error', 'root_mean_squared_error',
        'mean_absolute_error', 'mean_absolute_percentage_error',
        'median_absolute_error', 'explained_variance_score'
    ]]
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Regression Result
    | ### Metrics
    | {table1}
    |
    |
    """.format(table1=pandasDF2MD(all_df))))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
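_mean_absolute_percentage_error is referenced above but not defined in this example. A common definition, given here purely as an assumption about what the helper computes, is:

import numpy as np

def _mean_absolute_percentage_error(y_true, y_pred):
    # Assumed definition: mean of |(true - predicted) / true|, scaled to percent.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100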
Example #6
def default(self, obj):
    if isinstance(obj, set):
        return list(obj)
    elif isinstance(obj, numpy.ndarray):
        return obj.tolist()
    # TODO add more support types
    else:
    # elif hasattr(obj, '__str__'):
        rb = BrtcReprBuilder()
        rb.addRawTextMD(str(obj))
        return {'type': 'python object', '_repr_brtc_': rb.get()}
Example #7
def _random_forest_classification_train(table, feature_cols, label_col,
                                 n_estimators=10, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0, max_features="sqrt",
                                 max_leaf_nodes=None, min_impurity_decrease=0, bootstrap=True, oob_score=False,
                                 n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):   
    
    X_train = table[feature_cols]
    y_train = table[label_col]   
    
    if max_features == "None":
        max_features = None
            
    classifier = RandomForestClassifier(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, min_impurity_decrease)  # , bootstrap, oob_score, n_jobs, random_state, verbose, warm_start, class_weight)
    classifier.fit(X_train, y_train) 

    params = {'feature_cols': feature_cols,
             'label_col': label_col,
             'n_estimators': n_estimators,
             'criterion': criterion,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'min_weight_fraction_leaf': min_weight_fraction_leaf,
             'max_features': max_features,
             'max_leaf_nodes': max_leaf_nodes,
             'min_impurity_decrease': min_impurity_decrease,
             'bootstrap': bootstrap,
             'oob_score': oob_score,
             'n_jobs': n_jobs,
             'random_state': random_state,
             'verbose': verbose,
             'warm_start': warm_start,
             'class_weight': class_weight}
    
    model = dict()
    # get_param = classifier.get_params()
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(feature_cols, classifier)
           
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=fig_feature_importances))) 
        
    model['_repr_brtc_'] = rb.get()   
               
    return {'model' : model}
Example #8
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None):
    validate(greater_than(c, 0.0, 'c'))

    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if (sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous'):
        raise_runtime_error('''Label Column should not be continuous.''')

    _svc = svm.SVC(C=c,
                   kernel=kernel,
                   degree=degree,
                   gamma=gamma,
                   coef0=coef0,
                   shrinking=shrinking,
                   probability=probability,
                   tol=tol,
                   max_iter=max_iter,
                   random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
Example #9
def _scale(table, input_cols, scaler, suffix=None):
    if scaler == 'RobustScaler':
        if suffix is None:
            suffix = '_robust'
        scale = RobustScaler()
    elif scaler == 'StandardScaler':
        if suffix is None:
            suffix = '_standard'
        scale = StandardScaler()
    elif scaler == 'MaxAbsScaler':
        if suffix is None:
            suffix = '_max_abs'
        scale = MaxAbsScaler()
    else:  # minmax
        if suffix is None:
            suffix = '_min_max'
        scale = MinMaxScaler()

    scaled_cols = []
    for col in input_cols:
        scaled_cols.append(col + suffix)

    out_table = table.copy()
    scaled_table = scale.fit_transform(out_table[input_cols])
    out_table[scaled_cols] = pd.DataFrame(data=scaled_table)

    out_model = _model_dict('scaler')
    out_model['input_cols'] = input_cols
    out_model['used_scaler'] = scaler
    out_model['scaler'] = scale
    out_model['suffix'] = suffix
    rb = BrtcReprBuilder()
    params = {
        "Input columns": input_cols,
        "Normalization method": scaler,
        "Suffix": suffix
    }
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['Normalization method'] = [scaler] * len(input_cols)
    summary_table['New column names'] = scaled_cols
    rb.addMD(
        strip_margin("""
    | ## Scaler Model
    | ### Parameters
    | {params}
    |
    | ### Summary table
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': out_model}
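A hypothetical call of _scale on a small DataFrame, assuming the function above and its brightics helpers are importable; the data and column names are made up. With suffix=None the scaler-specific default suffix is appended to the new column names.

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
res = _scale(df, input_cols=['a', 'b'], scaler='StandardScaler')
out = res['out_table']   # original columns plus 'a_standard' and 'b_standard'
model = res['model']     # holds the fitted StandardScaler and the report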
Example #10
def _unit_root_test(table,
                    input_col,
                    maxlag=None,
                    regression='c',
                    autolag='AIC'):
    if autolag == 'None':
        autolag = None
    result = adfuller(table[input_col], maxlag, regression, autolag)
    model = dict()
    if autolag is not None:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        | - The maximized information criterion if autolag is not None : {icbest}
        |
        """.format(adf=result[0],
                   p_value=result[1],
                   usedlag=result[2],
                   nobs=result[3],
                   critical_values=result[4],
                   icbest=result[5])))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        |
        """.format(adf=result[0],
                   p_value=result[1],
                   usedlag=result[2],
                   nobs=result[3],
                   critical_values=result[4])))
    model['adf'] = result[0]
    model['p_value'] = result[1]
    model['usedlag'] = result[2]
    model['nobs'] = result[3]
    model['critical_values'] = result[4]
    if autolag is not None:
        model['icbest'] = result[5]
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
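A minimal direct use of statsmodels' adfuller on a made-up series, showing the tuple layout the function above indexes into; the sixth element, icbest, is only returned when autolag is not None.

import numpy as np
from statsmodels.tsa.stattools import adfuller

series = np.cumsum(np.random.randn(200))  # a random walk, so the unit-root null should not be rejected
adf_stat, p_value, used_lag, n_obs, critical_values, icbest = adfuller(series, autolag='AIC')
print(p_value, critical_values)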
Example #11
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    feature_names, inputarr = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(inputarr)
        
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
             max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
             verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    
    k_means.fit(inputarr)
    
    params = {'input_cols':feature_names, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm, 'n_samples':n_samples}
    
    cluster_centers = k_means.cluster_centers_
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = k_means.labels_
    
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
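A hypothetical call of _kmeans_train_predict on a toy DataFrame, assuming the function above and its brightics helpers are available; the data are made up. The returned dict carries the labeled table under 'out_table' and the fitted model plus report under 'model'.

import pandas as pd

df = pd.DataFrame({'x': [1.0, 1.2, 5.1, 5.3], 'y': [0.9, 1.1, 4.8, 5.2]})
res = _kmeans_train_predict(df, input_cols=['x', 'y'], n_clusters=2, seed=0)
out_table = res['out_table']   # original columns plus the 'prediction' labels
kmeans_model = res['model']    # dict holding the fitted SKKMeans and the report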
Example #12
def _tfidf_gensim(table,
                  input_col,
                  output_col_name="sparse_vectors",
                  tf_weighing='n',
                  df_weighing='t',
                  document_normalization='c'):

    out_table = table.copy()
    tokens = out_table[input_col]
    smartirs = tf_weighing + df_weighing + document_normalization

    dictionary = Dictionary(tokens)
    word_count_vector_list = [dictionary.doc2bow(text) for text in tokens]

    tfidf_model = TfidfModel(word_count_vector_list, smartirs=smartirs)
    tfidf_vector_list = [*tfidf_model[word_count_vector_list]]

    sparse_matrix = corpus2csc(tfidf_vector_list,
                               num_terms=len(dictionary.token2id)).T

    rb = BrtcReprBuilder()

    dictionary_data = [[
        index, word, tfidf_model.dfs[index], tfidf_model.idfs[index]
    ] for index, word in dictionary.items()]
    dictionary_table = pd.DataFrame(data=dictionary_data,
                                    columns=['index', 'word', 'count', 'idf'])
    dictionary_table = dictionary_table.sort_values(["count"],
                                                    ascending=[False])

    rb.addMD(
        strip_margin("""
    | ## TFIDF Result
    | ### Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(dictionary_table))))

    out_table[output_col_name] = csr_matrix_to_sparse_vector_json_list(
        sparse_matrix)

    model = _model_dict('tfidf_model')
    model['dictionary_table'] = dictionary_table
    model['dictionary'] = dictionary
    model['tfidf_model'] = tfidf_model
    model['input_col'] = input_col
    model['output_col_name'] = output_col_name
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Example #13
def _ada_boost_classification_train(table, feature_cols, label_col, max_depth=1,
                                    n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
    
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)

    classifier = AdaBoostClassifier(base_estimator, n_estimators, learning_rate, algorithm, random_state)

    classifier.fit(x_train, y_train)

    params = {'feature_cols': feature_cols,
              'label_col': label_col,
              'feature_importance': classifier.feature_importances_,
              'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'algorithm': algorithm,
              'random_state': random_state}
    
    model = _model_dict('ada_boost_classification_model')
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params
               )))

    model['_repr_brtc_'] = rb.get()
    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame([[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))], columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
Example #14
def agglomerative_clustering_train_predict(input_table,
                                           input_cols,
                                           n_clusters=3,
                                           affinity='euclidean',
                                           compute_full_tree=True,
                                           linkage='ward',
                                           prediction_col='prediction',
                                           figw=6.4,
                                           figh=4.8):
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(
        n_clusters=n_clusters,
        affinity=affinity,
        memory=None,
        connectivity=None,
        compute_full_tree=compute_full_tree,
        linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    input_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance,
                                      no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        '_repr_brtc_': rb.get()
    }

    return {
        'out_table': input_table,
        'agglomerative_result': agglomerative_clustering_result
    }
Example #15
def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    doc_to_bow = []
    for i in range(len(corpus)):
        token_cnt = []
        for j in range(len(bow_corpus[i])):
            token_cnt.append('({token}, {cnt})'.format(
                token=dictionary[bow_corpus[i][j][0]],
                cnt=bow_corpus[i][j][1]))
        doc_to_bow.append(token_cnt)
    doc_to_bow_list = []
    for doc in doc_to_bow:
        doc_to_bow_list.append('{}'.format(list(doc)))

    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = [term for term in dictionary.token2id.keys()]

    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)).T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)))
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
Example #16
def _wilcoxon_test(table,
                   response_col,
                   factor_col,
                   zero_method='wilcox',
                   correction=False):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group
    for name1, name2 in itertools.combinations(groups.keys(), 2):
        stats, pval = wilcoxon(x=groups[name1][response_col],
                               y=groups[name2][response_col],
                               zero_method=zero_method,
                               correction=correction)
        rb.addMD(
            strip_margin("""
        | ## {name1} vs {name2}
        |
        | ### The sum of the ranks of the differences: {stats}
        |
        | ### The two-sided p-value for the test: {pval}
        """.format(name1=name1, name2=name2, stats=stats, pval=pval)))

        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Example #17
def _kruskal_wallis_test(table,
                         response_cols,
                         factor_col,
                         nan_policy='propagate'):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    for response_col in response_cols:
        stats, pval = kruskal(*[x[response_col] for x in groups.values()])
        rb.addMD(
            strip_margin("""
        | ## {response_col} by {factor_col}
        |
        | ### Statistics value: {stats}
        |
        | ### P value: {pval}
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   stats=stats,
                   pval=pval)))

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Example #18
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")
    
    groups = dict()
    uniq_factor = table[factor_col].unique()
    for name in uniq_factor:
        groups[name] = np.array(table[response_col])[np.where(table[factor_col] == name)]
    group_name = []
    stats = []
    pvals = []
    for name1, name2 in itertools.combinations(uniq_factor, 2):
        name = str(name1) + ' vs ' + str(name2)
        stat, pval = mannwhitneyu(groups[name1], groups[name2], use_continuity=use_continuity)
        group_name.append(name)
        stats.append(stat)
        pvals.append(pval)
            
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval
        
    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name, 'Test Statistics': stats, 'P Value': pvals})))))
    result['_repr_brtc_'] = rb.get()
        
    return {'result': result}
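For context, a standalone sketch of the underlying scipy call on two made-up samples; mannwhitneyu returns the U statistic and a p-value, which is what the function above tabulates per factor pair.

from scipy.stats import mannwhitneyu

group_a = [1.1, 2.3, 1.9, 2.8, 3.0]
group_b = [2.9, 3.8, 4.1, 3.5, 4.4]
u_stat, p_value = mannwhitneyu(group_a, group_b, use_continuity=True)
print(u_stat, p_value)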
Example #19
def _ljung_box_test(table, input_cols, lags=None):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Ljung Box test Result""")

    for input_col in input_cols:
        lbvalue, pvalue = acorr_ljungbox(x=table[input_col], lags=lags)

        lb_res = dict()
        lb_res['lags'] = range(1, len(lbvalue) + 1)
        lb_res['test statistic'] = lbvalue
        lb_res['p-value based on chi-square distribution'] = pvalue
        lb_res = pd.DataFrame(lb_res)

        rb.addMD(
            strip_margin("""
        | ## {input_col} test result
        |
        | {lb_res}
        """.format(input_col=input_col,
                   lb_res=pandasDF2MD(lb_res, num_rows=lb_res.shape[0]))))

        result[input_col] = lb_res

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Example #20
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")
    
    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group
        
    group_name = []
    df = [len(groups) - 1] * len(response_cols)
    stats = []
    pvals = []
    for response_col in response_cols:
        stat, pval = kruskal(*[x[response_col] for x in groups.values()])
        group_name.append(response_col + ' by ' + factor_col)
        stats.append(stat)
        pvals.append(pval)
            
        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval
        
    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name, 
                                                'Degree of Freedom': df, 
                                                'Test Statistics': stats, 
                                                'P value': pvals})))))
    result['_repr_brtc_'] = rb.get()
        
    return {'result': result}
Example #21
def _function_by_group2(function,
                        table=None,
                        model=None,
                        columns=None,
                        group_by=None,
                        **params):
    if isinstance(model, dict) and '_grouped_data' not in model:
        raise Exception('Unsupported model. model requires _grouped_data.')
    if isinstance(model, dict):
        groups = model['_grouped_data']['groups']
        group_by = model['_grouped_data']['group_by']
    if isinstance(table, pd.DataFrame):
        table, groups = _group(
            table, params,
            group_by)  # use group keys from table even there is a model.
    sample_result = _sample_result(function, table, model, params, groups)
    res_keys, df_keys, model_keys_containing_repr = _info_from_sample_result(
        sample_result, group_by, groups)
    res_dict, success_keys = _function_by_group_key(function, table, model,
                                                    params, groups, res_keys,
                                                    group_by)
    for repr_key in model_keys_containing_repr:
        rb = BrtcReprBuilder()
        for group in success_keys:
            rb.addMD(
                '--- \n\n ### Group by {group_by} : {tmp_group}\n\n---'.format(
                    group_by=group_by, tmp_group=group))
            rb.merge(res_dict[repr_key]['_grouped_data']['data'][tuple(group)]
                     ['_repr_brtc_'])
        res_dict[repr_key]['_repr_brtc_'] = rb.get()
    for df_key in df_keys:
        res_dict[df_key] = _flatten(res_dict[df_key], groups, group_by,
                                    columns)
    return res_dict
Example #22
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")

    groups = dict()
    uniq_factor = table[factor_col].unique()
    for name in uniq_factor:
        groups[name] = np.array(
            table[response_col])[np.where(table[factor_col] == name)]
    for name1, name2 in itertools.combinations(uniq_factor, 2):
        stats, pval = mannwhitneyu(groups[name1],
                                   groups[name2],
                                   use_continuity=use_continuity)
        rb.addMD(
            strip_margin("""
        | ## {name1} vs {name2}
        |
        | ### Statistics U value: {stats}
        |
        | ### P value: {pval}
        """.format(name1=name1, name2=name2, stats=stats, pval=pval)))

        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Example #23
def _wilcoxon_test2(table,
                    first_col,
                    second_col,
                    zero_method='wilcox',
                    correction=False):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    alter_hypothesis = []
    stats = []
    pvals = []

    stat, pval = wilcoxon(x=table[first_col],
                          y=table[second_col],
                          zero_method=zero_method,
                          correction=correction)
    alter_hypothesis.append('Median of the differences != 0')
    stats.append(stat)
    pvals.append(pval)

    result_table = pd.DataFrame({
        'Alternative hypothesis': alter_hypothesis,
        'Sum of differences ranks': stats,
        'P-value': pvals
    })

    rb.addMD(
        strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(result_table))))
    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Example #24
def _outlier_detection_lof(table, input_cols, n_neighbors=20, result_type='add_prediction', new_column_name='is_outlier'):
    out_table = table.copy()
    features = out_table[input_cols]
    lof_model = LocalOutlierFactor(n_neighbors, algorithm='auto', leaf_size=30, metric='minkowski', p=2, novelty=True, contamination=0.1)
    lof_model.fit(features)
    
    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [isinlier(lof_predict) for lof_predict in lof_model.predict(features)]
    
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    else:
        raise_runtime_error("Please check 'result_type'.")        
    
    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))
    
    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()
    
    return {'out_table': out_table, 'model': model}
Example #25
def _pls_regression_train(table, feature_cols, label_cols, n_components=2, scale=True, max_iter=500, tol=1e-6):
    pls_model = PLS(n_components=n_components, scale=scale, max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)
    pls_model.fit(features, labels)
    predict = pls_model.predict(features)
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)
    result_table = pd.DataFrame.from_items([
        ['Metric', ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score']],
        ['Score', [_mean_absolute_error, _mean_squared_error, _r2_score]]
    ])
    label_name = {
        'n_components': 'Number of components',
        'scale': "Scale",
        'max_iter': 'Max iteration',
        'tol': 'Tolerance'
    }
    get_param = pls_model.get_params()
    param_table = pd.DataFrame.from_items([
        ['Parameter', list(label_name.values())],
        ['Value', [get_param[x] for x in list(label_name.keys())]]
    ])
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### PLS Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table)
               )))
    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
Example #26
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    data = table[input_col]
    
    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()
    
    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()
    
    acf_ret = acf(data, nlags=nlags, alpha=1-conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1-conf_level)
    
    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]
    
    if conf_level is not None:
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = [str((acf_ret[1][i][0], acf_ret[1][i][1]))  for i in range(nlags + 1)]
    
    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]
    
    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = [str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in range(nlags + 1)]
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf, result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1), image2=fig_plt_pacf, result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()
        
    return {'model':model}
Example #27
def _hierarchical_clustering_post(table,
                                  model,
                                  num_clusters,
                                  cluster_col='prediction'):
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']

    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
Example #28
def _plot_roc_pr_curve(table,
                       label_col,
                       probability_col,
                       fig_w=6.4,
                       fig_h=4.8,
                       pos_label=None):
    label = table[label_col]
    probability = table[probability_col]

    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = \
        _plot_binary(label, probability, fig_size=(fig_w, fig_h), pos_label=pos_label)

    summary = dict()
    summary['threshold'] = threshold
    summary['label_col'] = label_col
    summary['probability_col'] = probability_col

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Plot ROC Curve and PR Curve Result
    |
    | ### ROC Curve
    | {fig_tpr_fpr}
    | {fig_roc}
    |
    | ### PR Curve
    | {fig_precision_recall}
    | {fig_pr}
    |
    | ### Confusion Matrix
    | {fig_confusion}
    """.format(fig_roc=fig_roc,
               fig_tpr_fpr=fig_tpr_fpr,
               fig_pr=fig_pr,
               fig_precision_recall=fig_precision_recall,
               fig_confusion=fig_confusion)))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
Example #29
def _pairplot(table,
              x_vars,
              y_vars=None,
              kind='scatter',
              diag_kind='auto',
              markers=None,
              palette=None,
              height=2.5,
              aspect=1,
              dropna=True,
              hue=None):

    validate(greater_than(height, 0, 'height'),
             greater_than(aspect, 0, 'aspect'))

    s_default = plt.rcParams['lines.markersize']**2.
    plot_kws = {"s": s_default * height / 6.4}

    if y_vars is None:
        y_vars = x_vars

    if kind == 'scatter':
        g = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers, height=height, aspect=aspect, \
                         dropna=dropna, hue=hue, palette=palette, plot_kws=plot_kws)
    else:
        scatter_kws = {'scatter_kws': plot_kws}
        g = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers, height=height, aspect=aspect, \
                         dropna=dropna, hue=hue, palette=palette, plot_kws=scatter_kws)

    if height <= 2.5:
        for ax in g.axes.flatten():
            for label in ax.get_xticklabels():
                label.set_rotation(90 * (2.5 - height))

    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()

    return {'result': {'_repr_brtc_': rb.get()}}
Example #30
def _term_term_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.array([1 for _ in range(len(csr_matrix.data))])
    term_term = (csr_matrix.T @ csr_matrix).tocoo()

    if result_type == 'sparse':
        term_term = sparse.triu(term_term, k=1)
        out_table = pd.DataFrame([dictionary[i] for i in term_term.row],
                                 columns=['term1'])
        out_table['term2'] = [dictionary[j] for j in term_term.col]
        out_table['number_of_documents_containing_terms'] = term_term.data

    elif result_type == 'dense':
        if model['add_words'] is None:
            model['add_words'] = []
        num_origin = len(dictionary) - len(model['add_words'])
        terms = [term for term in dictionary.token2id.keys()][:num_origin]
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(term_term.todense())
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append(" ", terms)

    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('term_term_mtx')
    model['term_term_mtx'] = term_term
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}