Example #1
def read_from_db(datasource, sql):
    if sql is None:
        raise_runtime_error('sql is required parameter')

    with DbEngine(**datasource) as engine:
        df = pd.read_sql_query(sql, engine)
        util.validate_column_name(df)
        return {'table': df}
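A minimal usage sketch. The connection keys accepted by DbEngine are not shown here, so the datasource dict below is hypothetical:

# hypothetical datasource; the real keys depend on DbEngine's constructor
datasource = {'url': 'postgresql://user:password@localhost:5432/sales'}
result = read_from_db(datasource, 'SELECT * FROM customers')
df = result['table']  # a pandas DataFrame validated by util.validate_column_name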
Example #2
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
    features = table[feature_cols]
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_runtime_error('Label Column should not be continuous.')
    
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
        if is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
        else:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
            
    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)
        
        if is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
        else:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
    
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary)
               )))

    model = dict()
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
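A usage sketch with a toy table; the label must be categorical, since a continuous label raises the runtime error above:

import pandas as pd

table = pd.DataFrame({'x1': [0.1, 0.4, 0.8, 0.9],
                      'x2': [1.0, 0.7, 0.2, 0.1],
                      'y': ['a', 'a', 'b', 'b']})
out = _logistic_regression_train(table, feature_cols=['x1', 'x2'], label_col='y')
out['model']['coefficients']  # the fitted coefficient matrix (lr_model.coef_)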
Example #3
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None):
    validate(greater_than(c, 0.0, 'c'))

    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('Label Column should not be continuous.')

    _svc = svm.SVC(C=c,
                   kernel=kernel,
                   degree=degree,
                   gamma=gamma,
                   coef0=coef0,
                   shrinking=shrinking,
                   probability=probability,
                   tol=tol,
                   max_iter=max_iter,
                   random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
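A usage sketch reusing the toy table from Example #2; the fitted estimator can be pulled back out of the returned dict for prediction:

out = _svm_classification_train(table, feature_cols=['x1', 'x2'], label_col='y',
                                c=10.0, kernel='linear')
svc = out['model']['svc_model']   # the fitted sklearn svm.SVC
svc.predict(table[['x1', 'x2']])  # predicted class labels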
Example #4
def _kmeans_predict(table, model, prediction_col='prediction'):
    if model['_context'] == 'python' and model['_type'] in ('kmeans', 'kmeans_silhouette'):
        # 'kmeans' keeps the fitted estimator under 'model';
        # 'kmeans_silhouette' keeps the best of the candidate fits under 'best_model'
        k_means = model['model'] if model['_type'] == 'kmeans' else model['best_model']
        input_cols = model['input_cols']
        predict = k_means.predict(table[input_cols])
        out_table = table.copy()
        out_table[prediction_col] = predict
    else:
        raise_runtime_error("Unsupported model")
    
    return {'out_table': out_table}
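The model dict's expected shape is inferred from the keys the function reads; a sketch with a directly fitted sklearn KMeans standing in for the output of a train step:

from sklearn.cluster import KMeans

input_cols = ['x1', 'x2']
fitted = KMeans(n_clusters=2, random_state=0).fit(table[input_cols])
model = {'_context': 'python', '_type': 'kmeans',
         'model': fitted, 'input_cols': input_cols}
result = _kmeans_predict(table, model, prediction_col='cluster')
result['out_table']  # original columns plus a 'cluster' column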
Example #5
def _one_hot_encoder(table,
                     input_cols,
                     prefix='list',
                     prefix_list=None,
                     suffix='index',
                     n_values='auto',
                     categorical_features='all',
                     sparse=True,
                     handle_unknown='error'):
    out_table = table.copy()
    sparse = False  # output must be dense to build the indicator DataFrame below
    enc_list = []
    le_list = []
    prefix_list_index = 0
    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )
    for col_name in input_cols:
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse,
                            handle_unknown=handle_unknown)
        le = LabelEncoder()
        new_col_names = []
        if suffix == 'index':
            if prefix == 'list':
                for i in range(0, len(np.unique(out_table[col_name].values))):
                    new_col_names.append(prefix_list[prefix_list_index] + '_' +
                                         str(i))
            else:
                for i in range(0, len(np.unique(out_table[col_name].values))):
                    new_col_names.append(col_name + '_' + str(i))
        else:
            if prefix == 'list':
                for stri in np.unique(out_table[col_name].values):
                    new_col_names.append(prefix_list[prefix_list_index] + '_' +
                                         stri)
            else:
                for stri in np.unique(out_table[col_name].values):
                    new_col_names.append(col_name + '_' + stri)
        out_table = pd.concat([
            out_table.reset_index(drop=True),
            pd.DataFrame(enc.fit_transform(
                le.fit_transform(out_table[col_name]).reshape(-1, 1)),
                         columns=new_col_names)
        ],
                              axis=1)
        enc_list.append(enc)
        le_list.append(le)
        prefix_list_index = prefix_list_index + 1

    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    # note: these scalar entries reflect only the last input column processed
    out_model['classes'] = le.classes_
    out_model['active_features'] = enc.active_features_
    out_model['feature_indices'] = enc.feature_indices_
    out_model['n_values'] = enc.n_values_
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix

    return {'out_table': out_table, 'model': out_model}
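A usage sketch; note the function targets the pre-0.22 sklearn OneHotEncoder API (n_values, categorical_features, active_features_), so an older sklearn is assumed:

df = pd.DataFrame({'color': ['red', 'blue', 'red', 'green']})
out = _one_hot_encoder(df, input_cols=['color'], prefix='list', prefix_list=['col'])
out['out_table']  # original column plus indicator columns col_0, col_1, col_2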
Example #6
def write_to_db(table, tableName, datasource, ifExists='fail'):
    if not isinstance(table, pd.DataFrame):
        raise_runtime_error('table is not pandas.DataFrame')

    with DbEngine(**datasource) as engine:
        table.to_sql(tableName, engine, if_exists=ifExists, index=False)
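A round-trip sketch, reusing the hypothetical datasource from Example #1:

# ifExists maps to pandas.DataFrame.to_sql's if_exists: 'fail', 'replace' or 'append'
write_to_db(df, 'customers_copy', datasource, ifExists='replace')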
Example #7
def _glm_train(table,
               feature_cols,
               label_col,
               family="Gaussian",
               link="ident",
               fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]

    if label_col in feature_cols:
        raise_runtime_error("%s is duplicated." % label_col)

    # the link is carried by the family; sm.GLM takes no separate 'link' argument
    if link == "ident":
        sm_link = sm.families.links.identity
    elif link == "log":
        sm_link = sm.families.links.log
    elif link == "logit":
        sm_link = sm.families.links.logit
    elif link == "probit":
        sm_link = sm.families.links.probit
    elif link == "cloglog":
        sm_link = sm.families.links.cloglog
    elif link == "pow":
        sm_link = sm.families.links.Power
    elif link == "nbinom":
        sm_link = sm.families.links.nbinom
    else:
        raise_runtime_error("Please check 'link'.")

    if family == "Gaussian":
        sm_family = sm.families.Gaussian(link=sm_link)
    elif family == "inv_Gaussian":
        sm_family = sm.families.InverseGaussian(link=sm_link)
    elif family == "binomial":
        sm_family = sm.families.Binomial(link=sm_link)
    elif family == "Poisson":
        sm_family = sm.families.Poisson(link=sm_link)
    elif family == "neg_binomial":
        sm_family = sm.families.NegativeBinomial(link=sm_link)
    elif family == "gamma":
        sm_family = sm.families.Gamma(link=sm_link)
    elif family == "Tweedie":
        sm_family = sm.families.Tweedie(link=sm_link)
    else:
        raise_runtime_error("Please check 'family'.")

    if fit_intercept:
        glm_model = sm.GLM(label, sm.add_constant(features),
                           family=sm_family).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family).fit()
    summary = glm_model.summary().as_html()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
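A usage sketch with a small numeric table, since the Gaussian family expects a continuous label:

glm_table = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0],
                          'x2': [0.5, 0.1, 0.9, 0.3],
                          'y': [2.1, 4.2, 5.9, 8.1]})
out = _glm_train(glm_table, feature_cols=['x1', 'x2'], label_col='y',
                 family='Gaussian', link='ident')
out['model']['coefficients']  # statsmodels estimates for const, x1, x2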
Example #8
def _tfidf(table,
           input_col,
           max_df=None,
           min_df=1,
           num_voca=1000,
           idf_weighting_scheme='inverseDocumentFrequency',
           norm='l2',
           smooth_idf=True,
           sublinear_tf=False,
           output_type=False):
    corpus = table[input_col]
    if max_df is None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english',
                                    max_df=max_df,
                                    min_df=min_df,
                                    max_features=num_voca)
    tf_vectorizer.fit(corpus)

    voca_dict = tf_vectorizer.vocabulary_

    tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                       max_df=max_df,
                                       min_df=min_df,
                                       max_features=num_voca,
                                       norm=norm,
                                       use_idf=True,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)
    tfidf_vectorizer.fit(corpus)

    tf_feature_names = tf_vectorizer.get_feature_names()
    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = tf_feature_names
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf weight'] = float(1)

    tfidf_table = pd.DataFrame()
    for doc in range(len(corpus)):
        each_tfidf_table = pd.DataFrame()
        each_tfidf_table[input_col] = [
            str(corpus[doc]) for j in range(len(voca_dict.keys()))
        ]
        each_tfidf_table['vocabulary'] = voca_dict.keys()
        each_tfidf_table['index'] = voca_dict.values()
        each_tfidf_table['frequency'] = [
            np.ravel(tf_vectorizer.transform([corpus[doc]]).toarray())[idx]
            for idx in voca_dict.values()
        ]
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            each_tfidf_table['tfidf score'] = [
                np.ravel(tfidf_vectorizer.transform([corpus[doc]
                                                     ]).toarray())[idx]
                for idx in voca_dict.values()
            ]
        elif idf_weighting_scheme == 'unary':
            each_tfidf_table['tfidf score'] = [
                np.ravel(tfidf_vectorizer.transform([corpus[doc]
                                                     ]).toarray())[idx] /
                float(tfidf_vectorizer.idf_[idx])
                for idx in voca_dict.values()
            ]
        each_tfidf_table = each_tfidf_table.sort_values(by=['index'], axis=0)
        tfidf_table = pd.concat([tfidf_table, each_tfidf_table], axis=0)

    if output_type is False:
        pass
    elif output_type is True:
        remain_idx = tfidf_table['frequency'].apply(lambda x: x != 0)
        tfidf_table = tfidf_table[remain_idx.values]
    else:
        raise_runtime_error("Please check 'output_type'.")

    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params),
               idf_table=pandasDF2MD(idf_table,
                                     num_rows=len(tf_feature_names) + 1),
               tfidf_table=pandasDF2MD(
                   tfidf_table,
                   num_rows=len(tf_feature_names) * len(corpus) + 1))))

    model = _model_dict('tfidf')
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
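A usage sketch; an older sklearn is assumed (get_feature_names was removed in 1.2), and CountVectorizer's built-in English stop words are dropped from the vocabulary:

docs = pd.DataFrame({'text': ['the cat sat on the mat',
                              'the dog sat',
                              'cats chase dogs']})
out = _tfidf(docs, input_col='text', num_voca=50)
out['model']['idf_table']    # one IDF weight per vocabulary term
out['model']['tfidf_table']  # per-document term frequencies and TF-IDF scores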
Example #9
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3, max_iter=20, learning_method='online', learning_offset=10., random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter, learning_method=learning_method, learning_offset=learning_offset, random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter, learning_method=learning_method, random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    
    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = ["Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))]
    
    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iteration': max_iter,
        'Learning Method': learning_method,
    }
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params), topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1), doc_classification=pandasDF2MD(doc_classification, num_rows=len(corpus) + 1))))
    
    model = _model_dict('lda')
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
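A usage sketch; the internal CountVectorizer uses min_df=2, so every vocabulary term must appear in at least two documents:

lda_docs = pd.DataFrame({'text': ['the cat sat on the mat',
                                  'the cat ran home',
                                  'the dog ran fast',
                                  'the dog sat down']})
out = _lda(lda_docs, input_col='text', num_topic=2, num_topic_word=2, random_state=0)
out['model']['topic_model']               # top weighted vocabularies per topic
out['model']['documents_classification']  # most probable topic per document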
Example #10
def _hierarchical_clustering(table, input_cols, input_mode='original', key_col=None, link='complete', met='euclidean', num_rows=20, figure_height=6.4, orient='right'):
    out_table = table.copy()
    features = out_table[input_cols]
    
    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        if key_col is not None:
            data_names = [out_table[key_col][out_table.columns.get_loc(column)]
                          for column in input_cols]
        else:
            data_names = list(input_cols)
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        dist_matrix = features.iloc[col_index]
        
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [x + 1 for x in reversed(range_len_Z)]
    linkage_matrix['name of clusters'] = ['CL_' + str(i + 1) for i in reversed(range_len_Z)]
    joined_column1 = []
    for i in range_len_Z:
        if Z[i, 0] < len_features:
            joined_column1.append(data_names[int(Z[i, 0])])
        else:
            # Z references merged clusters by len_features + merge-step index
            joined_column1.append(linkage_matrix['name of clusters'][int(Z[i, 0]) - len_features])
    linkage_matrix['joined column1'] = joined_column1
    joined_column2 = []
    for i in range_len_Z:
        if Z[i, 1] < len_features:
            joined_column2.append(data_names[int(Z[i, 1])])
        else:
            joined_column2.append(linkage_matrix['name of clusters'][int(Z[i, 1]) - len_features])
    linkage_matrix['joined column2'] = joined_column2
    
    linkage_matrix['distance'] = Z[:, 2]
    linkage_matrix['number of original'] = [int(entities) for entities in Z[:, 3]]
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])
    
    # calculate full dendrogram
 
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(
        Z,
        truncate_mode='none',
        get_leaves=True,
        orientation=orient,
        labels=data_names,
        leaf_rotation=45,
        leaf_font_size=10.,
        show_contracted=False
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()
    
    params = { 
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(linkage_matrix.head(num_rows)))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()
        
    return {'model': model}
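A usage sketch in the default 'original' mode, where each row is a data point and the condensed distance matrix is computed internally with ssd.pdist:

pts = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.1],
                    'y': [0.0, 0.2, 5.0, 5.2]})
out = _hierarchical_clustering(pts, input_cols=['x', 'y'],
                               link='complete', met='euclidean')
out['model']['linkage_matrix']  # merge steps with distances and cluster sizes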
Example #11
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge', alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000, tol=0.0001, random_state=None):
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None, tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")
    
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_cols]
    # reuse the single fit above instead of refitting for each attribute
    out_table1['coefficient'] = regression_model.coef_
    intercept = pd.DataFrame([['intercept', regression_model.intercept_]], columns=['x_variable_name', 'coefficient'])
    if fit_intercept:
        out_table1 = out_table1.append(intercept, ignore_index=True)
        
    predict = regression_model.predict(features)
    residual = label - predict
    
    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    
    score = {
        'MSE' : mean_squared_error(label, predict),
        'R2' : r2_score(label, predict)
    }

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()
    
    # checking the magnitude of coefficients
    
    plt.figure()
    predictors = features.columns
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1), out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1), score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients
               )))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
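A usage sketch reusing the toy regression table from Example #7; the coefficient table gains an intercept row because fit_intercept defaults to True:

out = _penalized_linear_regression_train(glm_table, feature_cols=['x1', 'x2'],
                                         label_col='y',
                                         regression_type='lasso', alpha=0.1)
out['model']['model_parameters']     # x_variable_name / coefficient table
out['model']['prediction_residual']  # input rows plus 'predict' and 'residual'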
Example #12
def pivot(table, values, aggfunc, index=None, columns=None):  # TODO

    if index is None and columns is None:
        # TODO: assign an error code.
        raise_runtime_error('Group key value is required: Index or Columns.')

    def count(x):
        return len(x)

    def mean(x):
        return np.mean(x)

    def std(x):
        return np.std(x)

    def var(x):
        return np.var(x)

    def min(x):
        return np.min(x)

    def _25th(x):
        return np.percentile(x, 25)

    def median(x):
        return np.median(x)

    def _75th(x):
        return np.percentile(x, 75)

    def max(x):
        return np.max(x)

    def sum(x):
        return np.sum(x)

    def _mi2index(mi):
        return pd.Index([_replace_col(col) for col in mi.get_values()])

    def _replace_col(tup):
        col = '__'.join(str(elem) for elem in tup)

        for char in ' ,;{}()\n\t=':
            col = col.replace(char, '')  # str.replace returns a new string; keep the result

        return col

    func_list = []
    for func_name in aggfunc:
        if func_name == 'count':
            func_list.append(count)
        elif func_name == 'mean':
            func_list.append(mean)
        elif func_name == 'std':
            func_list.append(std)
        elif func_name == 'var':
            func_list.append(var)
        elif func_name == 'min':
            func_list.append(min)
        elif func_name == '_25th':
            func_list.append(_25th)
        elif func_name == 'median':
            func_list.append(median)
        elif func_name == '_75th':
            func_list.append(_75th)
        elif func_name == 'max':
            func_list.append(max)
        elif func_name == 'sum':
            func_list.append(sum)
        else:
            # TODO: assign an error code.
            raise_runtime_error("Please check 'aggfunc'.")

    pivoted = pd.pivot_table(table,
                             values=values,
                             index=index,
                             columns=columns,
                             aggfunc=func_list,
                             fill_value=None,
                             margins=False,
                             margins_name='All')
    pivoted.columns = _mi2index(pivoted.columns)
    out_table = pd.concat([pivoted.index.to_frame(), pivoted], axis=1)
    return {'out_table': out_table}
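A usage sketch; with lists for values and aggfunc the pivoted columns form a MultiIndex, which _replace_col flattens into names joined by '__':

sales = pd.DataFrame({'region': ['east', 'east', 'west', 'west'],
                      'product': ['a', 'b', 'a', 'b'],
                      'amount': [10, 20, 30, 40]})
out = pivot(sales, values=['amount'], aggfunc=['sum', 'mean'],
            index=['region'], columns=['product'])
out['out_table']  # 'region' plus columns like sum__amount__a, mean__amount__b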