示例#1
0
文件: encoder.py 项目: shovsj/studio
 def default(self, o):
     # TODO add more support types
     if isinstance(o, set):
         return {'__set__': list(o)}
     elif isinstance(o, numpy.ndarray):
         return {'__numpy__': _to_default_list(o)}
     elif hasattr(o, '_repr_html_'):
         rb = BrtcReprBuilder()
         rb.addHTML(o._repr_html_())
         return {
             '_repr_brtc_': rb.get(),
             '__pickled__': list(pickle.dumps(o))
         }
     elif hasattr(o, 'savefig'):
         rb = BrtcReprBuilder()
         rb.addPlt(o)
         return {
             '_repr_brtc_': rb.get(),
             '__pickled__': list(pickle.dumps(o))
         }
     else:
         rb = BrtcReprBuilder()
         rb.addRawTextMD(str(o))
         return {
             '_repr_brtc_': rb.get(),
             '__pickled__': list(pickle.dumps(o))
         }
示例#2
0
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    
    rb = BrtcReprBuilder()
    
    profile = pd_profiling.ProfileReport(table, bins=bins, check_correlation=check_correlation, correlation_threshold=correlation_threshold, correlation_overrides=correlation_overrides)
    rb.addHTML(profile.html)
    summary = dict()
    summary['_repr_brtc_'] = rb.get()
    
    return {'result': summary}
示例#3
0
def _tukeys_range_test(table, response_cols, factor_col, alpha=0.05):

    rb = BrtcReprBuilder()
    rb.addMD("""## Tukey's range test Result""")

    for response_col in response_cols:
        data = table[response_col]
        posthoc = pairwise_tukeyhsd(data, table[factor_col], alpha=alpha)
        posthoc_html = posthoc._results_table.as_html()
        posthoc.plot_simultaneous()

        rb.addMD("""### {response_col}""".format(response_col=response_col))
        rb.addHTML(posthoc_html)
        rb.addPlt(plt)
        plt.clf()

    return {'result': {'_repr_brtc_': rb.get()}}
示例#4
0
def tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    if alpha < 0.001 or alpha >= 0.9:
        raise BrighticsFunctionException("0006", ['alpha', 0.001, 0.9])

    rb = BrtcReprBuilder()
    rb.addMD("""## Tukey's range test Result""")

    for response_col in response_cols:
        data = table[response_col]
        posthoc = pairwise_tukeyhsd(data, table[factor_col], alpha=alpha)
        posthoc_html = posthoc._results_table.as_html()
        posthoc.plot_simultaneous()

        rb.addMD("""### {response_col}""".format(response_col=response_col))
        rb.addHTML(posthoc_html)
        rb.addPlt(plt)
        plt.clf()

    return {'result': {'_repr_brtc_': rb.get()}}
示例#5
0
def _profile_table(table,
                   bins=10,
                   check_correlation=False,
                   correlation_threshold=0.9,
                   correlation_overrides=None):

    validate(greater_than_or_equal_to(bins, 1, 'bins'),
             greater_than(correlation_threshold, 0.0, 'correlation_threshold'))

    rb = BrtcReprBuilder()

    profile = pd_profiling.ProfileReport(
        table,
        bins=bins,
        check_correlation=check_correlation,
        correlation_threshold=correlation_threshold,
        correlation_overrides=correlation_overrides)
    rb.addHTML(profile.html)
    summary = dict()
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
示例#6
0
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [
        doc_topic[i].argmax() + 1 for i in range(len(corpus))
    ]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
示例#7
0
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True,
                             is_vif=False,
                             vif_threshold=10):
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if fit_intercept == True:
        features = sm.add_constant(features, has_constant='add')
        lr_model_fit = sm.OLS(label, features).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()

    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]

    if type(features) != type(table):
        features = pd.DataFrame(features)

    if is_vif:
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
    summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
示例#8
0
文件: glm.py 项目: ymorzart/studio
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]

    if label_col in feature_cols:
        raise_runtime_error("%s is duplicated." % label_col)

    if family == "Gaussian": 
        sm_family = sm.families.Gaussian()
    elif family == "inv_Gaussian":
        sm_family = sm.families.InverseGaussian()
    elif family == "binomial":
        sm_family = sm.families.Binomial()
    elif family == "Poisson":
        sm_family = sm.families.Poisson()
    elif family == "neg_binomial":
        sm_family = sm.families.NegativeBinomial()
    elif family == "gamma":
        sm_family = sm.families.Gamma()
    elif family == "Tweedie":
        sm_family = sm.families.Tweedie()

    if link == "ident":
        sm_link = sm.families.links.identity
    elif link == "log":
        sm_link = sm.families.links.log
    elif link == "logit":
        sm_link = sm.families.links.logit
    elif link == "probit":
        sm_link = sm.families.links.probit
    elif link == "cloglog":
        sm_link = sm.families.links.cLogLog
    elif link == "pow":
        sm_link = sm.families.links.Power
    elif link == "nbinom":
        sm_link = sm.families.links.binom

    if fit_intercept == True:
        glm_model = sm.GLM(label, sm.add_constant(features), family=sm_family, link=sm_link).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family, link=sm_link).fit()
    summary = glm_model.summary().as_html()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
示例#9
0
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t, topn=num_topic_word) for id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
示例#10
0
文件: gsdmm.py 项目: yemode2k/studio
def _gsdmm(table,
           input_col,
           topic_name='topic',
           K=10,
           alpha=0.1,
           beta=0.1,
           max_iter=50,
           num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K,
                                        alpha=alpha,
                                        beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {
        old_ind: (new_ind + 1)
        for new_ind, old_ind in enumerate(nonempty_topic_indices)
    }
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains the topic column name. Please choose another name."
        }])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[
            topic_words_dict.get(word, 0) for word in vocab_set
        ] for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {
                word: dict_1.get(word, 0) + dict_2.get(word, 0)
                for word in set(dict_1).union(dict_2)
            }, topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'K': K,
        'Alpha': alpha,
        'Beta': beta,
        'Maximum number of iterations': max_iter,
        'Number of words for each topic': num_topic_words
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(
        strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
示例#11
0
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True,
                             is_vif=True,
                             vif_threshold=10):
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LinearRegression(fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    if fit_intercept == True:
        features = sm.add_constant(features)
        lr_model_fit = sm.OLS(label, features).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    if is_vif:
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
    summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    y = np.array(label)
    a = x.size
    b = np.sum(x)
    c = b
    d = 0
    for i in x:
        d += +i * i
    e = np.sum(y)
    f = 0
    for i in range(0, x.size - 1):
        f += x[i] * y[i]
    det = a * d - b * c
    aa = (d * e - b * f) / det
    bb = (a * f - c * e) / det
    p1x = np.min(x)
    p1y = aa + bb * p1x
    p2x = np.max(x)
    p2y = aa + bb * p2x
    plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2

    return {'model': model}
示例#12
0
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LinearRegression(fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    if fit_intercept == True:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    summary = lr_model_fit.summary().as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    y = np.array(label)
    a = x.size
    b = np.sum(x)
    c = b
    d = 0
    for i in x: d += +i * i
    e = np.sum(y)
    f = 0
    for i in range(0, x.size - 1): f += x[i] * y[i]
    det = a * d - b * c
    aa = (d * e - b * f) / det
    bb = (a * f - c * e) / det
    p1x = np.min(x)
    p1y = aa + bb * p1x
    p2x = np.max(x)
    p2y = aa + bb * p2x
    plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3
               )))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}