def _oneway_anova(table, response_cols, factor_col):
    """Run a one-way ANOVA of each response column against the groups in factor_col.

    For every column in `response_cols` this fits an OLS model
    `response ~ C(factor_col)`, records the ANOVA p-value, and appends a
    boxplot, the ANOVA table, and residual diagnostics (distplot, Q-Q plot)
    to a markdown report.

    Returns:
        {'result': result} where result['_grouped_data'][col]['p_value'] holds
        each p-value and result['report'] holds the markdown report.
    """
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    # Combined width of all group labels drives how much the x tick labels
    # are rotated so long category names stay readable.
    sum_len = np.sum([ len(str(group)) for group in groups ])

    result = dict()
    result['_grouped_data'] = dict()
    for response_col in response_cols:
        data = table[response_col]  # NOTE(review): appears unused below — confirm it can be dropped.
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        # Q('...') quoting lets column names contain spaces/special characters
        # in the patsy formula.
        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(response_col=response_col,
                                                                            factor_col=factor_col), table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)
        anova_df = pandasDF2MD(anova)
        # First row of the PR(>F) column is the factor's p-value.
        p_value = anova["""PR(>F)"""][0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        |
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box,
                   anova_df=anova_df, distplot=distplot, qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['report'] = rb.get()
    return {'result': result}
def wordcloud(table, input_col, font_path='/fonts/NanumGothic.ttf', width=800, height=800, background_color="white"):
    """Render a word cloud from the token lists stored in table[input_col].

    Each row of `input_col` is expected to be an iterable of string tokens;
    all tokens are joined into one whitespace-separated corpus.

    Returns:
        {'model': model} where model['plt'] is the figure as markdown and
        model['report'] is the full markdown report.
    """
    # Flatten the per-row token lists into one string.  str.join replaces the
    # previous quadratic `texts += ' ' + token` accumulation.
    texts = ' '.join(token for tokens in table[input_col] for token in tokens)

    wordcloud = WordCloud(
        font_path=font_path,
        width=width,
        height=height,
        background_color=background_color)
    # generate_from_text mutates `wordcloud` in place; the old code bound the
    # return value to a misspelled, unused name (`wordclud`) — dropped.
    wordcloud.generate_from_text(texts)

    array = wordcloud.to_array()
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation="bilinear")
    plt.axis('off')
    fig_image = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Word Cloud Result
    | {fig}
    """.format(fig=fig_image)))

    model = _model_dict('wordcloud')
    model['plt'] = fig_image
    model['report'] = rb.get()
    return {'model': model}
def _screeplot(explained_variance, explained_variance_ratio, n_components, ax=None):
    """Draw a scree plot (explained variance per component) with a secondary
    axis showing the cumulative explained-variance ratio, annotate the value
    at `n_components`, and return the figure as markdown."""
    if ax is None:
        ax = plt.gca()

    component_ticks = range(1, len(explained_variance) + 1)
    cum_ratio = explained_variance_ratio.cumsum()
    plt.xticks(component_ticks, component_ticks)

    # Primary axis: raw explained variance per component.
    ax.plot(component_ticks, explained_variance, 'o--')
    ax.set_ylabel('Explained Variance')

    # Twin axis: cumulative ratio in [0, 1], annotated at the chosen cut-off.
    twin = ax.twinx()
    twin.plot(component_ticks, cum_ratio, 'x-')
    twin.set_ylim([0, 1.05])
    twin.set_ylabel('Cumulative Explained Variance Ratio')
    cutoff_value = cum_ratio[n_components - 1]
    twin.text(n_components, cutoff_value - 0.05, '%0.4f' % cutoff_value,
              va='center', ha='center')

    markdown_fig = plt2MD(plt)
    plt.clf()
    return markdown_fig
def _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2):
    """Scatter the 2-D PCA projection colored per cluster, overlay the
    projected cluster centers as 'x' markers, and return the figure as MD."""
    n_clusters = len(cluster_centers)
    palette = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)

    # One scatter call per cluster so each gets its own color.
    for cluster_id, cluster_color in enumerate(palette):
        mask = labels == cluster_id
        plt.scatter(pca2[:, 0][mask], pca2[:, 1][mask], color=cluster_color)

    # Project the centers into the same 2-D PCA space and mark them.
    projected_centers = pca2_model.transform(cluster_centers)
    plt.scatter(projected_centers[:, 0], projected_centers[:, 1],
                marker='x', edgecolors=1, s=200, color=palette)

    plt.tight_layout()
    markdown_fig = plt2MD(plt)
    plt.clf()
    return markdown_fig
def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3, affinity='euclidean', compute_full_tree=True,
                                           linkage='ward', prediction_col='prediction', figw=6.4, figh=4.8):
    """Fit sklearn AgglomerativeClustering on input_table[input_cols], write the
    cluster labels into `prediction_col`, and build a dendrogram report.

    Returns:
        {'out_table': ..., 'agglomerative_result': {...}} — the labeled table
        plus the fitted model, the input columns, and the markdown report.
    """
    inputarr = input_table[input_cols]
    agglomerative_clustering = SKAgglomerativeClustering(
        n_clusters=n_clusters,
        affinity=affinity,
        memory=None,
        connectivity=None,
        compute_full_tree=compute_full_tree,
        linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    # NOTE(review): this writes into the caller's DataFrame (no copy is taken)
    # — confirm the in-place mutation is intended.
    input_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    # sklearn's AgglomerativeClustering exposes no merge distances, so the
    # merge order is used as a surrogate "distance" purely to give scipy's
    # dendrogram a monotonically increasing column; heights are therefore not
    # real linkage distances.
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        'report': rb.get()
    }
    return {
        'out_table': input_table,
        'agglomerative_result': agglomerative_clustering_result
    }
def _kmeans_centers_plot(input_cols, cluster_centers):
    """Plot each cluster center as a line across the input columns (one line
    per cluster, legend keyed by cluster index) and return the figure as MD."""
    total_label_len = np.sum([len(col) for col in input_cols])
    positions = range(len(input_cols))

    # Rotate x tick labels progressively as their combined width grows.
    if total_label_len >= 512:
        plt.xticks(positions, input_cols, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(positions, input_cols)

    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_idx)

    plt.legend()
    plt.tight_layout()
    markdown_fig = plt2MD(plt)
    plt.clf()
    return markdown_fig
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers):
    """Plot n_samples random rows (thin grey lines) behind the cluster centers
    (bold labelled lines) across the input columns; return the figure as MD."""
    total_label_len = np.sum([len(col) for col in input_cols])
    sampled = table[input_cols].sample(n=n_samples)
    positions = range(len(input_cols))

    # Rotate x tick labels progressively as their combined width grows.
    if total_label_len >= 512:
        plt.xticks(positions, input_cols, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(positions, input_cols)

    # Background: the sampled observations (transpose once, index by row).
    transposed = sampled.transpose()
    for row_idx in sampled.index:
        plt.plot(positions, transposed[row_idx], color='grey', linewidth=1)

    # Foreground: the cluster centers.
    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_idx, linewidth=4)

    plt.tight_layout()
    markdown_fig = plt2MD(plt)
    plt.clf()
    return markdown_fig
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    """Train a linear regression on table[feature_cols] -> table[label_col].

    Fits both sklearn's LinearRegression (used for prediction) and a
    statsmodels OLS (used for the summary statistics), then builds a markdown
    report with a predicted-vs-actual scatter and residual diagnostics.

    Returns:
        {'model': model} — model dict with coefficients, fit statistics,
        the fitted sklearn model, summary tables, and the report.
    """
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LinearRegression(fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    # statsmodels fit gives the rich summary tables sklearn lacks; add a
    # constant column only when an intercept is requested.
    if fit_intercept:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    summary2 = summary_tables[2]
    html_result = summary.as_html()

    # Predicted vs actual scatter with a least-squares trend line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    y = np.array(label)
    # BUGFIX: the previous hand-rolled normal equations omitted the last
    # (x, y) pair from the cross-product sum (`for i in range(0, x.size - 1)`),
    # skewing the plotted trend line.  np.polyfit performs the same 1-degree
    # least-squares fit over all points.
    bb, aa = np.polyfit(x, y, 1)  # slope, intercept
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [aa + bb * p1x, aa + bb * p2x], 'r--')
    fig_actual_predict = plt2MD(plt)

    # Residuals vs predicted.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    # Normal Q-Q plot of residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    return {'model': model}
def _biplot(xidx, yidx, data, pc_columns, columns, singular_values, components, explained_variance_ratio, alpha=1, ax=None, hue=None, key_col=None):
    """Draw a PCA biplot of components xidx vs yidx and return it as markdown.

    Observations are plotted as points (or as `key_col` text labels, colored
    by `hue` when given); the original feature axes are drawn as red arrows.
    `alpha` distributes the singular values between scores and loadings
    (alpha=1 scales scores fully, alpha=0 scales loadings fully).
    """
    if ax is None:
        ax = plt.gca()
    # Component scores scaled by the singular values.
    xs = data[pc_columns[xidx]] * singular_values[xidx] ** alpha
    ys = data[pc_columns[yidx]] * singular_values[yidx] ** alpha
    if key_col is not None and hue is not None:
        # Text label per observation, one color per hue group.
        groups = data[hue].unique()
        k = len(data[hue].unique())
        colors = cm.viridis(np.arange(k).astype(float) / k)
        for j, color in zip(range(k), colors):
            group_data = data[data[hue] == groups[j]]
            for idx in group_data.index:
                ax.text(xs[idx], ys[idx], data[key_col][idx], color=color, va='center', ha='center')
        ax.legend([Patch(color=colors[i]) for i, _ in enumerate(groups)], groups.tolist())
    elif key_col is not None and hue is None:
        # Text label per observation, single color.
        for i in range(data.shape[0]):
            ax.text(xs[i], ys[i], data[key_col][i], color='black', va='center', ha='center')
    elif hue is not None:
        sns.scatterplot(xs, ys, hue=data[hue], data=data, ax=ax)
    else:
        sns.scatterplot(xs, ys, data=data, ax=ax)
    ax.set_xlabel('%s (%0.4f)' % (pc_columns[xidx], explained_variance_ratio[xidx]))
    ax.set_ylabel('%s (%0.4f)' % (pc_columns[yidx], explained_variance_ratio[yidx]))
    # Loadings scaled by the remaining (1 - alpha) share of the singular values.
    axs = components[xidx] * singular_values[xidx] ** (1 - alpha)
    ays = components[yidx] * singular_values[yidx] ** (1 - alpha)
    # Axis limits wide enough to include both scores and 1.5x-extended arrows.
    xmax = np.amax(np.concatenate((xs, axs * 1.5)))
    xmin = np.amin(np.concatenate((xs, axs * 1.5)))
    ymax = np.amax(np.concatenate((ys, ays * 1.5)))
    ymin = np.amin(np.concatenate((ys, ays * 1.5)))
    for i, col in enumerate(columns):
        x, y = axs[i], ays[i]
        ax.arrow(0, 0, x, y, color='r', width=0.001, head_width=0.05)
        ax.text(x * 1.3, y * 1.3, col, color='r', ha='center', va='center')
    # NOTE(review): these reassignments shadow the score series `xs`/`ys` and
    # the fetched limits are never used afterwards — likely dead code; confirm.
    ys, ye = ax.get_ylim()
    xs, xe = ax.get_xlim()
    m = 1.2
    ax.set_xlim(xmin * m, xmax * m)
    ax.set_ylim(ymin * m, ymax * m)
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()
    return plt_two
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto',
         tol=0.0, iterated_power='auto', random_state=None, hue=None, alpha=0, key_col=None):
    """Run PCA on table[input_cols], append the first n_components projections
    as new columns, and build a report (biplot/scatter, scree plot, tables).

    Returns:
        {'out_table': ..., 'model': ...} — the augmented table and a model
        dict holding components, variance statistics, and the report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Fit with n_components=None (all components) and slice to n_components
    # afterwards, so the scree plot can show the full spectrum.
    pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power, random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components], columns=[column_names])
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    # Flatten the column index (the DataFrame above was built with a nested
    # column list) and give the projections their final names.
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components], columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        # Degenerate case: plot the single projection against itself.
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance, res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance, columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto',
         tol=0.0, iterated_power='auto', random_state=None):
    """Run PCA on table[input_cols] and append the projections as new columns.

    NOTE(review): this redefines `_pca` and therefore shadows the richer
    version defined earlier in this file (the one with hue/alpha/key_col) —
    confirm which definition is meant to survive.

    Returns:
        {'out_table': ..., 'model': ...} — the augmented table and a model
        dict holding components, variance statistics, and the report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power, random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result, columns=[column_names])

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components, columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization: first two projections (or the single one against itself).
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    |
    | ### Plot
    | The x-axis and y-axis of the following plot is projected0 and projected1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    |
    | ### Mean
    | {array1}
    |
    | ### Explained Variance
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    # Flatten the nested column index and attach the projections to the input.
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
def _hierarchical_clustering(table, input_cols, link='complete', met='euclidean', p=2, num_rows=20, figure_height=6.4, orient='right'):
    """Hierarchical clustering via scipy linkage, with a dendrogram and a
    human-readable linkage matrix in the report.

    Returns:
        {'model': model} — model dict with the raw linkage matrix Z, the
        parameters, the readable linkage table, and the markdown report.
    """
    table = table.copy()
    df = table[input_cols]

    Z = linkage(df, method=link, metric=met)
    out_table = pd.DataFrame([])
    out_table['linkage_step'] = [x + 1 for x in reversed(range(len(Z)))]
    out_table['joined_column1'] = ['pt_' + str(int(Z[i, 0])) for i in range(len(Z))]
    out_table['joined_column2'] = ['pt_' + str(int(Z[i, 1])) for i in range(len(Z))]
    out_table['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range(len(Z)))]
    out_table['distance'] = [distance for distance in Z[:, 2]]
    out_table['number_of_original'] = [int(entities) for entities in Z[:, 3]]

    # Replace raw node ids with cluster names wherever a merge joined a
    # non-leaf node (id >= number of observations).
    # BUGFIX: cast the float node id to int before using it as a Series index
    # (float labels raise in modern pandas), and assign via .loc instead of
    # chained indexing (which can silently write to a temporary copy).
    n_leaves = len(df)
    for i in range(len(Z)):
        if Z[i, 0] >= n_leaves:
            out_table.loc[i, 'joined_column1'] = out_table['name_of_clusters'][int(Z[i, 0]) - n_leaves]
        if Z[i, 1] >= n_leaves:
            out_table.loc[i, 'joined_column2'] = out_table['name_of_clusters'][int(Z[i, 1]) - n_leaves]

    # Show the last merges first.
    out_table = out_table.reindex(index=out_table.index[::-1])[0:]
    out_table1 = out_table.head(num_rows)

    # calculate full dendrogram
    def _llf(id):
        # Label only the leaves (original points) in the dendrogram.
        n = len(df)
        if id < n:
            return 'pt_' + str(id)

    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # show only the last p merged clusters (if another)
        get_leaves=True,
        orientation=orient,
        labels=True,
        leaf_label_func=_llf,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # to get a distribution impression in truncated branches
        annotate_above=float(10),  # useful in small plots so annotations don't overlap
        # max_d=distance_threshold,  # will plot a horizontal cut-off line, max_d as in max_distance
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt2 = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    params = {
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(out_table1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_cols'] = input_cols
    model['parameters'] = params
    model['outtable'] = out_table
    model['report'] = rb.get()

    return {'model': model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Pairwise correlation of the columns in `vars` with a seaborn PairGrid
    report (distributions on the diagonal, regression plots below, annotated
    coefficients above).

    Note: `method='kendal'` (single "l") is this module's own keyword and maps
    to scipy's kendalltau.

    Returns:
        {'result': res} with the parameter dict, the correlation/p-value
        table, and the markdown report.
    """
    size = len(vars)
    s_default = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []
    # Lower-triangle pairs only (j < i); each entry is [x, y, corr, p_value].
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        # Annotation callback for the upper triangle of the PairGrid: prints
        # the coefficient (sized by |r|) plus significance stars.
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)
        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'
        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5, ], xycoords="axes fraction", ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes, color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        # Rank-based methods get a lowess fit rather than a linear one.
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)
    fig_corr = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## Correlation Results
    | ### Correlation Matrix
    | {fig_corr}
    |
    | ### Correlation Table
    | {table}
    """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['report'] = rb.get()

    return {'result': res}
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
                          silent=True, objectibe='reg:linear', booster='gbtree', n_jobs=1, nthread=None,
                          gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                          random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None,
                          eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None,
                          sample_weight_eval_set=None):
    """Train an XGBoost regressor on table[feature_cols] -> table[label_col]
    and build a report (importance plot, importance table, parameter table).

    NOTE(review): `objectibe` is a typo for `objective`, but it is part of the
    public keyword interface of this function; renaming it would break callers.

    Returns:
        {'model': out_model} — model dict with the fitted regressor, its
        parameters, feature importances, and the markdown report.
    """
    # Parameters are forwarded positionally in XGBRegressor's declared order.
    regressor = XGBRegressor(max_depth, learning_rate, n_estimators, silent, objectibe, booster,
                             n_jobs, nthread, gamma, min_child_weight, max_delta_step, subsample,
                             colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda,
                             scale_pos_weight, base_score, random_state, seed, missing)
    regressor.fit(table[feature_cols], table[label_col], sample_weight, eval_set, eval_metric,
                  early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report: flatten the parameter dict into a two-column table.
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        temp = [key, value]
        get_param_list.append(temp)

    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
def _evaluate_classification(table, label_col, prediction_col):
    """Compute classification metrics (weighted f1/precision/recall, accuracy)
    for table[prediction_col] against table[label_col] and build a report with
    raw and normalized confusion matrices.

    Returns:
        {'result': summary} — metric values, a metrics DataFrame, and the
        markdown report.
    """
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")

    # Union of labels seen in either column, so classes missing from one side
    # still appear in the confusion matrix.
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names, title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)

    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names, normalize=True, title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{'f1': f1, 'accuracy': accuracy, 'precision': precision, 'recall': recall}]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))

    summary['report'] = rb.get()

    return {'result': summary}
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a sklearn DecisionTreeClassifier and build a report containing a
    rendered tree graph (if graphviz is available), a feature-importance chart,
    and the model parameters.

    Returns:
        {'model': model} — model dict with classifier attributes, parameters,
        the fitted classifier, and the markdown report.
    """
    classifier = DecisionTreeClassifier(criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
                                        min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
                                        min_impurity_decrease, min_impurity_split, class_weight, presort)
    classifier.fit(table[feature_cols], table[label_col], sample_weight, check_input, X_idx_sorted)

    # Best-effort tree rendering: fall back to an instruction message when
    # graphviz/pydotplus are unavailable.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.report import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrow to Exception while keeping the fallback.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart of importances, least important at bottom.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    # Add tree plot
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
def _plot_binary(label, probability, threshold=None, fig_size=(6.4, 4.8), pos_label=None):
    """Build the standard binary-classification evaluation figures.

    Produces TPR/1-FPR vs threshold, the ROC curve, the precision/recall vs
    threshold plot, the PR curve, and a confusion matrix at the chosen
    threshold.  When `threshold` is None it is picked where TPR ~= 1-FPR.

    Returns:
        (threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr,
         fig_confusion) — the threshold used plus five markdown figures.
    """
    fpr, tpr, threshold_roc = roc_curve(label, probability, pos_label=pos_label)
    # tpf 1-fpr
    if threshold is None:
        # Pick the operating point where TPR and 1-FPR intersect.
        argmin = np.argmin(np.abs(tpr + fpr - 1))
        threshold = threshold_roc[argmin]
    # NOTE(review): if a threshold is supplied by the caller, `argmin` is
    # never bound and the uses below raise NameError — looks like a latent
    # bug; confirm intended call pattern.
    fpr_prop = fpr[argmin]
    tpr_prop = tpr[argmin]

    # TPR and 1-FPR as functions of the threshold.
    plt.plot(threshold_roc, tpr, color='blue', label='TPR')
    plt.plot(threshold_roc, 1 - fpr, color='red', label='1-FPR')
    plt.xlabel('Threshold')
    plt.ylabel('TPR or 1-FPR')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02, 0.5, 'threshold: %0.2f' % threshold, rotation=90, verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_tpr_fpr = plt2MD(plt)
    plt.clf()

    # roc
    auc_score = auc(fpr, tpr)
    plt.figure(figsize=fig_size)
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.plot(fpr_prop, tpr_prop, 'g*', markersize=10, color="red", label='threshold: %0.2f' % threshold)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    fig_roc = plt2MD(plt)
    plt.clf()

    # pr
    precision, recall, threshold_pr = precision_recall_curve(label, probability, pos_label=pos_label)
    precision_prop = precision[argmin]
    recall_prop = recall[argmin]
    # Older matplotlib versions do not accept `step` in fill_between.
    step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.plot(recall_prop, precision_prop, 'g*', markersize=10, color="red", label='threshold: %0.2f' % threshold)
    plt.title('Precision-Recall curve')  # TODO Average precision score
    plt.legend()
    fig_pr = plt2MD(plt)
    plt.clf()

    # precision_recall_curve returns one fewer threshold than points; pad to
    # align the arrays for plotting.
    threshold_pr = np.append(threshold_pr, 1)
    plt.plot(threshold_pr, precision, color='blue', label='Precision')
    plt.plot(threshold_pr, recall, color='red', label='Recall')
    plt.xlabel('Threshold')
    plt.ylabel('Precision or Recall')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02, 0.5, 'threshold: %0.2f' % threshold, rotation=90, verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_precision_recall = plt2MD(plt)
    plt.clf()

    # Binarize predictions at the threshold and plot the confusion matrix.
    classes = label.unique()
    neg_label = [cls for cls in classes if cls != pos_label][0]
    predict = probability.apply(lambda x: pos_label if x >= threshold else neg_label)
    _plot_confusion_matrix(label, predict, [pos_label, neg_label], normalize=False, title='Confusion matrix', cmap=plt.cm.Blues)
    fig_confusion = plt2MD(plt)
    plt.clf()

    return threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    """Train a multinomial naive Bayes classifier and build its report.

    Parameters
    ----------
    table : pd.DataFrame
        Training data.
    feature_cols : list of str
        Feature column names (MultinomialNB expects non-negative counts).
    label_col : str
        Label column name.
    alpha : float
        Additive (Laplace/Lidstone) smoothing parameter.
    fit_prior : bool
        Whether to learn class prior probabilities.
    class_prior : list of str, optional
        "label:prior" strings overriding the learned priors.

    Returns
    -------
    dict
        {'model': ...} with the fitted model, label encoder, metadata and
        a markdown report.
    """
    features = table[feature_cols]
    label = table[label_col]

    # Labels are integer-encoded for scikit-learn; the encoder is stored in
    # the model dict so prediction can map back to the original labels.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # class_prior arrives as "label:prior" strings; reorder the prior
        # values to match the encoder's integer class order.
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_

    # One row per class: label, log prior (pi), then per-feature log
    # likelihoods (theta_<feature>).
    tmp_result = np.hstack((
        list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
        (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)

    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)
    plt.figure()
    # NOTE(review): called here as (matrix, classes=...) but in _plot_binary
    # as (label, predict, classes, ...) — confirm the helper supports both
    # call shapes.
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_, title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    # BUGFIX: the report header previously read "Accuacy".
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()
    return {'model': model}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit k-means for each k in ``n_clusters_list``, pick the k with the
    best silhouette score, and return the winning model plus a report with
    per-k silhouette/PCA plots.

    Returns a dict with 'out_table' (input table plus ``prediction_col``)
    and 'model' (best k, centers, fitted model, report).
    """
    # Default the sample-plot size to the whole table.
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    # 2-component PCA is used only for 2-D visualization of points/centers.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        # Mean silhouette score drives the model selection below.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)
        pca2_centers = pca2_model.transform(centersk)

        # Side-by-side figure: ax1 = silhouette profile per cluster,
        # ax2 = points in PCA space colored by cluster.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()
            sizei = si.shape[0]
            # Stack each cluster's sorted silhouette values vertically.
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si, facecolor=color, edgecolor=color, alpha=0.7)
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)
        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    # Best k = highest mean silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_  # NOTE(review): unused — kept as-is
    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score vs k summary plot.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca, fig_centers=fig_centers, fig_samples=fig_samples)))
    # Append the per-k diagnostic figure for every candidate k.
    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
    | ### k = {k}
    | {image}
    |
    """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['report'] = rb.get()
    out_table = table.copy()
    out_table[prediction_col] = predict
    return {'out_table': out_table, 'model': model}
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, sample_weight_eval_set=None):
    """Train an XGBoost classifier and build its markdown report.

    Parameters mirror ``xgboost.XGBClassifier`` and its ``fit`` method;
    ``table``/``feature_cols``/``label_col`` select the training data.

    Returns
    -------
    dict
        {'model': ...} with the fitted classifier, feature importances,
        parameters and a markdown report.
    """
    # BUGFIX(robustness): hyperparameters were passed positionally, which
    # silently mis-binds them if XGBClassifier's signature changes between
    # xgboost versions; pass everything by keyword.
    classifier = XGBClassifier(
        max_depth=max_depth, learning_rate=learning_rate,
        n_estimators=n_estimators, silent=silent, objective=objective,
        booster=booster, n_jobs=n_jobs, nthread=nthread, gamma=gamma,
        min_child_weight=min_child_weight, max_delta_step=max_delta_step,
        subsample=subsample, colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha,
        reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
        base_score=base_score, random_state=random_state, seed=seed,
        missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose, xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_

    # Importance bar chart (xgboost's plot_importance draws on the current
    # pyplot figure, which plt2MD then captures).
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # One-row table: one importance value per feature column.
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['report'] = rb.get()
    return {'model': model}
def naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    """Train a multinomial naive Bayes classifier and build a short report.

    Parameters
    ----------
    table : pd.DataFrame
        Training data.
    feature_cols : list of str
        Feature column names (MultinomialNB expects non-negative counts).
    label_col : str
        Label column name.
    alpha : float
        Additive (Laplace/Lidstone) smoothing parameter.
    fit_prior : bool
        Whether to learn class prior probabilities.
    class_prior : list of str, optional
        "label:prior" strings overriding the learned priors.

    Returns
    -------
    dict
        {'model': ...} with the fitted model, label encoder, metadata and
        a markdown report.
    """
    features = table[feature_cols]
    label = table[label_col]

    # Integer-encode labels for scikit-learn; keep the encoder for decoding
    # predictions later.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # Reorder "label:prior" strings to the encoder's integer class order.
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)
    plt.figure()
    # BUGFIX(consistency): was `plot_confusion_matrix` (no underscore);
    # every sibling function in this file uses the `_plot_confusion_matrix`
    # helper, and no un-underscored variant is visible here.
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_, title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    # BUGFIX: the report header previously read "Accuacy".
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()
    return {'model': model}