def correlation_analysis(dataset):
    """Rank the features of a dataset by correlation with its last column.

    The table named ``dataset`` is read through the module-level ``db``
    engine with ``ID`` as index.  The last column is used as the target and
    all other columns as features.  Features whose variance is below ``1e-5``
    are discarded, the rest are ranked by the absolute value of their Pearson
    correlation with the target, and the nine strongest are drawn as scatter
    plots on a 3x3 grid.

    Args:
        dataset: Name of the SQL table to analyse.

    Returns:
        A ``(columns, div)`` tuple: the retained feature names ordered from
        the largest to the smallest absolute correlation, and whatever
        ``utils.plot_to_div`` returns for the scatter-plot grid.
    """
    # read
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    X = tb.iloc[:, :-1]
    y = tb.iloc[:, -1]

    # compute correlation
    # Drop (near-)constant columns by reassignment: X is a slice of tb, so an
    # in-place drop would mutate a view/copy of the original table.
    X = X.drop(X.columns[X.var() < 1e-5], axis=1)
    # Positional access via .iloc; the old .ix indexer was removed from
    # pandas and silently switched to label lookup on integer column names.
    r = np.array([pearsonr(X.iloc[:, i], y) for i in range(X.shape[1])])
    # Keep the (n_features, 2) shape even when no feature survives the filter.
    r = r.reshape(-1, 2)
    rank = np.abs(r[:, 0]).argsort()[::-1]

    # plot top ones
    N = 9
    top = rank[:N]
    target_values = y.values.tolist()
    names = []
    traces = []
    for column, position in zip(X.columns[top], top):
        names.append('{}<br>(r={:0.2g} p={:0.2g})'.format(
            column, r[position, 0], r[position, 1]))
        traces.append(go.Scatter(x=X[column].values.tolist(),
                                 y=target_values,
                                 mode='markers',
                                 showlegend=False))

    fig = tools.make_subplots(rows=3, cols=3, subplot_titles=names,
                              vertical_spacing=0.1, horizontal_spacing=0.1)
    # Fill the grid row by row: trace i goes to row i // 3, column i % 3.
    for i, trace in enumerate(traces):
        fig.append_trace(trace, i // 3 + 1, i % 3 + 1)
    fig['layout'].update(height=700, width=1100)
    fig['layout'].update(margin=go.Margin(l=50, r=50, b=50, t=50, pad=0))
    # The subplot titles are stored as layout annotations.
    for annotation in fig.layout.annotations:
        annotation['font'].update(size=14)

    return (X.columns[rank], utils.plot_to_div(fig))
def rlasso_scores_plot(names, scores):
    """Draw the given scores as a bar chart with one bar per name.

    Args:
        names: Labels placed along the x axis.
        scores: Bar heights, in the same order as ``names``.

    Returns:
        The value returned by ``utils.plot_to_div`` for the figure.
    """
    bar_margin = go.Margin(l=50, r=80, b=70, t=10, pad=0)
    figure = go.Figure(
        data=[go.Bar(x=names, y=scores, opacity=0.7)],
        layout=go.Layout(margin=bar_margin,
                         height=200,
                         yaxis={'title': 'Score'}),
    )
    return utils.plot_to_div(figure)
def plot_correlation_matrix(X):
    """Draw the pairwise correlation matrix of ``X`` as a heatmap.

    Args:
        X: Object exposing ``corr()``; the result must provide ``columns``,
            ``index`` and ``values`` (presumably a pandas DataFrame).

    Returns:
        The value returned by ``utils.plot_to_div`` for the heatmap.
    """
    corr_matrix = X.corr()
    # Rows and their labels are both reversed so they stay aligned;
    # presumably this puts the first variable at the top of the heatmap.
    row_labels = corr_matrix.index[::-1]
    cell_values = corr_matrix.values[::-1]
    heatmap = go.Heatmap(x=corr_matrix.columns, y=row_labels, z=cell_values)

    figure = go.Figure(
        data=[heatmap],
        layout=go.Layout(margin=go.Margin(l=200, r=0, b=120, t=0, pad=0),
                         width=680,
                         height=500),
    )
    return utils.plot_to_div(figure)
def plot_time_series(runs, cols):
    """Plot the selected columns of several runs as stacked line subplots.

    Each run is a table read through the module-level ``db`` engine with
    ``Time`` as index.  One subplot is created per entry of ``cols``; every
    run that contains the column contributes one line to that subplot.

    Args:
        runs: Table names, one per run.
        cols: Column names to plot; also used as the subplot titles.

    Returns:
        The value returned by ``utils.plot_to_div`` for the figure.
    """
    # Collect the line traces per column across all runs.
    lines_by_column = {column: [] for column in cols}
    for run in runs:
        table = pd.read_sql_table(run, db, index_col="Time")
        for column in cols:
            if column not in table.columns:
                continue  # this run did not record the column
            series = table[column].dropna()
            line = go.Scatter(x=series.index,
                              y=series.values,
                              mode="lines",
                              name=run,
                              showlegend=False)
            lines_by_column[column].append(line)

    # One subplot row per requested column, in the order given by ``cols``.
    figure = tools.make_subplots(rows=len(cols), cols=1, subplot_titles=cols)
    for column, lines in lines_by_column.items():
        for line in lines:
            figure.append_trace(line, cols.index(column) + 1, 1)

    figure["layout"].update(height=100 + 300 * len(cols))
    # The subplot titles are stored as layout annotations.
    for annotation in figure.layout.annotations:
        annotation["font"].update(size=12)
    return utils.plot_to_div(figure)
def plot_pca(X, y):
    """Project ``X`` onto three principal components and plot them in 3-D.

    ``X`` is standardised with its own ``mean()`` and ``std()`` before the
    PCA is fitted.  Each sample becomes one marker whose diameter encodes the
    corresponding value of ``y`` (min-max scaled to the range 10..30) and
    whose hover text is that value formatted with two decimals.

    Args:
        X: Feature table (presumably a pandas DataFrame, so that the
            standardisation is column-wise -- confirm against callers).
        y: Target values; must expose ``.values`` with one entry per row
            of ``X``.

    Returns:
        A ``(var_percent, div)`` tuple: the share of variance covered by the
        three components formatted like ``"87.12%"``, and the value returned
        by ``utils.plot_to_div`` for the figure.
    """
    # normalization
    X = (X - X.mean()) / X.std()
    z = np.array(y.values.tolist())
    span = z.max() - z.min()
    if span == 0:
        # Constant target: 0/0 would turn every marker size into NaN, so use
        # the middle of the size range instead.
        scaled = np.full(z.shape, 0.5)
    else:
        scaled = (z - z.min()) / span
    z = (scaled + 0.5) * 20

    # pca
    pca = PCA(n_components=3)
    proj = pca.fit_transform(X)

    # plot
    # A real list is required here: on Python 3 ``map`` yields a one-shot
    # iterator, which is not a sequence of hover labels.
    hover_text = ['{:0.2f}'.format(v) for v in y.values]
    trace = go.Scatter3d(x=proj[:, 0], y=proj[:, 1], z=proj[:, 2],
                         text=hover_text,
                         mode='markers',
                         marker=dict(sizemode='diameter',
                                     size=z,
                                     opacity=0.4))
    layout = go.Layout(margin=go.Margin(l=200, r=0, b=0, t=0, pad=0),
                       height=550, width=700)
    fig = dict(data=[trace], layout=layout)

    # compute variance covered
    var_percent = "{:0.2f}%".format(100 * sum(pca.explained_variance_ratio_))
    return (var_percent, utils.plot_to_div(fig))
def scatter_plot(dataset, varx, vary):
    """Scatter two columns of a dataset, sizing markers by the last column.

    The table named ``dataset`` is read through the module-level ``db``
    engine with ``ID`` as index.  Marker diameters encode the table's last
    column, min-max scaled to the range 20..60.

    Args:
        dataset: Name of the SQL table to read.
        varx: Column plotted on the x axis (also used as the axis title).
        vary: Column plotted on the y axis (also used as the axis title).

    Returns:
        The value returned by ``utils.plot_to_div`` for the figure.
    """
    # read data
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    x = tb[varx].values.tolist()
    y = tb[vary].values.tolist()
    z = np.array(tb[tb.columns[-1]].values.tolist())
    span = z.max() - z.min()
    if span == 0:
        # Constant last column: 0/0 would turn every marker size into NaN,
        # so use the middle of the size range instead.
        scaled = np.full(z.shape, 0.5)
    else:
        scaled = (z - z.min()) / span
    z = (scaled + 0.5) * 40

    # plot
    trace = go.Scatter(x=x, y=y, mode='markers',
                       marker=dict(color='rgb(93, 164, 214)',
                                   opacity=0.35,
                                   size=z,
                                   sizemode='diameter'))
    layout = go.Layout(margin=go.Margin(l=100, r=100, b=50, t=30, pad=0),
                       width=550, height=500,
                       xaxis=dict(title=varx),
                       yaxis=dict(title=vary))
    fig = go.Figure(data=[trace], layout=layout)
    return utils.plot_to_div(fig)
# fetch data db = create_engine('sqlite:///data/processed.db') metadata = pd.read_sql_table('metadata', db, index_col = 'index') counts = pd.read_sql_table('counts', db, index_col = 'Date') # read metadata as dictionary metadata = metadata.to_dict()['0'] # # fill in the days when there were nothing # # does not look nice. # counts.index = counts.index.map(lambda x: datetime.datetime.strptime(x,'%y/%m/%d')) # rng = pd.date_range(counts.index[0], counts.index[-1]) # counts = counts.reindex(rng).fillna(0) # yy/mm/dd -> mm/dd. it is just too long to be axis label. counts.index = counts.index.map(lambda x: x[3:]) # create the timeline plot from utils import plot_to_div import plotly.graph_objs as go traces = [go.Bar(x = counts.index, y = counts[label].values, name = label, opacity = 0.7) for label in counts] layout= go.Layout(barmode='stack', margin = go.Margin(l = 50, r = 50, b = 50, t = 0, pad = 0), height = 300, yaxis = dict(title = '# of runs')) fig = go.Figure(data = traces, layout = layout) timeline_plot = plot_to_div(fig)
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import utils
import plotly.graph_objs as go

# Availability section: loads the 'cfr' table and renders, for every column,
# the percentage of rows that hold a non-null value.

# main database
db = create_engine('sqlite:///data/processed.db')
tb = pd.read_sql_table('cfr', db, index_col = 'ID')

# parameter count: number of columns in the table (the 'ID' index excluded)
num_params = tb.shape[1]

# plot availability
# Per column: 100 * (1 - null entries / total rows), i.e. the share of rows
# with a value present, in percent.
availability = (1.0 - tb.isnull().sum() / tb.shape[0]) * 100
trace = go.Bar(x = tb.columns, y = availability, marker = dict(color = 'rgb(158,202,225)', line = dict(color = 'rgb(8,48,107)', width = 1.5)), opacity = 0.6)
layout= go.Layout(margin = go.Margin(l = 50, r = 30, b = 120, t = 0, pad = 0), height = 300, xaxis = dict(tickfont = dict(size = 12)), yaxis = dict(title = 'availability (%)'))
fig = go.Figure(data = [trace], layout = layout)
# Result of utils.plot_to_div for the availability bar chart.
avail_plot = utils.plot_to_div(fig)