Exemplo n.º 1
0
def correlation_analysis(dataset):
    """Rank a table's columns by Pearson correlation with its last column.

    Reads table ``dataset`` through the module-level ``db`` engine (indexed
    by ``'ID'``). Every column except the last is a candidate feature; the
    last column is the variable they are correlated against. Features whose
    variance is below 1e-5 are discarded, and the (up to) nine features with
    the largest ``|r|`` are scatter-plotted against the last column on a
    3x3 grid of subplots.

    Returns:
        A tuple ``(ranked_columns, plot)`` where ``ranked_columns`` holds the
        kept feature columns ordered by decreasing ``|r|`` and ``plot`` is
        the result of ``utils.plot_to_div`` for the figure.
    """
    # read
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    X = tb.iloc[:, :-1]
    y = tb.iloc[:, -1]
    # Drop near-constant features. A new frame is built instead of dropping
    # in place, because X is a slice of tb.
    X = X.drop(X.columns[X.var() < 1e-5], axis=1)
    # compute correlation: one (r, p-value) row per remaining feature.
    # Columns are addressed by position with .iloc (the old .ix indexer no
    # longer exists in current pandas); the reshape keeps the array
    # two-dimensional even when no feature is left.
    r = np.array([pearsonr(X.iloc[:, i], y) for i in range(X.shape[1])],
                 dtype=float).reshape(-1, 2)
    # NOTE(review): argsort places NaN last, so after the reversal any NaN
    # correlation would rank first -- confirm inputs cannot produce NaN.
    rank = np.abs(r[:, 0]).argsort()[::-1]
    # plot top ones
    N = 9
    top = rank[:N]
    traces = []
    names = []
    for (i, c) in enumerate(X.columns[top]):
        names.append('{}<br>(r={:0.2g} p={:0.2g})'.format(
            c, r[top[i], 0], r[top[i], 1]))
        traces.append(go.Scatter(x=X[c].values.tolist(),
                                 y=y.values.tolist(),
                                 mode='markers',
                                 showlegend=False))
    fig = tools.make_subplots(rows=3, cols=3,
                              subplot_titles=names,
                              vertical_spacing=0.1,
                              horizontal_spacing=0.1)
    # fill the grid row by row
    for (i, p) in enumerate(traces):
        fig.append_trace(p, i // 3 + 1, i % 3 + 1)
    fig['layout'].update(height=700, width=1100)
    fig['layout'].update(margin=go.Margin(l=50, r=50, b=50,
                                          t=50, pad=0))
    # the annotations updated here are the subplot titles
    for a in fig.layout.annotations:
        a['font'].update(size=14)
    return (X.columns[rank], utils.plot_to_div(fig))
Exemplo n.º 2
0
def rlasso_scores_plot(names, scores):
    """Build a bar chart of ``scores`` with ``names`` on the x axis.

    The figure is 200 high, its y axis is titled 'Score', and the bars are
    drawn at 0.7 opacity.

    Returns:
        The result of ``utils.plot_to_div`` for the figure.
    """
    figure = go.Figure(
        data=[go.Bar(x=names, y=scores, opacity=0.7)],
        layout=go.Layout(
            margin=go.Margin(l=50, r=80, b=70, t=10, pad=0),
            height=200,
            yaxis={'title': 'Score'},
        ),
    )
    return utils.plot_to_div(figure)
Exemplo n.º 3
0
def plot_correlation_matrix(X):
    """Draw the pairwise correlation matrix ``X.corr()`` as a heatmap.

    Returns:
        The result of ``utils.plot_to_div`` for the 680x500 figure.
    """
    corr = X.corr()
    # Row labels and row values are reversed together, so each row keeps its
    # own label; only the order along the y axis is flipped.
    heatmap = go.Heatmap(
        x=corr.columns,
        y=corr.index[::-1],
        z=corr.values[::-1],
    )
    # wide left / bottom margins; presumably room for long tick labels
    frame = go.Layout(
        margin=go.Margin(l=200, r=0, b=120, t=0, pad=0),
        width=680,
        height=500,
    )
    return utils.plot_to_div(go.Figure(data=[heatmap], layout=frame))
Exemplo n.º 4
0
def plot_time_series(runs, cols):
    """Plot the requested columns of several runs as stacked line charts.

    Each entry of ``runs`` is read as a table through the module-level ``db``
    engine (indexed by ``"Time"``). Every column in ``cols`` gets its own
    subplot row, and each run that has that column contributes one line
    (missing values dropped) named after the run.

    Returns:
        The result of ``utils.plot_to_div`` for the figure.
    """
    # one list of line traces per requested column
    lines_by_column = {name: [] for name in cols}
    for run in runs:
        table = pd.read_sql_table(run, db, index_col="Time")
        for name in cols:
            if name not in table.columns:
                # this run did not record the column
                continue
            series = table[name].dropna()
            line = go.Scatter(
                x=series.index,
                y=series.values,
                mode="lines",
                name=run,
                showlegend=False,
            )
            lines_by_column[name].append(line)

    # one subplot row per column, in the order given by cols
    fig = tools.make_subplots(rows=len(cols), cols=1, subplot_titles=cols)
    for name, lines in lines_by_column.items():
        row = cols.index(name) + 1
        for line in lines:
            fig.append_trace(line, row, 1)

    fig["layout"].update(height=100 + 300 * len(cols))
    # the annotations updated here are the subplot titles
    for title in fig.layout.annotations:
        title["font"].update(size=12)
    return utils.plot_to_div(fig)
Exemplo n.º 5
0
def plot_pca(X, y):
    """Project ``X`` onto three principal components and plot them in 3D.

    ``X`` is standardized column-wise (mean subtracted, divided by the
    standard deviation) before a 3-component PCA is fitted. Each point's
    marker diameter grows linearly with its ``y`` value, from 10 at the
    smallest ``y`` to 30 at the largest, and its text label is ``y``
    formatted with two decimals.

    Args:
        X: feature table; assumed to be a pandas DataFrame -- TODO confirm.
        y: values with a ``.values`` attribute, one per row of ``X``.

    Returns:
        A tuple ``(var_percent, plot)``: the summed explained-variance ratio
        of the three components as a percentage string such as ``"87.31%"``,
        and the result of ``utils.plot_to_div`` for the figure.
    """
    # normalization
    X = (X - X.mean()) / X.std()
    z = np.array(y.values.tolist())
    span = z.max() - z.min()
    if span == 0:
        # All y values are equal: the min-max scaling would divide by zero
        # and give NaN sizes, so use the mid-range diameter for every point.
        z = np.full(z.shape, 20.0)
    else:
        z = ((z - z.min()) / span + 0.5) * 20
    # pca
    pca = PCA(n_components=3)
    proj = pca.fit_transform(X)
    # plot
    # The labels are built as a real list: on Python 3, map() returns a lazy
    # iterator instead of a list of strings.
    labels = ['{:0.2f}'.format(v) for v in y.values]
    trace = go.Scatter3d(x=proj[:, 0], y=proj[:, 1], z=proj[:, 2],
                         text=labels,
                         mode='markers',
                         marker=dict(sizemode='diameter', size=z, opacity=0.4))
    layout = go.Layout(margin=go.Margin(l=200, r=0, b=0, t=0, pad=0),
                       height=550, width=700)
    fig = dict(data=[trace], layout=layout)
    # compute variance covered
    var_percent = "{:0.2f}%".format(100 * sum(pca.explained_variance_ratio_))
    return (var_percent, utils.plot_to_div(fig))
Exemplo n.º 6
0
def scatter_plot(dataset, varx, vary):
    """Scatter-plot two columns of a table, sizing markers by the last column.

    Reads table ``dataset`` through the module-level ``db`` engine (indexed
    by ``'ID'``) and plots column ``vary`` against column ``varx``. Marker
    diameters grow linearly with the table's last column, from 20 at its
    smallest value to 60 at its largest.

    Args:
        dataset: name of the table to read.
        varx: column shown on the x axis (also used as the axis title).
        vary: column shown on the y axis (also used as the axis title).

    Returns:
        The result of ``utils.plot_to_div`` for the 550x500 figure.
    """
    # read data
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    x = tb[varx].values.tolist()
    y = tb[vary].values.tolist()
    z = np.array(tb[tb.columns[-1]].values.tolist())
    span = z.max() - z.min()
    if span == 0:
        # Constant last column: the min-max scaling would divide by zero and
        # give NaN sizes, so use the mid-range diameter for every point.
        z = np.full(z.shape, 40.0)
    else:
        z = ((z - z.min()) / span + 0.5) * 40
    # plot
    trace = go.Scatter(x=x, y=y, mode='markers',
                       marker=dict(color='rgb(93, 164, 214)',
                                   opacity=0.35,
                                   size=z,
                                   sizemode='diameter'))
    layout = go.Layout(margin=go.Margin(l=100, r=100, b=50,
                                        t=30, pad=0),
                       width=550, height=500,
                       xaxis=dict(title=varx),
                       yaxis=dict(title=vary))
    fig = go.Figure(data=[trace], layout=layout)
    return utils.plot_to_div(fig)
Exemplo n.º 7
0
# Load the processed run data: the 'metadata' table is reduced to a plain
# dictionary taken from its '0' column, and 'counts' is kept as a table
# indexed by 'Date'.
db = create_engine('sqlite:///data/processed.db')
metadata = pd.read_sql_table('metadata', db, index_col='index').to_dict()['0']
counts = pd.read_sql_table('counts', db, index_col='Date')

# Filling in the days on which nothing happened was tried, but the result
# does not look nice, so it stays disabled:
# counts.index = counts.index.map(lambda x: datetime.datetime.strptime(x,'%y/%m/%d'))
# rng = pd.date_range(counts.index[0], counts.index[-1])
# counts = counts.reindex(rng).fillna(0)

# Shorten the labels from yy/mm/dd to mm/dd; the full date is too long for an
# axis label.
counts.index = counts.index.map(lambda date: date[3:])

# Timeline plot: one stacked bar series per column of counts.
from utils import plot_to_div
import plotly.graph_objs as go
traces = [
    go.Bar(x=counts.index, y=counts[label].values, name=label, opacity=0.7)
    for label in counts.columns
]
layout = go.Layout(
    barmode='stack',
    margin=go.Margin(l=50, r=50, b=50, t=0, pad=0),
    height=300,
    yaxis={'title': '# of runs'},
)
fig = go.Figure(data=traces, layout=layout)
timeline_plot = plot_to_div(fig)

Exemplo n.º 8
0
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import utils
import plotly.graph_objs as go

# Load the 'cfr' table (indexed by 'ID') from the main database.
db = create_engine('sqlite:///data/processed.db')
tb = pd.read_sql_table('cfr', db, index_col='ID')

# Number of parameters = number of columns in the table.
num_params = len(tb.columns)

# Availability of a column is the percentage of its rows that are not null.
availability = 100 * (1.0 - tb.isnull().sum() / len(tb))
trace = go.Bar(
    x=tb.columns,
    y=availability,
    marker={
        'color': 'rgb(158,202,225)',
        'line': {'color': 'rgb(8,48,107)', 'width': 1.5},
    },
    opacity=0.6,
)
layout = go.Layout(
    margin=go.Margin(l=50, r=30, b=120, t=0, pad=0),
    height=300,
    xaxis={'tickfont': {'size': 12}},
    yaxis={'title': 'availability (%)'},
)
fig = go.Figure(data=[trace], layout=layout)
avail_plot = utils.plot_to_div(fig)