def compute_graph_lasso_covariance(subject_id,
                                   group,
                                   session='func1',
                                   preprocessing_folder='pipeline_1',
                                   plot=True,
                                   save=True,
                                   save_file=True,
                                   msdl=False):
    """Returns graph lasso covariance for a subject_id
    """
    # load timeseries
    if msdl:
        ts = load_dynacomp_msdl_timeseries(
            subject_id,
            session=session,
            preprocessing_folder=preprocessing_folder)
        roi_names, roi_coords = load_msdl_names_and_coords()
    else:
        ts = load_dynacomp_roi_timeseries(
            subject_id,
            session=session,
            preprocessing_folder=preprocessing_folder)
        # load rois
        roi_names, roi_coords = load_roi_names_and_coords(subject_id)

    # compute covariance
    gl = covariance.GraphLassoCV(verbose=2)
    gl.fit(ts)
    if plot:
        plot_connectivity_matrix(subject_id, group, gl.covariance_, roi_names,
                                 'gl_covariance', session,
                                 preprocessing_folder, save, msdl)
        plot_connectivity_matrix(subject_id, group, gl.precision_, roi_names,
                                 'gl_precision', session, preprocessing_folder,
                                 save, msdl)
        sparsity = (gl.precision_ == 0)
        plot_connectivity_matrix(subject_id, group, sparsity, roi_names,
                                 'gl_sparsity', session, preprocessing_folder,
                                 save, msdl)

        plot_connectivity_glassbrain(subject_id, group, gl.covariance_,
                                     roi_coords, 'gl_covariance', session,
                                     preprocessing_folder, save, msdl)

    if save_file:
        CONN_DIR = set_data_base_dir('Dynacomp/connectivity')
        sparsity = (gl.precision_ == 0)
        if not os.path.isdir(os.path.join(CONN_DIR, subject_id)):
            os.mkdir(os.path.join(CONN_DIR, subject_id))
        output_file = os.path.join(
            CONN_DIR, subject_id, 'gl_' + session + '_' + preprocessing_folder)
        if msdl:
            output_file += '_msdl'
        np.savez(output_file,
                 covariance=gl.covariance_,
                 precision=gl.precision_,
                 sparsity=sparsity,
                 roi_names=roi_names,
                 roi_coords=roi_coords)
    return gl, roi_names, roi_coords
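
# Hypothetical usage sketch (an addition, not part of the original source):
# the subject id and group label below are placeholders for illustration only.
if __name__ == '__main__':
    gl, roi_names, roi_coords = compute_graph_lasso_covariance(
        'S01', 'control', session='func1',
        preprocessing_folder='pipeline_1', plot=False, save_file=False)
    print(gl.alpha_)  # regularization strength chosen by cross-validation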
Example No. 2
def affinity_propagation(df_wtarget, df_wotarget, variables):
    '''
    Takes a dataframe that includes our target variable, a dataframe without
    the target variable, and the list of columns to keep in the output. Runs
    an affinity propagation analysis on the learned covariance structure and
    writes the resulting cluster labels to an Excel file.
    '''

    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = df_wotarget.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    # #############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    df_wtarget['score'] = labels + 1
    scores = df_wtarget.filter(items=variables)

    writer = pd.ExcelWriter('results/affinity_propagation.xlsx')
    scores.to_excel(writer)
    writer.save()
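
# Hypothetical usage sketch (an addition, not in the original source): assumes
# each row of the frames is one entity and each column one observation, to
# match the transpose inside affinity_propagation; also assumes a writable
# results/ directory and an installed Excel writer engine.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
obs = pd.DataFrame(rng.randn(10, 50))      # 10 entities, 50 observations each
with_target = obs.copy()
with_target['target'] = rng.randn(10)
affinity_propagation(with_target, obs, ['target', 'score'])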
def shrinkage():
    # cov_train_pyr, cov_train and data_train are assumed to be defined by the
    # surrounding script
    import copy

    plt.imshow(cov_train_pyr[0])
    plt.colorbar()
    plt.show()
    # copy instead of aliasing: with plain assignment all four names would
    # point at the same object, and each write below would clobber the others
    cov_train_lasso = copy.deepcopy(cov_train)
    prec_train_lasso = copy.deepcopy(cov_train)
    cov_train_oas = copy.deepcopy(cov_train)
    corr_lasso = copy.deepcopy(cov_train)
    for i in range(len(data_train)):
        cov_train_oas[i] = covariance.OAS().fit(data_train[i]).covariance_
        # Ledoit-Wolf estimate for each training segment
        cov_train[i] = covariance.LedoitWolf().fit(data_train[i]).covariance_
        # plt.imshow(cov_train[i])
        # plt.colorbar()
        # plt.show()
        GLassCV = covariance.GraphLassoCV(cv=5)
        GLassCV.fit(data_train[i])  # fit once and reuse both attributes
        cov_train_lasso[i] = GLassCV.covariance_
        prec_train_lasso[i] = GLassCV.precision_
        corr_lasso[i] = cov2corr(prec_train_lasso[i])
        print('sum of correlations: ', np.sum(np.abs(corr_lasso[i]), axis=1))
        myalphas = GLassCV.cv_alphas_
        print(myalphas)
        print(np.mean(GLassCV.grid_scores_, axis=1))

        plt.imshow(corr_lasso[i])
        plt.colorbar()
        plt.show()
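
# cov2corr() is called above but never defined in this snippet. A minimal
# sketch of what it presumably does (an assumption, not the original helper):
# rescale a covariance or precision matrix to have a unit diagonal.
import numpy as np

def cov2corr(cov):
    d = np.sqrt(np.diag(cov))
    return cov / np.outer(d, d)   # corr_ij = cov_ij / (d_i * d_j)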
Example No. 4
def find_top_genes_from_pcor_network(log2_df_cell, num_iterations,
                                     gene_index_list):
    alpha = 0.4
    keep_genes = []
    for iter_genes in range(0, num_iterations):
        top_pca_matrix, top_pca_genes = return_top_pca_gene(
            log2_df_cell,
            range_genes=[
                gene_index_list[max(0, iter_genes - 1)],
                gene_index_list[iter_genes + 1]
            ])

        gl = covariance.GraphLassoCV()
        gene_data = scale(top_pca_matrix.as_matrix())

        gl.fit(gene_data)
        _, labels = cluster.affinity_propagation(gl.covariance_)

        n_labels = labels.max()
        names = np.array(top_pca_genes)
        prec_sp = gl.precision_
        matrix1 = -prec_sp + np.diag(np.diagonal(prec_sp))
        D = nx.Graph(matrix1)

        gene_weights_dict = {}
        for n in names:
            gene_weights_dict[n] = 0
        for x, y in D.edges():
            gene1 = names[x]
            gene2 = names[y]
            abs_weight = abs(D[x][y]['weight'])
            gene_weights_dict[gene1] += abs_weight
            gene_weights_dict[gene2] += abs_weight

        clust_gene_list = []
        avg_weight_list = []
        for i in range(n_labels + 1):
            w_list = [gene_weights_dict[g] for g in names[labels == i]]
            clust_gene_list.append([names[labels == i]])
            avg_weight_list.append(sum(w_list) / len(w_list))
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
        if iter_genes == 0:
            threshold = np.mean(avg_weight_list) - np.std(avg_weight_list)

        for g_list in [
                clust_gene_list[i] for i, av in enumerate(avg_weight_list)
                if av >= threshold
        ]:
            keep_genes.append(np.ravel(g_list))

    final_gene_list = list(
        set([item for sublist in keep_genes for item in sublist]))
    print(final_gene_list)
    gene_matrix_log2 = log2_df_cell.T
    top_matrix = gene_matrix_log2[final_gene_list]
    top_matrix_cell = top_matrix.T
    return top_matrix, top_matrix_cell, final_gene_list
Example No. 5
    def get_sparse_cov(self):
        # calculate the sparse (graph lasso) covariance
        flat = self.flatten_cube_spectral(self.Z, normalise=True)
        model = skcv.GraphLassoCV()
        model.fit(flat)
        # note: despite the attribute name, this stores the covariance
        # matrix, not a correlation matrix
        self.corr_mat = model.covariance_
Example No. 6
 def learnGraphStructure(self):
     self.edge_model = covariance.GraphLassoCV()
     # standardize the time series: using correlations rather than covariance
     # is more efficient for structure recovery
     #        data = self.getReturns(self.prices.as_frame())
     fx = FXClas.fxData()
     self._X = fx.getCurrencyBasketFromDB(currencies=None,
                                          periodicity='monthly',
                                          fxRisk=None)
     #        self.X = data['returns']
     self._X /= self._X.std(axis=0)
     self.names = self._X.columns
     print(self)
     print(self._X)
     # fillna(..., inplace=True) returns None, so it must not be passed to fit()
     self._X = self._X.fillna(self._X.mean())
     self.edge_model.fit(self._X)
Example No. 7
def make_graphlasso(top_matrix, name_list):
    gl = covariance.GraphLassoCV()
    gene_data = scale(top_matrix.as_matrix())

    gl.fit(gene_data)

    names = np.array(name_list)

    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=7)

    embedding = node_position_model.fit_transform(gene_data.T).T

    return gl, embedding, names
Example No. 8
def discover_clusters(var):
    from sklearn import cluster, covariance
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    edge_model.fit(var)

    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in xrange(n_labels + 1):
        print 'Cluster %i: %s' % (i, \
            ', '.join(var.columns[labels == i]))
    del cluster, covariance

    return labels, edge_model.precision_.copy()
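
# A minimal self-contained sketch of the pattern these examples share, on
# synthetic data (an addition, not from any of the original sources). Note
# that GraphLassoCV is the pre-0.20 scikit-learn name; it was later renamed
# GraphicalLassoCV.
import numpy as np
from sklearn import cluster, covariance

rng = np.random.RandomState(0)
latent = rng.randn(200, 3)                    # 3 hidden factors
X = latent.dot(rng.randn(3, 9)) + 0.5 * rng.randn(200, 9)
X /= X.std(axis=0)                            # standardize, as above

edge_model = covariance.GraphLassoCV()
edge_model.fit(X)
_, labels = cluster.affinity_propagation(edge_model.covariance_)
for i in range(labels.max() + 1):
    print('Cluster %i: columns %s' % (i + 1, np.where(labels == i)[0]))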
Example No. 9
def cluster_industry(industry_return_df, start_date, end_date):
    industry_return_df = industry_return_df[industry_return_df.index.map(
        lambda x: x >= start_date and x <= end_date)]
    # graphical-lasso + affinity-propagation alternative, kept for reference:
    #     edge_model = covariance.GraphLassoCV()
    #     edge_model.fit(industry_return_df.values)
    #     centers, labels = cluster.affinity_propagation(edge_model.covariance_)
    kmeans_clf = cluster.KMeans(40)
    labels = kmeans_clf.fit_predict(industry_return_df.T.values)
    cluster_res_df = pd.DataFrame({
        'industry': industry_return_df.columns,
        'label': labels
    })
    for label, tmp_df in cluster_res_df.groupby('label'):
        industry_names = " ".join(tmp_df['industry'].values)
        print("label %s: %s" % (label, industry_names))
Example No. 10
def correlation_between_states_2(crude, states, threshold):

    # We retrieve the correlation coefficient for every keyword, across every
    # year we explored.
    diff = pd.DataFrame(crude.set_index('Year').groupby('LocationAbbr')\
                        .apply(lambda x: x['Data_Value'].diff()))

    d = diff.stack().unstack(0)
    if crude.shape[0] > 500:
        d.index = d.index.droplevel(level=1)

    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    d = d.loc[1998:]
    corr = d.values
    X = corr.copy()
    X /= X.std(axis=0)
    edge_model.fit(X)

    # Transform it in a links data frame (3 columns only):
    corr = pd.DataFrame(edge_model.covariance_, index=states, columns=states)
    links = corr.stack().reset_index()
    links.columns = ['var1', 'var2', 'value']

    # Keep only correlation over a threshold and remove self correlation (cor(A,A)=1)
    links_filtered = links.loc[(links['value'] > threshold)
                               & (links['var1'] != links['var2'])]

    # Build your graph
    G = nx.from_pandas_dataframe(links_filtered, 'var1', 'var2')
    #pivoting to print data
    links = links.pivot(index='var1', columns='var2')['value']

    # computing the graph partition
    L = nx.laplacian_matrix(G).toarray()
    eig, vec = np.linalg.eig(L)
    sort = np.argsort(np.diff(np.abs(eig)))
    i = 1
    v = vec[:, sort[-i]].real / sum(vec[:, sort[-i]].real)
    label = v.astype(int)

    return (links, G, label)
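
# The partition step above picks an eigenvector by the largest gap in the
# eigenvalue differences. A more conventional spectral bipartition (a sketch
# under that assumption, not the original author's method) uses the sign of
# the Fiedler vector, the eigenvector of the second-smallest Laplacian
# eigenvalue:
import numpy as np
import networkx as nx

def fiedler_bipartition(G):
    L = nx.laplacian_matrix(G).toarray().astype(float)
    eigvals, eigvecs = np.linalg.eigh(L)          # eigh: L is symmetric
    fiedler = eigvecs[:, np.argsort(eigvals)[1]]  # second-smallest eigenvalue
    return (fiedler > 0).astype(int)              # two groups, by sign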
Example No. 11
def analysis():
    now = datetime.datetime.now()
    ###############################################################################

    d1 = datetime.datetime(2016, now.month, 1)
    # last day of the current month (the 30th does not exist in February);
    # assumes `import calendar`
    d2 = datetime.datetime(2016, now.month,
                           calendar.monthrange(2016, now.month)[1])

    symbol_dict = stocks

    symbols, names = np.array(list(symbol_dict.items())).T

    quotes = [quotes_historical_yahoo(symbol, d1, d2, asobject=True)
              for symbol in symbols]

    # avoid shadowing the builtin open()
    opening = np.array([q.open for q in quotes]).astype(np.float)
    closing = np.array([q.close for q in quotes]).astype(np.float)

    # The daily variations of the quotes are what carry most information
    variation = closing - opening

    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    message = ''

    for i in range(n_labels + 1):
        message += 'Cluster %i: %s\r\n' % ((i + 1), ', '.join(names[labels == i]))

    return message
Example No. 12
def computeCovar(bed, shrinkMethod, fitIndividuals):
    eigen = dict([])

    if (shrinkMethod in ['lw', 'oas', 'l1', 'cv']):
        import sklearn.covariance as cov
        import sklearn.grid_search  # needed for the 'cv' option below
        t0 = time.time()
        print 'Estimating shrunk covariance using', shrinkMethod, 'estimator...'

        if (shrinkMethod == 'lw'):
            covEstimator = cov.LedoitWolf(assume_centered=True,
                                          block_size=5 * bed.val.shape[0])
        elif (shrinkMethod == 'oas'):
            covEstimator = cov.OAS(assume_centered=True)
        elif (shrinkMethod == 'l1'):
            covEstimator = cov.GraphLassoCV(assume_centered=True, verbose=True)
        elif (shrinkMethod == 'cv'):
            shrunkEstimator = cov.ShrunkCovariance(assume_centered=True)
            param_grid = {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]}
            covEstimator = sklearn.grid_search.GridSearchCV(
                shrunkEstimator, param_grid)
        else:
            raise Exception('unknown covariance regularizer')

        covEstimator.fit(bed.val[fitIndividuals, :].T)
        if (shrinkMethod == 'l1'):
            alpha = covEstimator.alpha_
            print 'l1 alpha chosen:', alpha
            covEstimator2 = cov.GraphLasso(alpha=alpha,
                                           assume_centered=True,
                                           verbose=True)
        else:
            if (shrinkMethod == 'cv'):
                shrinkEstimator = covEstimator.best_params_['shrinkage']
            else:
                shrinkEstimator = covEstimator.shrinkage_
            print 'shrinkage estimator:', shrinkEstimator
            covEstimator2 = cov.ShrunkCovariance(shrinkage=shrinkEstimator,
                                                 assume_centered=True)
        covEstimator2.fit(bed.val.T)
        XXT = covEstimator2.covariance_ * bed.val.shape[1]
        print 'Done in %0.2f' % (time.time() - t0), 'seconds'

    else:
        print 'Computing kinship matrix...'
        t0 = time.time()
        XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1))
        print 'Done in %0.2f' % (time.time() - t0), 'seconds'
        try:
            shrinkParam = float(shrinkMethod)
        except ValueError:
            shrinkParam = -1
        if (shrinkMethod == 'mylw'):
            XXT_fit = XXT[np.ix_(fitIndividuals, fitIndividuals)]
            sE2R = (np.sum(XXT_fit**2) -
                    np.sum(np.diag(XXT_fit)**2)) / (bed.val.shape[1]**2)
            #temp = (bed.val**2).dot((bed.val.T)**2)
            temp = symmetrize(
                blas.dsyrk(1.0, bed.val[fitIndividuals, :]**2, lower=1))
            sER2 = (temp.sum() - np.diag(temp).sum()) / bed.val.shape[1]
            shrinkParam = (sER2 - sE2R) / (sE2R * (bed.val.shape[1] - 1))
        if (shrinkParam > 0):
            print 'shrinkage estimator:', 1 - shrinkParam
            XXT = (1 - shrinkParam) * XXT + bed.val.shape[
                1] * shrinkParam * np.eye(XXT.shape[0])

    return XXT
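
# symmetrize() is used above but not defined in this snippet; blas.dsyrk with
# lower=1 only fills the lower triangle, so a minimal sketch (an assumption
# about the original helper) mirrors it into the upper triangle:
import numpy as np

def symmetrize(mat):
    return mat + mat.T - np.diag(np.diag(mat))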
Example No. 13
    symbol_dict = json.loads(f.read())

symbols, names = np.array(list(symbol_dict.items())).T

quotes = [quotes_yahoo(symbol, start_date, end_date, asobject=True) 
                for symbol in symbols]

# Extract opening and closing quotes
opening_quotes = np.array([quote.open for quote in quotes]).astype(np.float)
closing_quotes = np.array([quote.close for quote in quotes]).astype(np.float)

# The daily fluctuations of the quotes 
delta_quotes = closing_quotes - opening_quotes

# Build a graph model from the correlations
edge_model = covariance.GraphLassoCV()

# Standardize the data 
X = delta_quotes.copy().T
X /= X.std(axis=0)

# Train the model
with np.errstate(invalid='ignore'):
    edge_model.fit(X)

# Build clustering model using affinity propagation
_, labels = cluster.affinity_propagation(edge_model.covariance_)
num_labels = labels.max()

# Print the results of clustering
for i in range(num_labels + 1):
    # the original snippet is truncated here; the obvious loop body, matching
    # the other examples, prints the members of each cluster
    print('Cluster', (i + 1), '==>', ', '.join(names[labels == i]))

Example No. 14
def contingencyTableChi2andPOISpaceStructure(dataBunch, pred, class_mapping,
                                             dbLabel, savefigFn):
    '''Test of independence (chi-square contingency analysis)'''
    # horizontally stack the cluster predictions and the industry class labels
    mergingData = np.hstack(
        (pred.reshape(-1, 1), dataBunch.target.reshape(-1, 1)))
    targetStack = []
    # regroup the data by industry class label: each entry gathers all the
    # cluster predictions that belong to one industry class
    for i in range(len(np.array(class_mapping)[..., 0])):
        targetStack.append(mergingData[mergingData[..., -1] == int(
            np.array(class_mapping)[..., 0][i])])
    clusterFrequency = {}
    # for each industry class, count how many of its points fall into each
    # cluster; every count gets +1 because the independence test cannot
    # handle zero cells
    for p in targetStack:
        clusterFrequency[(p[..., -1][0])] = [(j, np.sum(p[..., 0] == int(j)) + 1)
                                             for j in dbLabel if j != -1]


#    print(clusterFrequency)
    CTableTarget = list(clusterFrequency.keys())
    CTableIdx = np.array(list(clusterFrequency.values()))
    # contingency table for the independence analysis: rows are industry
    # classes, columns are cluster frequencies
    CTable = CTableIdx[..., 1]
    # chi-square test of independence; g is the chi-square statistic, p the
    # p-value, dof the degrees of freedom, expctd the expected frequencies.
    # In this experiment p = 0.00120633349692 < 0.05, so the industry
    # classification and the clusters are related.
    totalIndependence = chi2_contingency(CTable)
    g, p, dof, expctd = totalIndependence
    print(g, p, dof)
    '''Spatial structure of the POIs. Adapted from the official example
    "Visualizing the stock market structure":
    http://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#sphx-glr-auto-examples-applications-plot-stock-market-py'''
    # A - the inverse of the covariance matrix (the precision matrix). From the
    # scikit-learn docs: "The matrix inverse of the covariance matrix, often
    # called the precision matrix, is proportional to the partial correlation
    # matrix. It gives the partial independence relationship. In other words,
    # if two features are independent conditionally on the others, the
    # corresponding coefficient in the precision matrix will be zero."
    # Sparse inverse covariance estimator; see
    # http://scikit-learn.org/stable/modules/covariance.html#sparse-inverse-covariance
    edge_model = covariance.GraphLassoCV()
    X = CTable.copy().T
    print(X, X.shape)
    # standardize; try a small array by hand to see what this changes
    X = X / X.std(axis=0)
    print(X)
    edge_model.fit(X)
    print("******************************************************************")
    print(edge_model.covariance_.shape)

    # B - affinity propagation (AP) clusters by "message passing" between data
    # points and needs no preset number of clusters; here it clusters the
    # covariance matrix
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    print(labels)

    # C - manifold learning can handle nonlinear structure in the data; see
    # http://scikit-learn.org/stable/modules/manifold.html#locally-linear-embedding
    # We reduce to 2 dimensions to use as x/y coordinates for plotting points.
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=6)
    embedding = node_position_model.fit_transform(X.T).T
    print(embedding.shape)
    '''Chart the spatial distribution structure of the POIs'''
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    # cf. http://matplotlib.org/examples/pylab_examples/axis_equal_demo.html
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations. A partial correlation
    # measures the association between two variables while the influence of
    # all the other variables is held fixed; in multivariate analysis it is
    # often more informative than the simple correlation coefficient, which
    # can be distorted by the remaining variables.
    partial_correlations = edge_model.precision_.copy()
    print(partial_correlations.shape)
    #    print(partial_correlations)
    # np.diag() returns the diagonal; take the reciprocal of the square root
    # of each diagonal element
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    # np.triu() returns the upper triangle of the matrix
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0],
                embedding[1],
                s=300 * d**2,
                c=labels,
                cmap=plt.cm.Spectral)  # labels set node colors; the embedding gives coordinates

    # Plot the edges
    # np.where(condition) with a single argument returns the coordinates of
    # the elements where the condition holds
    start_idx, end_idx = np.where(non_zero)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    print(
        "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    )
    print(len(segments))
    print(len(values))
    # see http://matplotlib.org/users/colormaps.html for other
    # `matplotlib.colors.Colormap` instances to swap in
    cm = plt.cm.get_cmap('OrRd')
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=cm,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)  # edge strength sets the line width
    ax.add_collection(lc)

    # Add a label (the industry class name) to each node. The challenge here is
    # that we want to position the labels to avoid overlap with other labels.
    names = [i[-1] for i in class_mapping]
    for index, (name, label,
                (x, y)) in enumerate(zip(names, labels, embedding.T)):
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x,
                 y,
                 name,
                 size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.Spectral(label / float(n_labels)),
                           alpha=.6))
    # numpy.ptp() returns the range of values along an axis (max - min)
    plt.xlim(
        embedding[0].min() - .15 * embedding[0].ptp(),
        embedding[0].max() + .10 * embedding[0].ptp(),
    )
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    plt.savefig(os.path.join(savingFig, savefigFn))  # save the plotted figure
    plt.show()
    return CTable, np.array(partial_correlations)
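
# Several of these examples rescale the precision matrix by d * d^T and call
# the result "partial correlations". For reference (an addition, not from the
# original code), the full convention carries a minus sign,
# rho_ij = -p_ij / sqrt(p_ii * p_jj); the plotting code can omit it because it
# only uses absolute values for the edge weights.
import numpy as np

def partial_correlations(precision):
    d = 1.0 / np.sqrt(np.diag(precision))
    pcorr = -precision * d * d[:, np.newaxis]
    np.fill_diagonal(pcorr, 1.0)
    return pcorr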
Example No. 15
def dailyStockClusters():
    import datetime
    import os
    import numpy as np
    import pandas.io.data as web
    from pandas import DataFrame
    from matplotlib import pylab as pl
    from matplotlib import finance
    from matplotlib.collections import LineCollection

    from sklearn import cluster, covariance, manifold
    ########################################################################
    ###
    ### This example employs several unsupervised learning techniques to
    ### extract the stock market structure from variations in historical quotes.
    ### The quantity that we use is the daily variation in quote price:
    ### quotes that are linked tend to co-fluctuate during a day.
    ###
    ### stocks used are all Nasdaq 100 stocks that have one year of history
    ### from the current date.
    ###
    ### adopted from example at:
    ### http://scikit-learn.org/0.14/auto_examples/applications/plot_stock_market.html
    ###
    ########################################################################
    # Retrieve the data from Internet

    # Use one year of quote history, from one year ago through today
    today = datetime.datetime.now()
    d1 = datetime.datetime(today.year - 1, today.month, today.day)
    d2 = datetime.datetime(today.year, today.month, today.day)

    # input symbols and company names from text file
    companyName_file = os.path.join(os.getcwd(), "symbols", "companyNames.txt")
    with open(companyName_file, "r") as f:
        companyNames = f.read()

    print "\n\n\n"
    companyNames = companyNames.split("\n")
    ii = companyNames.index("")
    del companyNames[ii]
    companySymbolList = []
    companyNameList = []
    symbol_dict = {}
    for iname, name in enumerate(companyNames):
        name = name.replace("amp;", "")
        testsymbol, testcompanyName = name.split(";")
        companySymbolList.append(format(testsymbol, 's'))
        companyNameList.append(format(testcompanyName, 's'))
        if testsymbol != "CASH":
            symbol_dict[testsymbol] = format(testcompanyName, 's')
    print " ... symbol_dict = ", symbol_dict

    symbols = companySymbolList[:]
    names = companyNameList[:]

    all_data = {}
    for ticker in symbols:
        try:
            all_data[ticker] = web.get_data_yahoo(ticker, d1, d2)
            qclose = DataFrame(
                {tic: data['Close']
                 for tic, data in all_data.iteritems()})
            qopen = DataFrame(
                {tic: data['Open']
                 for tic, data in all_data.iteritems()})
        except Exception:
            print "Can't find ", ticker

    symbols_edit = []
    names_edit = []
    for i, ticker in enumerate(symbols):
        if True in np.isnan(np.array(qclose[ticker])).tolist():
            print ticker, " nans found, ticker removed"
            del qclose[ticker]
            del qopen[ticker]
        else:
            symbols_edit.append(ticker)
            names_edit.append(names[i])

    # The daily variations of the quotes are what carry most information
    variation = qclose - qopen
    variation[np.isnan(variation)] = 0.

    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy()
    #X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print "Cluster " + str(i) + ":"
        for j in range(len(labels)):
            if labels[j] == i:
                print " ... " + names_edit[j]
        #print('Cluster %i: %s' % ((i + 1), ', '.join(names_edit[labels == i])))

    figure7path = 'Clustered_companyNames.png'  # re-set to name without full path
    figure7_htmlText = "\n<br><h3>Daily stock clustering analyis. Based on one year performance correlations.</h3>\n"
    figure7_htmlText = figure7_htmlText + "\nClustering based on daily variation in Nasdaq 100 quotes.\n"
    figure7_htmlText = figure7_htmlText + '''<br><img src="''' + figure7path + '''" alt="PyTAAA by DonaldPG" width="850" height="500"><br>\n'''

    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=6)

    embedding = node_position_model.fit_transform(X.T).T

    ###############################################################################
    # Visualization
    pl.figure(1, facecolor='w', figsize=(10, 8))
    pl.clf()
    ax = pl.axes([0., 0., 1., 1.])
    pl.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    pl.scatter(embedding[0],
               embedding[1],
               s=100 * d**2,
               c=labels,
               cmap=pl.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=pl.cm.hot_r,
                        norm=pl.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label,
                (x, y)) in enumerate(zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        pl.text(x,
                y,
                name,
                size=10,
                horizontalalignment=horizontalalignment,
                verticalalignment=verticalalignment,
                bbox=dict(facecolor='w',
                          edgecolor=pl.cm.spectral(label / float(n_labels)),
                          alpha=.6))

    pl.xlim(
        embedding[0].min() - .15 * embedding[0].ptp(),
        embedding[0].max() + .10 * embedding[0].ptp(),
    )
    pl.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
            embedding[1].max() + .03 * embedding[1].ptp())

    pl.savefig(os.path.join(os.getcwd(), "pyTAAA_web",
                            "Clustered_companyNames.png"),
               format='png')

    return figure7_htmlText
Example No. 16
    def relation_plot(self, df, good_list):
        close_price_list = [
            df[df.code == code].close.tolist() for code in good_list
        ]
        close_prices = np.vstack(close_price_list)

        open_price_list = [
            df[df.code == code].open.tolist() for code in good_list
        ]
        open_prices = np.vstack(open_price_list)

        # the daily variations of the quotes are what carry most information
        variation = (close_prices - open_prices) * 100 / open_prices

        logger.info("get variation succeed")
        # #############################################################################
        # learn a graphical structure from the correlations
        edge_model = covariance.GraphLassoCV()
        # standardize the time series: using correlations rather than covariance is more efficient for structure recovery
        X = variation.copy().T
        X /= X.std(axis=0)
        edge_model.fit(X)

        logger.info("mode compute succeed")
        # #############################################################################
        # cluster using affinity propagation
        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()
        code_list = np.array(good_list)

        industry_dict = dict()
        industry_df_info = IndustryInfo.get()
        for index, name in industry_df_info.name.iteritems():
            content = industry_df_info.loc[index]['content']
            a_code_list = json.loads(content)
            for code in a_code_list:
                industry_dict[code] = name

        cluster_dict = dict()
        for i in range(n_labels + 1):
            cluster_dict[i] = code_list[labels == i]
            name_list = [
                CStockInfo.get(code, 'name') for code in code_list[labels == i]
            ]
            logger.info('cluster code %i: %s' %
                        ((i + 1), ', '.join(name_list)))

        cluster_info = dict()
        for group, _code_list in cluster_dict.items():
            for code in _code_list:
                iname = industry_dict[code]
                if group not in cluster_info: cluster_info[group] = set()
                cluster_info[group].add(iname)
            logger.info('cluster industry %i: %s' %
                        ((group + 1), ', '.join(list(cluster_info[group]))))

        # #############################################################################
        # find a low-dimension embedding for visualization: find the best position of
        # the nodes (the stocks) on a 2D plane
        # we use a dense eigen_solver to achieve reproducibility (arpack is
        # initiated with random vectors that we don't control). In addition, we
        # use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)
        embedding = node_position_model.fit_transform(X.T).T

        # #############################################################################
        # visualization
        plt.figure(1, facecolor='w', figsize=(10, 8))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')

        # display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

        # plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0],
                    embedding[1],
                    s=100 * d**2,
                    c=labels,
                    cmap=plt.cm.nipy_spectral)

        # plot the edges
        start_idx, end_idx = np.where(non_zero)
        # a sequence of (*line0*, *line1*, *line2*), where:: linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0,
                            cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)

        # add a label to each node. The challenge here is that we want to position the labels to avoid overlap with other labels
        for index, (name, label,
                    (x, y)) in enumerate(zip(code_list, labels, embedding.T)):
            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            plt.text(x,
                     y,
                     name,
                     size=10,
                     horizontalalignment=horizontalalignment,
                     verticalalignment=verticalalignment,
                     bbox=dict(facecolor='w',
                               edgecolor=plt.cm.nipy_spectral(label /
                                                              float(n_labels)),
                               alpha=.6))
        plt.xlim(
            embedding[0].min() - .15 * embedding[0].ptp(),
            embedding[0].max() + .10 * embedding[0].ptp(),
        )
        plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                 embedding[1].max() + .03 * embedding[1].ptp())
        plt.savefig('/tmp/relation.png', dpi=1000)
Example No. 17
def ClusterAnalyses(dfs, names):
    close_prices = np.vstack([q['Close'] for q in dfs])
    open_prices = np.vstack([q['Open'] for q in dfs])
    variation = close_prices - open_prices

    # ########################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    # #########################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #########################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)

    embedding = node_position_model.fit_transform(X.T).T

    # #########################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                cmap=plt.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.viridis,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                horizontalalignment=horizontalalignment,
                verticalalignment=verticalalignment,
                bbox=dict(facecolor='w',
                        edgecolor=plt.cm.spectral(label / float(n_labels)),
                        alpha=.6))

    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
            embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
            embedding[1].max() + .03 * embedding[1].ptp())

    plt.show()
Example No. 18
def plot_graph(settings=None, macro_data_z=None, negate_fields=None):

    symbols = np.array(settings['data_fieldnames']).T
    graph_data = macro_data_z[macro_data_z.index > settings['common_start_date']
                             ][settings['data_fields']].iloc[2:]
    if negate_fields is not None:
        graph_data[negate_fields] = -graph_data[negate_fields]

    graph_data = graph_data.rolling(window=3, center=False).sum()
    variation = graph_data.values.T

    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(symbols[labels == i])))

    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane
    # from sklearn.decomposition import KernelPCA  # only needed for the KernelPCA option below
    # node_position_model = manifold.LocallyLinearEmbedding(
    #     n_components=2, eigen_solver='dense', n_neighbors=8)
    # node_position_model = KernelPCA(kernel='rbf',
    #                                 fit_inverse_transform=True,
    #                                 gamma=10,
    #                                 n_components=2)
    node_position_model = manifold.SpectralEmbedding(n_components=2,
                                                     n_neighbors=6)

    # node_position_model = PCA(n_components=2)
    embedding = node_position_model.fit_transform(X.T).T
    # embedding = components[[0, 1]].values.T
    f1 = 0
    f2 = 1

    ###############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(12, 6))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    # plt.axis('off')
    # ax.set_axis_bgcolor('k')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[f1],
                embedding[f2],
                s=100 * d ** 2,
                c=labels,
                cmap=plt.cm.coolwarm)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    segments = [[embedding[[f1, f2], start], embedding[[f1, f2], stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=plt.cm.coolwarm,
                        norm=plt.Normalize(0, .7 * np.sqrt(values.max())))
    lc.set_array(np.sqrt(values))
    lc.set_linewidths(15 * np.sqrt(values))
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    label_offset = 0.002

    for index, (name, label, (f_1, f_2)) in enumerate(
            zip(symbols, labels, embedding.T)):

        if f1 == 0:
            x = f_1
        if f1 == 1:
            x = f_2

        if f2 == 0:
            y = f_1
        if f2 == 1:
            y = f_2

        dx = x - embedding[f1]
        dx[index] = 1
        dy = y - embedding[f2]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x += label_offset
        else:
            horizontalalignment = 'right'
            x -= label_offset
        if this_dy > 0:
            verticalalignment = 'bottom'
            y += label_offset
        else:
            verticalalignment = 'top'
            y -= label_offset
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[f1].min() - .15 * embedding[f1].ptp(),
             embedding[f1].max() + .10 * embedding[f1].ptp(),)
    plt.ylim(embedding[f2].min() - .03 * embedding[f2].ptp(),
             embedding[f2].max() + .03 * embedding[f2].ptp())
    # save before plt.show(): showing first closes the figure, which would
    # leave the saved file blank
    plt.savefig('figures/macro_graph.png',
                facecolor='w',
                edgecolor='w',
                transparent=True)
    plt.show()
Example No. 19
def kluster(form):

    try:

        tickerA = web.DataReader(form.tickerA + '.sa',
                                 data_source='yahoo')[-252:]
        tickerB = web.DataReader(form.tickerB + '.sa',
                                 data_source='yahoo')[-252:]
        tickerC = web.DataReader(form.tickerC + '.sa',
                                 data_source='yahoo')[-252:]
        tickerD = web.DataReader(form.tickerD + '.sa',
                                 data_source='yahoo')[-252:]
        tickerE = web.DataReader(form.tickerE + '.sa',
                                 data_source='yahoo')[-252:]

        barchart = [tickerA, tickerB, tickerC, tickerD, tickerE]

        names = [
            form.tickerA, form.tickerB, form.tickerC, form.tickerD,
            form.tickerE
        ]

        quotes = []

        for item in barchart:
            portfolio = pd.DataFrame(item)
            quotes.append(portfolio)

        names = pd.DataFrame(names).T
        opening_quotes = np.array([quote.Open
                                   for quote in quotes]).astype(np.float)
        closing_quotes = np.array([quote.Close
                                   for quote in quotes]).astype(np.float)

        delta_quotes = closing_quotes - opening_quotes

        edge_model = covariance.GraphLassoCV()

        X = delta_quotes.copy().T
        X /= X.std(axis=0)

        with np.errstate(invalid='ignore'):
            edge_model.fit(X)

        from sklearn import cluster

        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        num_labels = labels.max()

        k = []

        for i in range(num_labels + 1):
            try:
                # use a name that does not shadow the imported `cluster` module
                row = (i + 1, ', '.join(names.T[0][labels == i]))
                k.append(row)
            except Exception:
                pass  # or you could use 'continue'

        kluster = pd.DataFrame(list(k))
        kluster.columns = ['Cluster', 'Ticker']
        kluster = kluster.to_html(index=False, columns=['Cluster', 'Ticker'])

    except Exception:
        return render_to_response('project/apologies.html')

    return render_to_response('cluster.html', context={'kluster': kluster})
Example No. 20
def getStockMarketStructure(symbol_dict):
        # Choose a time period reasonably calm (not too long ago so that we get
        # high-tech firms, and before the 2008 crash)
        d1 = datetime.datetime(2009, 1, 1)
        d2 = datetime.datetime(2011, 1, 1)
        # d1 = datetime.datetime.now() - timedelta(days=365*2)
        # d2 = datetime.datetime.now() - timedelta(days=1)
        # kraft symbol has now changed from KFT to MDLZ in yahoo
        symbols, names = np.array(list(symbol_dict.items())).T

        quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
          for symbol in symbols]

        # avoid shadowing the builtin open()
        opening = np.array([q.open for q in quotes]).astype(np.float)
        closing = np.array([q.close for q in quotes]).astype(np.float)

        # The daily variations of the quotes are what carry most information
        variation = closing - opening

###############################################################################
# Learn a graphical structure from the correlations
        edge_model = covariance.GraphLassoCV()

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
        X = variation.copy().T
        X /= X.std(axis=0)
        edge_model.fit(X)

###############################################################################
# Cluster using affinity propagation

        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()

        for i in range(n_labels + 1):
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

###############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)

        embedding = node_position_model.fit_transform(X.T).T

###############################################################################
# Visualization
        plt.figure(1, facecolor='w', figsize=(10, 8))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')
# Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                    cmap=plt.cm.spectral)
# Plot the edges
        start_idx, end_idx = np.where(non_zero)
#a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0, cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
        for index, (name, label, (x, y)) in enumerate(
                zip(names, labels, embedding.T)):

            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            plt.text(x, y, name, size=10,
                    horizontalalignment=horizontalalignment,
                    verticalalignment=verticalalignment,
                    bbox=dict(facecolor='w',
                            edgecolor=plt.cm.spectral(label / float(n_labels)),
                            alpha=.6))
        plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
                embedding[0].max() + .10 * embedding[0].ptp(),)
        plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                embedding[1].max() + .03 * embedding[1].ptp())
#plt.show()
        filename_1 = id_generator()+'.svg'
        plt.savefig(filename_1) 
        return filename_1
print("-- Loading raw data ({0:d}) and masking ...".format(subject_n))
regions_img = nisl.datasets.load_harvard_oxford("cort-maxprob-thr25-2mm",
                                                symmetric_split=True)

print("-- Computing confounds ...")
# Compcor on full image
hv_confounds = nisl.image.high_variance_confounds(filename)
mvt_confounds = np.loadtxt(confound_file, skiprows=1)
confounds = np.hstack((hv_confounds, mvt_confounds))

print("-- Computing region signals ...")
region_ts, _ = nisl.region.img_to_signals_labels(filename, regions_img)

region_ts = nisl.signal.clean(region_ts,
                              low_pass=None,
                              detrend=True,
                              standardize=True,
                              confounds=confounds,
                              t_r=2.5,
                              high_pass=0.01)
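
# plot_matrices() is used below but not defined in this snippet; a minimal
# sketch of a helper with the same call signature (an assumption, not the
# original) that shows the covariance and the negated precision side by side:
def plot_matrices(cov, prec, title="", subject_n=0):
    fig, axes = pl.subplots(1, 2, figsize=(8, 4))
    for ax, mat, name in zip(axes, (cov, prec), ("covariance", "precision")):
        im = ax.imshow(mat, interpolation="nearest", cmap="RdBu_r")
        ax.set_title("%s %s (subject %d)" % (name, title, subject_n))
        fig.colorbar(im, ax=ax)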

print("-- Computing covariance matrices ...")
estimator = covariance.GraphLassoCV()
estimator.fit(region_ts)

plot_matrices(estimator.covariance_,
              -estimator.precision_,
              title="Graph Lasso CV ({0:.3f})".format(estimator.alpha_),
              subject_n=subject_n)
pl.show()
Example No. 22
def stock_structure_demo():
    start_date = datetime(2005, 1, 1).date()
    end_date = datetime(2008, 1, 1).date()
    symbol_dict = {
        'NYSE:TOT': 'Total',
        'NYSE:XOM': 'Exxon',
        'NYSE:CVX': 'Chevron',
        'NYSE:COP': 'ConocoPhillips',
        'NYSE:VLO': 'Valero Energy',
        'NASDAQ:MSFT': 'Microsoft',
        'NYSE:IBM': 'IBM',
        'NYSE:TWX': 'Time Warner',
        'NASDAQ:CMCSA': 'Comcast',
        'NYSE:CVC': 'Cablevision',
        'NASDAQ:YHOO': 'Yahoo',
        'NASDAQ:DELL': 'Dell',
        'NYSE:HPQ': 'HP',
        'NASDAQ:AMZN': 'Amazon',
        'NYSE:TM': 'Toyota',
        'NYSE:CAJ': 'Canon',
        'NYSE:SNE': 'Sony',
        'NYSE:F': 'Ford',
        'NYSE:HMC': 'Honda',
        'NYSE:NAV': 'Navistar',
        'NYSE:NOC': 'Northrop Grumman',
        'NYSE:BA': 'Boeing',
        'NYSE:KO': 'Coca Cola',
        'NYSE:MMM': '3M',
        'NYSE:MCD': 'McDonald\'s',
        'NYSE:PEP': 'Pepsi',
        'NYSE:K': 'Kellogg',
        'NYSE:UN': 'Unilever',
        'NASDAQ:MAR': 'Marriott',
        'NYSE:PG': 'Procter Gamble',
        'NYSE:CL': 'Colgate-Palmolive',
        'NYSE:GE': 'General Electrics',
        'NYSE:WFC': 'Wells Fargo',
        'NYSE:JPM': 'JPMorgan Chase',
        'NYSE:AIG': 'AIG',
        'NYSE:AXP': 'American express',
        'NYSE:BAC': 'Bank of America',
        'NYSE:GS': 'Goldman Sachs',
        'NASDAQ:AAPL': 'Apple',
        'NYSE:SAP': 'SAP',
        'NASDAQ:CSCO': 'Cisco',
        'NASDAQ:TXN': 'Texas Instruments',
        'NYSE:XRX': 'Xerox',
        'NYSE:WMT': 'Wal-Mart',
        'NYSE:HD': 'Home Depot',
        'NYSE:GSK': 'GlaxoSmithKline',
        'NYSE:PFE': 'Pfizer',
        'NYSE:SNY': 'Sanofi-Aventis',
        'NYSE:NVS': 'Novartis',
        'NYSE:KMB': 'Kimberly-Clark',
        'NYSE:R': 'Ryder',
        'NYSE:GD': 'General Dynamics',
        'NYSE:RTN': 'Raytheon',
        'NYSE:CVS': 'CVS',
        'NYSE:CAT': 'Caterpillar',
        'NYSE:DD': 'DuPont de Nemours',
        'NYSE:ABB': 'ABB'
    }
    symbols, names = np.array(sorted(symbol_dict.items())).T
    # retry is used because quotes_historical_google can temporarily fail
    # for various reasons (e.g. empty result from Google API).
    quotes = []
    for symbol in symbols:
        print('Fetching quote history for %r' % symbol, file=sys.stderr)
        quotes.append(
            retry(quotes_historical_google)(symbol, start_date, end_date))
    close_prices = np.vstack([q['close'] for q in quotes])
    open_prices = np.vstack([q['open'] for q in quotes])
    # The daily variations of the quotes are what carry most information
    variation = close_prices - open_prices
    # #############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)
    # #############################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane
    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=6)
    embedding = node_position_model.fit_transform(X.T).T
    # #############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')
    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0],
                embedding[1],
                s=100 * d**2,
                c=labels,
                cmap=plt.cm.spectral)
    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label,
                (x, y)) in enumerate(zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x,
                 y,
                 name,
                 size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))
    plt.xlim(
        embedding[0].min() - .15 * embedding[0].ptp(),
        embedding[0].max() + .10 * embedding[0].ptp(),
    )
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    plt.show()
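stock_structure_demo depends on retry and quotes_historical_google, neither of which appears in this excerpt. A minimal sketch of the retry wrapper, assuming a transient failure simply raises an exception (the attempt count and back-off are assumptions):

import time
from functools import wraps

def retry(f, n_attempts=3, wait=1.0):
    # Hypothetical sketch: call f, and on failure wait and try again,
    # up to n_attempts times before giving up.
    @wraps(f)
    def wrapper(*args, **kwargs):
        for attempt in range(n_attempts):
            try:
                return f(*args, **kwargs)
            except Exception:
                if attempt == n_attempts - 1:
                    raise
                time.sleep(wait)
    return wrapper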
Example no. 23
    #    try:
    df = ts.get_hist_data(hs300_code[i], start='2017-07-23', end='2018-07-23')
    #print(str(hs300_code[i])+':'+str(df.shape))
    #print(df)
    if df.shape == (245, 13):
        df_hs300[str(hs300_code[i])] = df['price_change']
        names.append(hs300_name[i])
#    except:
#        print('An unknown error occurred')
names = np.array(names)
variation = np.array(df_hs300)
#print(variation)
#print(df_hs300)
X = variation.copy()
X /= X.std(axis=0)
edge_model = covariance.GraphLassoCV()  # estimate a sparse inverse covariance (precision) matrix
edge_model.fit(X)

_, labels = cluster.affinity_propagation(edge_model.covariance_)  # cluster the stocks
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster ' + str(i + 1) + ': ' + ', '.join(names[labels == i]))

node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                      eigen_solver='dense',
                                                      n_neighbors=6)

embedding = node_position_model.fit_transform(X.T).T
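The example is cut off before the visualization. The next step in the sibling examples converts the estimated precision matrix into normalized partial correlations before drawing edges, which would apply here as well (a sketch reusing edge_model from above):

# Normalize the precision matrix P so entry (i, j) becomes
# P_ij / sqrt(P_ii * P_jj) (up to sign; only magnitudes are plotted),
# then keep clearly non-zero upper-triangle entries as graph edges.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)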
Example no. 24
                                                standardize=True,
                                                memory=mem,
                                                memory_level=1,
                                                verbose=1)
    region_ts = masker.fit_transform(
        func_filename, confounds=[hv_confounds, confound_filename])
    subjects.append(region_ts)

# Computing group-sparse precision matrices ###################################
print("-- Computing group-sparse precision matrices ...")
from nilearn.group_sparse_covariance import GroupSparseCovarianceCV
gsc = GroupSparseCovarianceCV(verbose=2, n_jobs=3)
gsc.fit(subjects)

print("-- Computing graph-lasso precision matrices ...")
from sklearn import covariance
gl = covariance.GraphLassoCV(n_jobs=3)
gl.fit(subjects[plotted_subject])

# Displaying results ##########################################################
print("-- Displaying results")
title = "{0:d} GroupSparseCovariance $\\alpha={1:.2e}$".format(
    plotted_subject, gsc.alpha_)
plot_matrices(gsc.covariances_[..., plotted_subject],
              gsc.precisions_[..., plotted_subject], title)

title = "{0:d} GraphLasso $\\alpha={1:.2e}$".format(plotted_subject, gl.alpha_)
plot_matrices(gl.covariance_, gl.precision_, title)

plt.show()
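Beyond the plots, one way to compare the two estimators is to count exact zeros in each precision matrix, since both are sparsity-inducing (a sketch reusing gsc, gl, and plotted_subject from above):

import numpy as np

# Fraction of exactly-zero off-diagonal entries in each precision matrix;
# a rough measure of how sparse each estimated graph is.
gsc_prec = gsc.precisions_[..., plotted_subject]
for label, prec in [("GroupSparseCovariance", gsc_prec),
                    ("GraphLasso", gl.precision_)]:
    off_diag = prec[~np.eye(prec.shape[0], dtype=bool)]
    print("%s: %.1f%% zero entries"
          % (label, 100.0 * np.mean(off_diag == 0)))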
Example no. 25
def cluster_data(data):
    names = data.columns
    edge_model = covariance.GraphLassoCV()
    data = np.array(data)

    # No transpose here: names come from data.columns, so the columns of
    # `data` must be the features that GraphLassoCV sees.
    X = data.copy()
    X /= X.std(axis=0)

    edge_model.fit(X)
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    #Visualization
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=6)
    embedding = node_position_model.fit_transform(X.T).T
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0],
                embedding[1],
                s=100 * d**2,
                c=labels,
                cmap=plt.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label,
                (x, y)) in enumerate(zip(names, labels, embedding.T)):
        name = str(name)
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x,
                 y,
                 name,
                 size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(
        embedding[0].min() - .15 * embedding[0].ptp(),
        embedding[0].max() + .10 * embedding[0].ptp(),
    )
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    plt.show()
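A minimal usage sketch for cluster_data, with synthetic data standing in for a real dataset: a DataFrame whose rows are observations and whose columns are the series to cluster (eight columns keep the LocallyLinearEmbedding step happy, since it uses n_neighbors=6):

import numpy as np
import pandas as pd

# Two latent signals, four noisy copies of each; cluster_data should
# recover the two groups.
rng = np.random.RandomState(0)
base_a, base_b = rng.randn(300), rng.randn(300)
frame = pd.DataFrame(
    dict([('a%d' % i, base_a + .1 * rng.randn(300)) for i in range(4)] +
         [('b%d' % i, base_b + .1 * rng.randn(300)) for i in range(4)]))
cluster_data(frame)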
Example no. 26
                       np.matrix(np.eye(class_ix * 5)))
        X = np.random.multivariate_normal(mean=np.zeros(p), cov=C, size=ni)
        #ml_glassocv.fit(X)
        #Theta = ml_glassocv.get_precision()
        A_c.append(A)
        C_c.append(C)
        X_c.append(X)
        #Theta_t.append(Theta)
    A_list.append(A_c)
    C_list.append(C_c)
    X_list.append(X_c)
    #Theta_glassocv_list.append(Theta_t)
#-------------------------------------------------------------------------------------------------------------------------------------------------

#-------------------------------- Graphical Lasso ---------------------------------
ml_glassocv = cov.GraphLassoCV(assume_centered=False)
Theta_glassocv_list = []
for class_ix in range(len_class):
    Theta_c = []
    for time_ix in range(len_t):
        ml_glassocv.fit(X_list[class_ix][time_ix])
        Theta = ml_glassocv.get_precision()
        Theta_c.append(Theta)
    Theta_glassocv_list.append(Theta_c)

# F1 score for graphical lasso
for class_ix in range(len_class):
    for time_ix in range(len_t):
        getF1(A_list[class_ix][time_ix],
              Theta_glassocv_list[class_ix][time_ix])
#        print(getF1(A_list[class_ix][time_ix], Theta_glassocv_list[class_ix][time_ix]))
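getF1 is not defined in this snippet. A plausible sketch, assuming it scores how well the non-zero pattern of the estimated precision matrix recovers the support of the true matrix (the name, signature, and threshold are assumptions):

import numpy as np

def getF1(A_true, Theta_est, eps=1e-6):
    # Hypothetical sketch: F1 score of off-diagonal support recovery,
    # treating entries with magnitude above eps as edges.
    off_diag = ~np.eye(np.asarray(A_true).shape[0], dtype=bool)
    true_edges = (np.abs(np.asarray(A_true)) > eps) & off_diag
    est_edges = (np.abs(np.asarray(Theta_est)) > eps) & off_diag
    tp = np.sum(true_edges & est_edges)
    fp = np.sum(~true_edges & est_edges)
    fn = np.sum(true_edges & ~est_edges)
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)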
Example no. 27

    # Computing some confounds
    hv_confounds = mem.cache(image.high_variance_confounds)(func_filename)

    region_ts = masker.transform(func_filename,
                                 confounds=[hv_confounds, confound_filename])
    subject_time_series.append(region_ts)

##############################################################################
# Computing group-sparse precision matrices
from nilearn.connectome import GroupSparseCovarianceCV
gsc = GroupSparseCovarianceCV(verbose=2)
gsc.fit(subject_time_series)

from sklearn import covariance
gl = covariance.GraphLassoCV(verbose=2)
gl.fit(np.concatenate(subject_time_series))
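# Note the different inputs: GroupSparseCovarianceCV takes the list of
# per-subject series and estimates one precision matrix per subject with
# a shared sparsity pattern, whereas this GraphLassoCV fit sees all
# subjects concatenated as if they formed a single subject.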

##############################################################################
# Displaying results
atlas_imgs = image.iter_img(msdl_atlas_dataset.maps)
atlas_region_coords = [plotting.find_xyz_cut_coords(img) for img in atlas_imgs]

plotting.plot_connectome(gl.covariance_,
                         atlas_region_coords,
                         edge_threshold='90%',
                         title="Covariance",
                         display_mode="lzr")
plotting.plot_connectome(-gl.precision_,
                         atlas_region_coords,
                         edge_threshold='90%',
                         title="Precision",
                         display_mode="lzr")
Example no. 28
def process(start_date, end_date, interesting_stock):
    quotes = []
#    print("Downloading data...")
    for symbol in symbols:
        quotes.append(quotes_historical_yahoo_ochl(symbol,
                                                   start_date,
                                                   end_date,
                                                   asobject=True))

    if min([len(q.open) for q in quotes]) != max([len(q.open) for q in quotes]):
        for q, name in zip(quotes, names):
            print("%d: %s" % (len(q.open), name))
        print("Different length of quotes for different stocks")
    open = np.array([q.open for q in quotes]).astype(float)
    close = np.array([q.close for q in quotes]).astype(float)

    # The daily variations of the quotes are what carry most information
    variation = close - open

#    print("Finding similarities...")
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        cluster_desc = 'Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))
        if interesting_stock is None or interesting_stock in cluster_desc:
            print(cluster_desc)

    def plot():
        # We use a dense eigen_solver to achieve reproducibility (arpack is
        # initiated with random vectors that we don't control). In addition, we
        # use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)

        embedding = node_position_model.fit_transform(X.T).T

        plt.figure(1, facecolor='w', figsize=(10, 8))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')

        # Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

        # Plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                    cmap=plt.cm.spectral)

        # Plot the edges
        start_idx, end_idx = np.where(non_zero)
        #a sequence of (*line0*, *line1*, *line2*), where::
        #            linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0, cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)

        # Add a label to each node. The challenge here is that we want to
        # position the labels to avoid overlap with other labels
        for index, (name, label, (x, y)) in enumerate(
                zip(names, labels, embedding.T)):

            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            plt.text(x, y, name, size=10,
                     horizontalalignment=horizontalalignment,
                     verticalalignment=verticalalignment,
                     bbox=dict(facecolor='w',
                               edgecolor=plt.cm.spectral(label / float(n_labels)),
                               alpha=.6))

        plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
                 embedding[0].max() + .10 * embedding[0].ptp(),)
        plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                 embedding[1].max() + .03 * embedding[1].ptp())

        plt.show()
    plot()
Example no. 29

    print('Fetching station: {}, with error variance: {}'.format(
        stn, metric.var()))
    values.append(metric.squeeze())

# =========================================================================
# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
stn_id = np.array(stn_id)
X = np.stack(values, axis=-1)
X /= X.std(axis=0)

print(np.shape(X))

# Learn a graphical structure from the correlations
edge_model = covariance.GraphLassoCV(n_refinements=10, cv=5)
edge_model.fit(X)

# Bind the result under a new name: assigning to `covariance` here would
# shadow the imported sklearn covariance module.
cov_matrix = edge_model.covariance_
partial_correlations = edge_model.precision_

print(27 * '=', 'covariance matrix', 27 * '=')
print(cov_matrix)
print(73 * '=')
# =========================================================================
# Cluster using affinity propagation / Kmeans methods

# unknown number of clusters:
estimator = AffinityPropagation(verbose=True)

# known number of clusters:
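The snippet breaks off after this comment. A sketch of what the known-number-of-clusters branch could look like, using scikit-learn's KMeans on the standardized series with stations as samples (the cluster count is arbitrary here):

from sklearn.cluster import KMeans

# X has shape (time, stations), so transpose to make each station one sample.
kmeans = KMeans(n_clusters=5, random_state=0)
station_labels = kmeans.fit_predict(X.T)
print(dict(zip(stn_id, station_labels)))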
Example no. 30
def affinity_propagation_network(X, names=None):
    """
    Cluster the rows of X (affinity propagation based on their correlations),
        printing out cluster contents and
        drawing a labeled network of the results, with darker edges for more strongly correlated pairs

    X can be an array or a pandas DataFrame.
    names are labels for the rows; they are taken from the DataFrame's index, from the column named by
    `names`, or default to 0..n-1 otherwise

    Very lightly adapted from
        http://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#example-applications-plot-stock-market-py

    Author: Gael Varoquaux
    License: BSD 3 clause

    The output of the 3 models is combined in a 2D graph where nodes represent the columns and edges their links:
    * cluster labels are used to define the color of the nodes
    * the sparse covariance model is used to display the strength of the edges
    * the 2D embedding is used to position the nodes in the plane

    This example has a fair amount of visualization-related code, as visualization is crucial here to display the
    graph. One of the challenges is to position the labels while minimizing overlap. For this we use a heuristic
    based on the direction of the nearest neighbor along each axis.
    """

    X = X.copy()

    if isinstance(X, pd.DataFrame):
        if isinstance(names, str):
            names = X.pop(names)
        elif names is None:
            names = X.index.values
        X = X.values.T
    elif names is None:
        # Use a string array so that boolean indexing and ', '.join below work.
        names = np.array([str(i) for i in range(X.shape[0])])

    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    # X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)

    embedding = node_position_model.fit_transform(X.T).T

    ###############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                cmap=plt.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())

    plt.show()
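A usage sketch with synthetic rows in place of real data; the DataFrame index supplies the labels (the data is generated purely to exercise the function):

import numpy as np
import pandas as pd

# Nine noisy series derived from three latent signals; the function should
# report three clusters and draw the labeled network.
rng = np.random.RandomState(42)
latent = rng.randn(3, 300)
rows = np.vstack([latent[i // 3] + .2 * rng.randn(300) for i in range(9)])
frame = pd.DataFrame(rows, index=['s%d' % i for i in range(9)])
affinity_propagation_network(frame)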