def get_network():
    path = USER_PATH / str(g.user["id"])
    list_names = [f for f in os.listdir(path) if os.path.isfile(path / f)]
    file_name = request.form.get("file_name")
    file_path = path / file_name
    df = PreProcess.getDF(file_path)

    error = check_df(df)
    if error:
        return render_template("network/index.html", error=error,
                               filename=file_name, all_names=list_names)

    # df = df.iloc[:, 0:350]
    names = df.columns.values
    X = df.values.copy()
    X /= X.std(axis=0)

    # Learn a sparse graphical structure from the standardized data
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    edge_model = covariance.GraphicalLassoCV(tol=1e-3)
    edge_model.fit(X)

    # Cluster using affinity propagation on the estimated covariance
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    # Scale the precision matrix to partial correlations and threshold
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        node.append({"id": i, "label": name, "group": str(labels[i])})

    edges = []
    for x in range(len(start_idx)):
        edges.append({"from": str(start_idx[x]), "to": str(end_idx[x])})

    return render_template("network/index.html", node=node, edges=edges,
                           error=error, filename=file_name,
                           all_names=list_names, n_labels=n_labels)
def glasso_iso_cov(self):
    # Read in isoform expression data and run glasso to get the adjusted
    # isoform expression covariance matrix.
    cur_tpm = pd.read_csv(self.gene + "_tpm.csv")
    cur_tpm.drop(cur_tpm.columns[[0]], axis=1, inplace=True)
    cur_tpm = np.transpose(cur_tpm.to_numpy())
    # z-score each isoform before fitting the graphical lasso
    cur_tpm_std = (cur_tpm - np.mean(cur_tpm, axis=0)) / np.std(cur_tpm, axis=0)
    md = covariance.GraphicalLassoCV().fit(cur_tpm_std)
    self.iso_cov = md.covariance_
    # self.iso_cov = np.loadtxt(file)
    # print("isoform expression covariance matrix:")
    print(self.iso_cov)
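# A minimal hedged sketch (not part of the original class): the z-scoring in
# glasso_iso_cov divides by the per-isoform standard deviation, which is zero
# for isoforms with constant TPM across samples and would produce NaN columns
# that break GraphicalLassoCV. Dropping zero-variance columns first is one
# assumed-safe variant; `tpm` is a stand-in [n_samples x n_isoforms] array.
import numpy as np
from sklearn import covariance

def safe_glasso_cov(tpm):
    std = tpm.std(axis=0)
    keep = std > 0                                   # drop zero-variance isoforms
    z = (tpm[:, keep] - tpm[:, keep].mean(axis=0)) / std[keep]
    return covariance.GraphicalLassoCV().fit(z).covariance_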
def test():
    df = PreProcess.getDF(TEST_FILE)
    df = df.drop(['class'], axis=1)
    df = df.iloc[:, 0:10]
    names = df.columns.values
    X = df.values

    edge_model = covariance.GraphicalLassoCV()
    edge_model.fit(X)

    # _, labels = cluster.affinity_propagation(edge_model.covariance_)
    # n_labels = labels.max()
    # for i in range(n_labels + 1):
    #     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # node_position_model = manifold.LocallyLinearEmbedding(
    #     n_components=2, eigen_solver='dense', n_neighbors=6)
    # embedding = node_position_model.fit_transform(X.T).T

    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        node.append({"id": i, "label": name, "group": 1})
    # print(node)

    edges = []
    for x in range(len(start_idx)):
        edges.append({"from": str(start_idx[x]), "to": str(end_idx[x])})
    # print(edges)

    return render_template("network/index.html", names=names,
                           start_idx=start_idx, end_idx=end_idx,
                           node=node, edges=edges)
def fit(X):
    # note: standardizes X in place
    X /= X.std(axis=0)

    # Sparse inverse covariance estimation
    edge_model = covariance.GraphicalLassoCV(cv=5)
    edge_model.fit(X)

    # #########################################################################
    # Cluster on the estimated covariance (edge_model.covariance_)
    # using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)

    # #########################################################################
    # Nonlinear dimensionality reduction
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=4, eigen_solver='dense', n_neighbors=12)
    embedding = node_position_model.fit_transform(X.T).T

    partial_correlations = edge_model.precision_.copy()
    return labels, partial_correlations, embedding
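# Usage sketch for fit() above on synthetic data; the shapes and seed are
# illustrative assumptions, not from the original code. Since fit()
# standardizes X in place, pass a copy if the caller still needs X.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 20)          # 200 samples, 20 variables
labels_demo, pcorr_demo, emb_demo = fit(X_demo.copy())
print(labels_demo.shape, pcorr_demo.shape, emb_demo.shape)
# expected: (20,) (20, 20) (4, 20)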
def x2cor(xarray, corr='pCor'):
    # #########################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphicalLassoCV()
    edge_model.fit(xarray)
    cov_correlations = edge_model.covariance_.copy()

    if corr == 'Cor':
        d = 1 / np.sqrt(np.diag(cov_correlations))
        non_zero = np.abs(np.triu(cov_correlations, k=1)) > 0.5
    else:
        # Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
        # values = np.abs(partial_correlations[non_zero])

    return non_zero, d, cov_correlations
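# Usage sketch for x2cor on synthetic data (names and sizes are assumptions):
import numpy as np

rng = np.random.RandomState(1)
data = rng.randn(150, 8)             # 150 observations, 8 variables
non_zero, d, cov = x2cor(data, corr='pCor')
rows, cols = np.where(non_zero)      # edge list implied by the mask
for r, c in zip(rows, cols):
    print('edge %d -- %d' % (r, c))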
quotes = []
for symbol in symbols:
    url = ('https://raw.githubusercontent.com/mailtsjp/finpy/master/examples-data/'
           'financial-data/{}.csv')
    # e.g. https://raw.githubusercontent.com/mailtsjp/Quandlproj1/master/examples-data/financial-data/WFC.csv
    quotes.append(pd.read_csv(url.format(symbol)))

close_prices = np.vstack([q['Close'] for q in quotes])
open_prices = np.vstack([q['Open'] for q in quotes])

# The daily variations of the quotes are what carry most information
variation = close_prices - open_prices

# #############################################################################
# Learn a graphical structure from the correlations
edge_model = covariance.GraphicalLassoCV()

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation
_, labels = cluster.affinity_propagation(edge_model.covariance_,
                                         random_state=0)
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
def parCorrGlasso(dataset, kfolds=10):
    '''
    INPUT:
        dataset : a dataset with dimension [nDatapoints x nNodes]
        kfolds  : number of folds for the cross-validation to select the
                  regularization parameter alpha
    OUTPUT:
        Mreg : matrix of regularized (shrunken) partial correlation coefficients
        Mmod : matrix of partial correlations constrained to the glasso model
        condSetSizeMat : a matrix with the conditioning-set sizes for the Mmod
                  partial correlations
                  note: this is needed to compute the Fisher z-transform for
                  the significance tests
    '''
    D = dataset
    # in regularized methods it is necessary to standardize the data
    D = stats.zscore(D, axis=0)
    # estimate the empirical covariance
    emp_cov = np.cov(D, rowvar=False)
    # define the glasso model with cross-validation
    glasso = covariance.GraphicalLassoCV(cv=kfolds)
    # fit the model to the data and cross-validate to get the
    # best regularization parameter
    glasso.fit(D)
    # get the regularized precision matrix (inverse covariance)
    prec_mat = glasso.precision_
    # transform into regularized partial correlation coefficients
    # https://en.wikipedia.org/wiki/Partial_correlation#Using_matrix_inversion
    denom = np.atleast_2d(1. / np.sqrt(np.diag(prec_mat)))
    Mreg = -prec_mat * denom * denom.T
    # make the diagonal zero
    np.fill_diagonal(Mreg, 0)

    # Use the connectivity model defined by glasso to compute the partial
    # correlations between two connected nodes. This is done to get
    # non-shrunken partial correlations.
    # For adjacent nodes x and y, compute the partial correlation of x and y
    # conditional on Z = {adj(x) & adj(y)}, where adj(x) is the set of nodes
    # adjacent to x in the glasso model Gm.
    # glasso model = non-zero entries in Mreg, i.e. edges in the connectivity model
    Gm = Mreg != 0
    nNodes = Gm.shape[0]
    # allocate memory
    Mmod = np.zeros((nNodes, nNodes))
    condSetSizeMat = np.zeros((nNodes, nNodes))
    # iterate through each pair of nodes
    for x in range(nNodes - 1):
        for y in range(x + 1, nNodes):
            # if x and y are adjacent
            if Gm[x, y] != 0:
                # get adjacency indices
                adj_x = np.argwhere(Gm[x, :] != 0)
                adj_y = np.argwhere(Gm[y, :] != 0)
                # get the union of adj(x) and adj(y)
                Z = np.union1d(adj_x, adj_y)
                # remove x and y from Z
                Z = Z[Z != x]
                Z = Z[Z != y]
                # and put them back at the beginning of Z
                Z = np.insert(Z, 0, y)
                Z = np.insert(Z, 0, x)
                # define a new dataset only including nodes x & y & adj(x) & adj(y)
                newD = D[:, Z]
                # compute the partial correlation for the new dataset but only
                # keep the Mxy element; x and y are always in positions 0 and 1
                pc_xyz = parCorrInvCov(newD)[0, 1]
                # Mmod is a symmetric matrix
                Mmod[x, y] = Mmod[y, x] = pc_xyz
                # get the size of the conditioning set for x and y
                condSetSizeMat[x, y] = condSetSizeMat[y, x] = len(Z) - 2

    return Mreg, Mmod, condSetSizeMat
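# parCorrInvCov is called above but not defined in this excerpt. A plausible
# sketch, assuming it returns the full partial-correlation matrix from the
# inverse of the (unregularized) empirical covariance, mirroring the Mreg
# computation in parCorrGlasso:
import numpy as np

def parCorrInvCov(D):
    # pseudo-inverse for numerical safety on near-singular covariances
    prec = np.linalg.pinv(np.cov(D, rowvar=False))
    denom = np.atleast_2d(1. / np.sqrt(np.diag(prec)))
    P = -prec * denom * denom.T
    np.fill_diagonal(P, 0)
    return P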
def graphicalAnalysis(dataset, start_date='2000-01-01', end_date='2020-05-31',
                      Sectors_chosen=[], drop_firm=[], display_SumStat=True,
                      display_IndRet=True, data_rf=df_rf):
    # Check that the entered dates are legitimate
    if (datetime.strptime(start_date, "%Y-%m-%d") >
            datetime.strptime(end_date, "%Y-%m-%d")):
        print('ERROR: Revision needed! The entered "start_date" should be '
              'before "end_date".')
        return 0, 0
    if (dataset.index[0] - timedelta(days=dataset.index[0].weekday()) >
            datetime.strptime(start_date, "%Y-%m-%d")):
        print('WARNING: the entered "start_date" is outside of the range '
              'for the given dataset.')
        print('The "start_date" is adjusted to the earliest start_date, i.e. ',
              (dataset.index[0] -
               timedelta(days=dataset.index[0].weekday())).strftime("%Y-%m-%d"))
        print()
    if dataset.index[-1] < datetime.strptime(end_date, "%Y-%m-%d"):
        print('WARNING: the entered "end_date" is outside of the range '
              'for the given dataset.')
        print('The "end_date" is adjusted to the latest end_date, i.e. ',
              dataset.index[-1].strftime("%Y-%m-%d"))
        print()

    # Extract the data for the given time period
    temp = dataset[dataset.index >= start_date].copy()
    X = temp[temp.index <= end_date].copy()
    temp = data_rf[data_rf.index >= start_date].copy()
    data_rf2 = temp[temp.index <= end_date].copy()

    # Check whether we are using all sectors or only a chosen subset
    if Sectors_chosen:
        if all(s in firms_info.Sector.unique() for s in Sectors_chosen):
            f_in_sector_chosen = []
            for s in Sectors_chosen:
                f_in_sector_chosen += list(firms_info[firms_info.Sector == s].index)
            X = X[f_in_sector_chosen]
            print('Sectors chosen in the Graphical Analysis are:')
            print(Sectors_chosen)
            print()
        else:
            print('ERROR: Revision needed! At least 1 sector entered in the '
                  '"Sectors_chosen" option is NOT in the dataset!')
            print('Check your format!')
            return 0, 0

    # Check whether we are using all firms or dropping some firms
    if drop_firm:
        if all(f in X.columns for f in drop_firm):
            print('The following Firms are dropped:')
            print(drop_firm)
            print()
            X.drop(columns=drop_firm, inplace=True)
        else:
            print('ERROR: Revision needed! At least 1 firm entered in the '
                  '"drop_firm" option is NOT in the dataset!')
            print('Check your format!')
            return 0, 0

    # Check if there are NAs in the dataset within the given time period.
    # If yes, drop those firms before doing the graphical analysis.
    if X.isnull().values.any():
        print('WARNING: Some firms have missing data during this time period!')
        print('Dropping firms: ')
        for Xcol_dropped in list(X.columns[X.isna().any()]):
            print(Xcol_dropped)
        X = X.dropna(axis='columns')
        print()

    # Get the start and end dates of the dataset
    date_obj = X.index[0]
    start_of_week = date_obj - timedelta(days=date_obj.weekday())
    start = start_of_week.strftime("%m/%d/%Y")
    end = X.index[-1].strftime("%m/%d/%Y")

    # Get the firm names of the dataset
    names = np.array(list(X.columns))

    # Show the number of firms examined
    print('Number of firms examined:', X.shape[1])

    # #########################################################################
    # Learn a graphical structure from the correlations.
    # Graphical Lasso is used here to estimate the precision matrix.
    edge_model = covariance.GraphicalLassoCV(max_iter=1000)
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X_std = X / X.std(axis=0)
    edge_model.fit(X_std)

    # #########################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #########################################################################
    # Find a low-dimension embedding for visualization: find the best position
    # of the nodes (the stocks) on a 2D plane
    node_position_model = manifold.MDS(n_components=2, random_state=0)
    embedding = node_position_model.fit_transform(X_std.T).T

    # #########################################################################
    # Visualization I
    # Specify node colors by cluster labels
    color_list = pl.cm.jet(np.linspace(0, 1, n_labels + 1))
    my_colors = [color_list[i] for i in labels]

    # Compute the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

    # Compute the edge values based on the partial correlations
    values = np.abs(partial_correlations[non_zero])
    val_max = values.max()

    # Title of the plot
    title = ('Graphical Network Analysis of Selected Firms over the Period '
             + start + ' to ' + end + ' (Weekly)')

    # Display the partial correlation graph
    graphicalAnalysis_plot(d, partial_correlations, my_colors, names, labels,
                           embedding, val_max, title)

    # The configuration of the plot
    plot_config = [d, partial_correlations, my_colors, names, labels,
                   embedding, val_max, title]

    # #########################################################################
    # Visualization II
    # For individual firm performance over the given period
    if display_IndRet:
        print('Individual Stock Performance over the Period ' + start +
              ' to ' + end + ' (Weekly):')
        l_r = int(np.ceil(len(names) / 4))
        l_c = 4
        f_hei = l_r * 2.5
        f_wid = l_c * 4
        ax = (X + 1).cumprod().plot(subplots=True, layout=(l_r, l_c),
                                    figsize=(f_wid, f_hei), logy=True,
                                    sharex=True, sharey=True, x_compat=True,
                                    color=my_colors)
        for i in range(l_c):
            ax[0, i].xaxis.set_tick_params(which='both', top=True,
                                           labeltop=True, labelrotation=40)
        plt.show()

    # #########################################################################
    # Show summary statistics for each firm over the given period
    if display_SumStat:
        display(getSumStat(X, rf=data_rf2['T-Bill']))

    return [edge_model.covariance_, edge_model.precision_], plot_config
def learn_network_structure(ts_returns_data, names, alphas=4, cv=5, mode='cd',
                            assume_centered=False, n_components=2,
                            n_neighbors=5, eigen_solver="dense",
                            method='standard', neighbors_algorithm="auto",
                            random_state=None, n_jobs=None, standardise=False):
    """
    Parameters
    ----------
    ts_returns_data : array-like of shape [n_samples, n_instruments]
        time series matrix of returns
    names : array-like of shape [n_instruments, 1]
        individual names of the financial instruments
    alphas : int or positive float, optional
        number of points on the grid to be used
    cv : int, optional
        number of folds for the cross-validation splitting strategy
    mode : str, optional
        solver to use to compute the graph
    assume_centered : bool, optional
        centre the data if False
    n_components : int
        number of components for the manifold
    n_neighbors : int
        number of neighbours to consider for each point
    eigen_solver : str
        algorithm to compute eigenvalues
    method : str
        algorithm to use for the local linear embedding
    neighbors_algorithm : str
        algorithm to use for the nearest-neighbours search
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used when eigen_solver == 'arpack'.
    n_jobs : int or None, optional
        number of parallel jobs to run
    standardise : bool
        standardise the data if True

    Returns
    -------
    sklearn.covariance.GraphicalLassoCV
    sklearn.manifold.LocallyLinearEmbedding
    array-like of shape [n_components, n_instruments]
        transformed embedding vectors
    array-like of shape [n_instruments, 1]
        numeric identifier of each cluster
    """
    if not isinstance(ts_returns_data, (np.ndarray, np.generic)):
        raise TypeError("ts_returns_data must be of class ndarray")

    # learn the graphical structure
    edge_model = covariance.GraphicalLassoCV(alphas=alphas, cv=cv, mode=mode,
                                             assume_centered=assume_centered)
    edge_model.fit(ts_returns_data)

    # cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # find a low-dimension embedding - useful for 2D plane visualisation
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=n_components, eigen_solver=eigen_solver,
        n_neighbors=n_neighbors, method=method,
        neighbors_algorithm=neighbors_algorithm,
        random_state=random_state, n_jobs=n_jobs)
    embedding = node_position_model.fit_transform(ts_returns_data.T).T

    if standardise:
        # standardise returns and redo the fit, clustering and embedding
        standard_ret = ts_returns_data.copy()
        standard_ret /= ts_returns_data.std(axis=0)

        # learn the graph model
        edge_model.fit(standard_ret)

        # cluster using affinity propagation
        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()
        for i in range(n_labels + 1):
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

        # find a low-dimension embedding - useful for 2D plane visualisation
        # (embed the standardised returns, matching the refitted graph model)
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=n_components, eigen_solver=eigen_solver,
            n_neighbors=n_neighbors, method=method,
            neighbors_algorithm=neighbors_algorithm,
            random_state=random_state, n_jobs=n_jobs)
        embedding = node_position_model.fit_transform(standard_ret.T).T

    return edge_model, node_position_model, embedding, labels
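# Usage sketch for learn_network_structure on synthetic returns; the ticker
# names, shapes and seed are illustrative assumptions:
import numpy as np

rng = np.random.RandomState(42)
returns = rng.randn(250, 10)                     # 250 days, 10 instruments
tickers = np.array(['A%d' % i for i in range(10)])
model, pos_model, emb, lbls = learn_network_structure(returns, tickers)
print(emb.shape)  # (2, 10)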
def _visualize(self, names, close_prices, open_prices):
    # The daily variations of the quotes are what carry most information
    variation = close_prices - open_prices

    # Set NaN values to 0. The GraphicalLassoCV call below will then raise
    # some divide-by-zero RuntimeWarnings, but it still goes through.
    variation[np.isnan(variation)] = 0

    # #########################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphicalLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    # #########################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_,
                                             random_state=0)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #########################################################################
    # Find a low-dimension embedding for visualization: find the best position
    # of the nodes (the stocks) on a 2D plane.
    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)
    embedding = node_position_model.fit_transform(X.T).T

    # #########################################################################
    # Visualization
    # Enable Chinese font support in matplotlib
    plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False    # display minus signs correctly
    plt.figure(1, facecolor='w', figsize=(15, 12))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels,
                cmap=plt.cm.nipy_spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #     linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels.
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp())
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    plt.show()
def index():
    validation_file_name = request.args.get("file")
    result_id = request.args.get("id")
    if result_id is None:
        return redirect('../fs/val/config')

    r = UserData.get_result_from_id(result_id)
    if r is None:
        return abort(403)

    col_overlapped = r['col_overlapped'].split(',')
    col_selected_method = r['col_selected_method'].split(',')
    filename = r['filename']
    if col_overlapped is None and col_selected_method is None:
        return redirect('/an')

    col_m1 = r['col_method1'].split(',')
    col_m2 = r['col_method2'].split(',')
    col_m3 = r['col_method3'].split(',')
    method_names = r['fs_methods'].split(',')
    col_mo = list(dict.fromkeys(col_overlapped + col_selected_method))

    disease_file_path = VALIDATION_PATH / validation_file_name
    # gene_card_df = PreProcess.getDF(disease_file_path)
    gene_card_df = PreProcess.get_gene_card_df(disease_file_path)
    col_gene_card = gene_card_df.columns.tolist()

    col_m1_gene_card = get_overlap_features(col_gene_card, col_m1)
    col_m2_gene_card = get_overlap_features(col_gene_card, col_m2)
    col_m3_gene_card = get_overlap_features(col_gene_card, col_m3)

    data_available = 1
    if not col_m1_gene_card and not col_m2_gene_card and not col_m3_gene_card:
        data_available = 0

    col_dist_gene_card = get_overlap_features(col_gene_card, col_mo)
    dis_gene_card = gene_card_df[col_dist_gene_card]
    col_gene_card = [col_m1_gene_card, col_m2_gene_card, col_m3_gene_card,
                     col_mo, col_dist_gene_card]
    venn_data = FeatureSelection.venn_diagram_data(col_m1_gene_card,
                                                   col_m2_gene_card,
                                                   col_m3_gene_card)

    # Get gene info
    gene_info_path = GENE_INFO_PATH / "Homo_sapiens.gene_info"
    unique_genes = list(set(col_m1 + col_m2 + col_m3))
    gene_info_df = FeatureSelection.get_selected_gene_info(gene_info_path,
                                                           unique_genes)
    gene_info = gene_info_df.to_json(orient='index')
    gene_info = json.loads(gene_info)
    gene_name_list = list(gene_info_df.index)

    dis_gene_card = dis_gene_card.T
    dis_gene_card.columns.name = dis_gene_card.index.name
    dis_gene_card.index.name = None
    dis_gene_card = dis_gene_card.sort_values(by='Relevance score',
                                              ascending=False)

    # Network creation
    file_path = USER_PATH / str(g.user['id']) / filename
    df = PreProcess.getDF(file_path)
    universal_col = set(col_m1).union(set(col_m2), set(col_m3))
    # df = df.drop(['class'], axis=1)
    df = df[list(universal_col)]  # pandas does not accept a set as an indexer
    names = df.columns.values
    X = df.values
    X /= X.std(axis=0)

    edge_model = covariance.GraphicalLassoCV()
    edge_model.fit(X)

    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        if name in col_mo:
            node.append({"id": i, "label": name, "group": 2, "color": 'red'})
        else:
            node.append({"id": i, "label": name, "group": 0, "color": 'green'})
    # print(node)

    edges = []
    for x in range(len(start_idx)):
        edges.append({"from": str(start_idx[x]), "to": str(end_idx[x])})
    # print(edges)

    return render_template("validation/index.html", col_gene_card=col_gene_card,
                           method_names=method_names,
                           tables=[dis_gene_card.to_html(classes='data')],
                           venn_data=venn_data, filename=filename,
                           result_id=result_id, gene_info=gene_info,
                           gene_name_list=gene_name_list,
                           data_available=data_available,
                           node=node, edges=edges)
# %%
# .. _stock_market:
#
# Learning a graph structure
# --------------------------
#
# We use sparse inverse covariance estimation to find which quotes are
# correlated conditionally on the others. Specifically, sparse inverse
# covariance gives us a graph, that is, a list of connections. For each
# symbol, the symbols that it is connected to are those useful in explaining
# its fluctuations.
from sklearn import covariance

alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# standardize the time series: we use correlations rather than covariance,
# as the former is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# %%
# Clustering using affinity propagation
# -------------------------------------
#
# We use clustering to group together quotes that behave similarly. Here,
# amongst the :ref:`various clustering techniques <clustering>` available
# in scikit-learn, we use :ref:`affinity_propagation` as it does not
# enforce equal-size clusters, and it can automatically choose the number
# of clusters from the data.
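# A short sketch of how the "list of connections" described above can be read
# off the fitted model; the 0.02 threshold mirrors the other snippets in this
# file and is an assumption, not part of the excerpt:
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
for i, j in zip(*np.where(non_zero)):
    print('edge %d -- %d' % (i, j))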