Example #1
def get_network():
    path = USER_PATH / str(g.user["id"])

    list_names = [f for f in os.listdir(path) if os.path.isfile((path / f))]

    file_name = request.form.get("file_name")
    file_path = path / file_name
    df = PreProcess.getDF(file_path)

    error = check_df(df)

    if error:
        return render_template("network/index.html",
                               error=error,
                               filename=file_name,
                               all_names=list_names)

    # df = df.iloc[:, 0:350]

    names = df.columns.values
    X = df.values.copy()
    X /= X.std(axis=0)

    warnings.filterwarnings("ignore", category=RuntimeWarning)

    edge_model = covariance.GraphicalLassoCV(tol=1e-3)
    edge_model.fit(X)

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    # Rescale the precision matrix to unit diagonal; the off-diagonal
    # entries are then, up to sign, the partial correlations.
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    # Keep only edges whose absolute partial correlation exceeds 0.02
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        node.append({"id": i, "label": name, "group": str(labels[i])})

    edges = []
    for start, end in zip(start_idx, end_idx):
        edges.append({"from": str(start), "to": str(end)})

    return render_template("network/index.html",
                           node=node,
                           edges=edges,
                           error=error,
                           filename=file_name,
                           all_names=list_names,
                           n_labels=n_labels)
Example #2
    def glasso_iso_cov(self):
        # read in isoform expression data
        # run glasso to get the adjusted isoform expression covariance matrix
        cur_tpm = pd.read_csv(self.gene + "_tpm.csv")
        cur_tpm.drop(cur_tpm.columns[[0]], axis=1, inplace=True)
        cur_tpm = np.transpose(cur_tpm.to_numpy())
        cur_tpm_std = (cur_tpm - np.mean(cur_tpm, axis=0)) / np.std(cur_tpm,
                                                                    axis=0)
        md = covariance.GraphicalLassoCV().fit(cur_tpm_std)
        self.iso_cov = md.covariance_

        # self.iso_cov = np.loadtxt(file)
        # print("isoform expression covariance matrix:")
        print(self.iso_cov)
Example #3
def test():
    df = PreProcess.getDF(TEST_FILE)
    df = df.drop(['class'], axis=1)
    df = df.iloc[:, 0:10]

    names = df.columns.values
    X = df.values

    edge_model = covariance.GraphicalLassoCV()
    edge_model.fit(X)

    # _, labels = cluster.affinity_propagation(edge_model.covariance_)
    # n_labels = labels.max()

    # for i in range(n_labels + 1):
    #     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # node_position_model = manifold.LocallyLinearEmbedding(
    #     n_components=2, eigen_solver='dense', n_neighbors=6)
    #
    # embedding = node_position_model.fit_transform(X.T).T

    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        node.append({"id": i, "label": name, "group": 1})

    # print(node)
    edges = []
    for start, end in zip(start_idx, end_idx):
        edges.append({"from": str(start), "to": str(end)})
    # print(edges)

    return render_template("network/index.html",
                           names=names,
                           start_idx=start_idx,
                           end_idx=end_idx,
                           node=node,
                           edges=edges)
Example #4
def fit(X):
    X /= X.std(axis=0)
    # sparse inverse covariance estimation
    edge_model = covariance.GraphicalLassoCV(cv=5)
    edge_model.fit(X)
    # #############################################################################
    # edge_model.covariance_ holds the estimated covariance matrix
    # cluster with affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    # #############################################################################
    # nonlinear dimensionality reduction
    node_position_model = manifold.LocallyLinearEmbedding(n_components=4, eigen_solver='dense',
                                                          n_neighbors=12)
    embedding = node_position_model.fit_transform(X.T).T
    partial_correlations = edge_model.precision_.copy()
    return labels, partial_correlations, embedding
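A minimal usage sketch for this fit helper (assuming the enclosing module provides import numpy as np and from sklearn import cluster, covariance, manifold, as the snippet implies; the data below is synthetic):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 20)  # 200 synthetic observations of 20 variables
labels, partial_correlations, embedding = fit(X)
print(labels.shape, partial_correlations.shape, embedding.shape)  # (20,) (20, 20) (4, 20)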
Example #5
    def x2cor(xarray, corr='pCor'):
        # #############################################################################
        # Learn a graphical structure from the correlations
        edge_model = covariance.GraphicalLassoCV()
        edge_model.fit(xarray)

        cov_correlations = edge_model.covariance_.copy()
        if corr == 'Cor':
            d = 1 / np.sqrt(np.diag(cov_correlations))
            non_zero = (np.abs(np.triu(cov_correlations, k=1)) > 0.5)
        else:
            # Display a graph of the partial correlations
            partial_correlations = edge_model.precision_.copy()
            d = 1 / np.sqrt(np.diag(partial_correlations))
            partial_correlations *= d
            partial_correlations *= d[:, np.newaxis]
            non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
            #values = np.abs(partial_correlations[non_zero])
        return non_zero, d, cov_correlations
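A brief usage sketch (assuming x2cor is called as a plain helper on a standardized samples-by-features array X; note that with the default corr='pCor' the edge mask comes from the partial correlations while the returned matrix is still the covariance):

non_zero, d, correlations = x2cor(X)              # edges from partial correlations (default)
non_zero, d, correlations = x2cor(X, corr='Cor')  # edges from thresholded covariances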
Example #6
        'https://raw.githubusercontent.com/mailtsjp/finpy/master/examples-data/'
        'financial-data/{}.csv')

    #  https://raw.githubusercontent.com/mailtsjp/Quandlproj1/master/examples-data/financial-data/WFC.csv

    quotes.append(pd.read_csv(url.format(symbol)))

close_prices = np.vstack([q['Close'] for q in quotes])
open_prices = np.vstack([q['Open'] for q in quotes])

# The daily variations of the quotes are what carry most information
variation = close_prices - open_prices

# #############################################################################
# Learn a graphical structure from the correlations
edge_model = covariance.GraphicalLassoCV()

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation

_, labels = cluster.affinity_propagation(edge_model.covariance_,
                                         random_state=0)
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
Example #7
def parCorrGlasso(dataset, kfolds=10):
    '''
    INPUT:
        dataset : a dataset with dimension [nDatapoints x nNodes]
        kfolds : number of folds for the cross-validation to select regularization parameter alpha
    OUTPUT:
        Mreg : matrix of regularized (shrunken) partial correlation coefficients
        Mmod : matrix of partial correlations constrained to the glasso model
        condSetSizeMat : a matrix with the conditioning set sizes for the Mmod partial correlations
                        note: this is needed to compute the Fisher z-transform for the significance tests
    '''
    D = dataset
    #in regularized methods it is necessary to standardize the data
    D = stats.zscore(D,axis=0)
    #estimate the empirical covariance
    emp_cov = np.cov(D,rowvar=False)
    #define the glasso model with cross-validation
    glasso = covariance.GraphicalLassoCV(cv=kfolds)
    #fit the model to the data and cross-validate to get the
    #best regularization parameter
    glasso.fit(D)
    #get the regularized precision matrix (inverse covariance)
    prec_mat = glasso.precision_
    #transform into regularized partial correlation coefficients
    ##https://en.wikipedia.org/wiki/Partial_correlation#Using_matrix_inversion
    denom = np.atleast_2d(1. / np.sqrt(np.diag(prec_mat)))
    Mreg = -prec_mat * denom * denom.T
    #make the diagonal zero.
    np.fill_diagonal(Mreg,0)
    
    
    #use the connectivity model defined by glasso to compute the partial correlations between two connected nodes.
    #this is done to get non-shrinked partial correlations.
    #For adjacent nodes x and y, compute the partial correlation of x and y conditional on Z = adj(x) ∪ adj(y),
    #where adj(x) is the set of adjacent nodes to x in the glasso model Gm.
    
    #glasso model = non-zero entries in Mreg, ie. edges in the connectivity model
    Gm = Mreg != 0
    nNodes = Gm.shape[0]
    #allocate memory
    Mmod = np.zeros((nNodes,nNodes))
    condSetSizeMat = np.zeros((nNodes,nNodes))
    #iterate through each pair of nodes
    for x in range(nNodes-1):
        for y in range(x+1,nNodes):
            #if x and y are adjacent
            if Gm[x,y] != 0:
                #get adjacencies indices
                adj_x = np.argwhere(Gm[x,:] != 0)
                adj_y = np.argwhere(Gm[y,:] != 0)
                #get the union of adj(x) and adj(y)
                Z = np.union1d(adj_x,adj_y)
                #remove x and y from Z
                Z = Z[Z!=x]
                Z = Z[Z!=y]
                #and put them back at the beginning of Z
                Z = np.insert(Z,0,y)
                Z = np.insert(Z,0,x)
                #define a new dataset only including nodes x & y & adj(x) & adj(y)
                newD = D[:,Z]
                #compute the partial correlation for the new dataset but only get Mxy element
                #x and y will always be in the 0 and 1 positions.
                pc_xyz = parCorrInvCov(newD)[0,1]
                #is a symmetric matrix
                Mmod[x,y] = Mmod[y,x] = pc_xyz
                #get the size of the conditioning set for x and y
                condSetSizeMat[x,y] = condSetSizeMat[y,x] = len(Z)-2
                
    
    
    return Mreg, Mmod, condSetSizeMat
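The helper parCorrInvCov referenced above is not shown in this example. A minimal sketch of what it plausibly does, mirroring the Mreg construction (partial correlations via matrix inversion of the empirical covariance; the function name and exact behaviour are assumptions):

import numpy as np

def parCorrInvCov(D):
    # Hypothetical helper assumed by parCorrGlasso above: partial correlation
    # coefficients r_xy = -theta_xy / sqrt(theta_xx * theta_yy), where theta
    # is the inverse of the empirical covariance of D.
    prec_mat = np.linalg.inv(np.cov(D, rowvar=False))
    denom = np.atleast_2d(1. / np.sqrt(np.diag(prec_mat)))
    M = -prec_mat * denom * denom.T
    np.fill_diagonal(M, 0)  # zero the diagonal, matching Mreg above
    return M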
Example #8
def graphicalAnalysis(dataset,
                      start_date='2000-01-01',
                      end_date='2020-05-31',
                      Sectors_chosen=[],
                      drop_firm=[],
                      display_SumStat=True,
                      display_IndRet=True,
                      data_rf=df_rf):

    # Check that the input dates are valid
    if (datetime.strptime(start_date, "%Y-%m-%d") > datetime.strptime(
            end_date, "%Y-%m-%d")):
        print(
            'ERROR: Revision needed! The entered \"start_date\" should be before \"end_date\".'
        )
        return 0, 0
    if (dataset.index[0] - timedelta(days=dataset.index[0].weekday()) >
            datetime.strptime(start_date, "%Y-%m-%d")):
        print(
            'WARNING: the entered \"start_date\" is outside of the range for the given dataset.'
        )
        print(
            'The \"start_date\" is adjusted to the earliest start_date, i.e. ',
            (dataset.index[0] -
             timedelta(days=dataset.index[0].weekday())).strftime("%Y-%m-%d"))
        print()
    if (dataset.index[-1] < datetime.strptime(end_date, "%Y-%m-%d")):
        print(
            'WARNING: the entered \"end_date\" is outside of the range for the given dataset.'
        )
        print('The \"end_date\" is adjusted to the latest end_date, i.e. ',
              dataset.index[-1].strftime("%Y-%m-%d"))
        print()

    # Extract the data for the given time period
    temp = dataset[dataset.index >= start_date].copy()
    X = temp[temp.index <= end_date].copy()
    temp = data_rf[data_rf.index >= start_date].copy()
    data_rf2 = temp[temp.index <= end_date].copy()

    # Check if we are using all sectors or dropping some sector
    if Sectors_chosen:
        if (all([(s in firms_info.Sector.unique()) for s in Sectors_chosen])):
            f_in_sector_chosen = []
            for s in Sectors_chosen:
                f_in_sector_chosen += list(
                    firms_info[firms_info.Sector == s].index)
            X = X[f_in_sector_chosen]
            print('Sectors chosen in the Graphical Analysis are:')
            print(Sectors_chosen)
            print()
        else:
            print(
                'ERROR: Revision needed! At least one Sector entered in the \"Sectors_chosen\" option is NOT in the dataset!'
            )
            print('Check your format!')
            return 0, 0

    # Check if we are using all firm or dropping some firms
    if drop_firm:
        if (all([(f in X.columns) for f in drop_firm])):
            print('The following Firms are dropped:')
            print(drop_firm)
            print()
            X.drop(columns=drop_firm, inplace=True)
        else:
            print(
                'ERROR: Revision needed! At least one firm entered in the \"drop_firm\" option is NOT in the dataset!'
            )
            print('Check your format!')
            return 0, 0

    # Check if there is NA in the dataset within the given time period
    # If yes, then drop those firms before doing graphical analysis
    if (X.isnull().values.any()):
        print('WARNING: Some firms have missing data during this time period!')
        print('Dropping firms: ')
        for Xcol_dropped in list(X.columns[X.isna().any()]):
            print(Xcol_dropped)
        X = X.dropna(axis='columns')
        print()

    # Get the Start and End date of the dataset
    date_obj = X.index[0]
    start_of_week = date_obj - timedelta(days=date_obj.weekday())
    start = start_of_week.strftime("%m/%d/%Y")
    end = X.index[-1].strftime("%m/%d/%Y")

    # Get the firm names of the dataset
    names = np.array(list(X.columns))

    # Show the number of firms examined
    print('Number of firms examined:', X.shape[1])

    # #############################################################################
    # Learn a graphical structure from the correlations

    # Graphical Lasso is used here to estimate the precision matrix
    edge_model = covariance.GraphicalLassoCV(max_iter=1000)

    # standardize the time series:
    # using correlations rather than covariance is more efficient for structure recovery
    X_std = X / X.std(axis=0)
    edge_model.fit(X_std)

    # #############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    node_position_model = manifold.MDS(n_components=2, random_state=0)
    embedding = node_position_model.fit_transform(X_std.T).T

    # #############################################################################
    # Visualization I

    # Specify node colors by cluster labels
    color_list = pl.cm.jet(np.linspace(0, 1, n_labels + 1))
    my_colors = [color_list[i] for i in labels]

    # Compute the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Compute the edge values based on the partial correlations
    values = np.abs(partial_correlations[non_zero])
    val_max = values.max()

    # Title of the plot
    title = 'Graphical Network Analysis of Selected Firms over the Period ' + start + ' to ' + end + ' (Weekly)'

    # Display the partial correlation graph
    graphicalAnalysis_plot(d, partial_correlations, my_colors, names, labels,
                           embedding, val_max, title)

    # The configuration of the plot
    plot_config = [
        d, partial_correlations, my_colors, names, labels, embedding, val_max,
        title
    ]

    # #############################################################################
    # Visualization II

    # For individual firm performance over the given period
    if (display_IndRet):
        print('Individual Stock Performance over the Period ' + start +
              ' to ' + end + ' (Weekly):')
        l_r = int(np.ceil(len(names) / 4))
        l_c = 4
        f_hei = l_r * 2.5
        f_wid = l_c * 4
        ax = (X + 1).cumprod().plot(subplots=True,
                                    layout=(l_r, l_c),
                                    figsize=(f_wid, f_hei),
                                    logy=True,
                                    sharex=True,
                                    sharey=True,
                                    x_compat=True,
                                    color=my_colors)
        for i in range(l_c):
            ax[0, i].xaxis.set_tick_params(which='both',
                                           top=True,
                                           labeltop=True,
                                           labelrotation=40)
        plt.show()

    # #############################################################################
    # Show summary statistics for each firm over the given period
    if (display_SumStat):
        display(getSumStat(X, rf=data_rf2['T-Bill']))

    return [edge_model.covariance_, edge_model.precision_], plot_config
Example #9
def learn_network_structure(ts_returns_data,
                            names,
                            alphas=4,
                            cv=5,
                            mode='cd',
                            assume_centered=False,
                            n_components=2,
                            n_neighbors=5,
                            eigen_solver="dense",
                            method='standard',
                            neighbors_algorithm="auto",
                            random_state=None,
                            n_jobs=None,
                            standardise=False):
    """

    Parameters
    ----------
    ts_returns_data : array-like of shape [n_samples, n_instruments]
        Time series matrix of returns.

    names : array-like of shape [n_instruments, 1]
        Individual names of the financial instruments.

    alphas : int or positive float, optional
        Number of points on the grids to be used.

    cv : int, optional
        Number of folds for the cross-validation splitting strategy.

    mode : str, optional
        Solver to use to compute the graph.

    assume_centered : bool, optional
        If False, centre the data before computation.

    n_components : int
        Number of components for the manifold.

    n_neighbors : int
        Number of neighbours to consider for each point.

    eigen_solver : str
        Algorithm to compute eigenvalues.

    method : str
        Algorithm to use for the local linear embedding.

    neighbors_algorithm : str
        Algorithm to use for the nearest-neighbours search.

    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used when eigen_solver == 'arpack'.

    n_jobs : int or None, optional
        Number of parallel jobs to run.

    standardise : bool
        Standardise the data if True.

    Returns
    -------
    edge_model : sklearn.covariance.GraphicalLassoCV

    node_position_model : sklearn.manifold.LocallyLinearEmbedding

    embedding : array-like of shape [n_components, n_instruments]
        Transformed embedding vectors.

    labels : array-like of shape [n_instruments, 1]
        Numeric identifier of each cluster.
    """

    if not isinstance(ts_returns_data, (np.ndarray, np.generic)):
        raise TypeError("ts_returns_data must be of class ndarray")

    # learn graphical structure
    edge_model = covariance.GraphicalLassoCV(alphas=alphas,
                                             cv=cv,
                                             mode=mode,
                                             assume_centered=assume_centered)
    edge_model.fit(ts_returns_data)

    # cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # find low-dimension embedding - useful for 2D plane visualisation
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=n_components,
        eigen_solver=eigen_solver,
        n_neighbors=n_neighbors,
        method=method,
        neighbors_algorithm=neighbors_algorithm,
        random_state=random_state,
        n_jobs=n_jobs)
    embedding = node_position_model.fit_transform(ts_returns_data.T).T

    if standardise:
        # standardise returns
        standard_ret = ts_returns_data.copy()
        standard_ret /= ts_returns_data.std(axis=0)

        # learn graph model
        edge_model.fit(standard_ret)

        # cluster using affinity propagation
        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()
        for i in range(n_labels + 1):
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

        # find low-dimension embedding - useful for 2D plane visualisation
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=n_components,
            eigen_solver=eigen_solver,
            n_neighbors=n_neighbors,
            method=method,
            neighbors_algorithm=neighbors_algorithm,
            random_state=random_state,
            n_jobs=n_jobs)
        embedding = node_position_model.fit_transform(standard_ret.T).T  # embed the standardised returns

    return edge_model, node_position_model, embedding, labels
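A minimal usage sketch with synthetic returns (the instrument names below are invented):

import numpy as np

rng = np.random.RandomState(0)
returns = rng.randn(250, 12)  # 250 days of returns for 12 instruments
names = np.array(['inst%02d' % i for i in range(12)])
edge_model, lle_model, embedding, labels = learn_network_structure(returns, names)
print(embedding.shape)  # (2, 12): one 2-D coordinate per instrument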
Example #10
    def _visualize(self, names, close_prices, open_prices):
        # The daily variations of the quotes are what carry most information
        variation = close_prices - open_prices
        # Set NaN values to 0; the GraphicalLassoCV call below will emit some divide-by-zero RuntimeWarnings, but it still runs
        variation[np.isnan(variation)] = 0
        # #############################################################################
        # Learn a graphical structure from the correlations
        edge_model = covariance.GraphicalLassoCV()
        # standardize the time series: using correlations rather than covariance
        # is more efficient for structure recovery
        X = variation.copy().T
        X /= X.std(axis=0)
        edge_model.fit(X)
        # #############################################################################
        # Cluster using affinity propagation
        _, labels = cluster.affinity_propagation(edge_model.covariance_,
                                                 random_state=0)
        n_labels = labels.max()
        for i in range(n_labels + 1):
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
        # #############################################################################
        # Find a low-dimension embedding for visualization: find the best position of
        # the nodes (the stocks) on a 2D plane
        # We use a dense eigen_solver to achieve reproducibility (arpack is
        # initiated with random vectors that we don't control). In addition, we
        # use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)
        embedding = node_position_model.fit_transform(X.T).T
        # #############################################################################
        # Visualization
        # Matplotlib configuration to display Chinese text
        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
        plt.figure(1, facecolor='w', figsize=(15, 12))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')
        # Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
        # Plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0],
                    embedding[1],
                    s=100 * d**2,
                    c=labels,
                    cmap=plt.cm.nipy_spectral)
        # Plot the edges
        start_idx, end_idx = np.where(non_zero)
        # a sequence of (*line0*, *line1*, *line2*), where
        #            linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0,
                            cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)
        # Add a label to each node. The challenge here is that we want to
        # position the labels to avoid overlap with other labels
        for index, (name, label,
                    (x, y)) in enumerate(zip(names, labels, embedding.T)):

            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            plt.text(x,
                     y,
                     name,
                     size=10,
                     horizontalalignment=horizontalalignment,
                     verticalalignment=verticalalignment,
                     bbox=dict(facecolor='w',
                               edgecolor=plt.cm.nipy_spectral(label /
                                                              float(n_labels)),
                               alpha=.6))
        plt.xlim(
            embedding[0].min() - .15 * embedding[0].ptp(),
            embedding[0].max() + .10 * embedding[0].ptp(),
        )
        plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                 embedding[1].max() + .03 * embedding[1].ptp())
        plt.show()
Example #11
def index():
    validation_file_name = request.args.get("file")
    result_id = request.args.get("id")

    if result_id is None:
        return redirect('../fs/val/config')

    r = UserData.get_result_from_id(result_id)
    if r is None:
        return abort(403)

    col_overlapped = r['col_overlapped'].split(',')
    col_selected_method = r['col_selected_method'].split(',')
    filename = r['filename']

    if col_overlapped is None and col_selected_method is None:
        return redirect('/an')

    col_m1 = r['col_method1'].split(',')
    col_m2 = r['col_method2'].split(',')
    col_m3 = r['col_method3'].split(',')
    method_names = r['fs_methods'].split(',')

    col_mo = list(dict.fromkeys(col_overlapped + col_selected_method))

    disease_file_path = VALIDATION_PATH / validation_file_name

    # gene_card_df = PreProcess.getDF(disease_file_path)  # replaced by get_gene_card_df
    gene_card_df = PreProcess.get_gene_card_df(disease_file_path)

    col_gene_card = gene_card_df.columns.tolist()

    col_m1_gene_card = get_overlap_features(col_gene_card, col_m1)
    col_m2_gene_card = get_overlap_features(col_gene_card, col_m2)
    col_m3_gene_card = get_overlap_features(col_gene_card, col_m3)

    data_available = 1
    if not col_m1_gene_card and not col_m2_gene_card and not col_m3_gene_card:
        data_available = 0

    col_dist_gene_card = get_overlap_features(col_gene_card, col_mo)

    dis_gene_card = gene_card_df[col_dist_gene_card]

    col_gene_card = [col_m1_gene_card, col_m2_gene_card, col_m3_gene_card, col_mo, col_dist_gene_card]

    venn_data = FeatureSelection.venn_diagram_data(col_m1_gene_card, col_m2_gene_card, col_m3_gene_card)

    # Get gene info
    gene_info_path = GENE_INFO_PATH / "Homo_sapiens.gene_info"
    unique_genes = list(set(col_m1 + col_m2 + col_m3))

    gene_info_df = FeatureSelection.get_selected_gene_info(gene_info_path, unique_genes)
    gene_info = gene_info_df.to_json(orient='index')

    gene_info = json.loads(gene_info)

    gene_name_list = list(gene_info_df.index)

    dis_gene_card = dis_gene_card.T
    dis_gene_card.columns.name = dis_gene_card.index.name
    dis_gene_card.index.name = None
    dis_gene_card = dis_gene_card.sort_values(by='Relevance score', ascending=False)

    # Create the network
    file_path = USER_PATH / str(g.user['id']) / filename

    df = PreProcess.getDF(file_path)
    universal_col = set(col_m1).union(set(col_m2), set(col_m3))
    # df = df.drop(['class'], axis=1)
    df = df[list(universal_col)]  # pandas requires a list, not a set, for column selection

    names = df.columns.values
    X = df.values
    X /= X.std(axis=0)

    edge_model = covariance.GraphicalLassoCV()
    edge_model.fit(X)

    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    start_idx, end_idx = np.where(non_zero)

    node = []
    for i, name in enumerate(names):
        if name in col_mo:
            node.append({"id": i, "label": name, "group": 2, "color": 'red'})
        else:
            node.append({"id": i, "label": name, "group": 0, "color": 'green'})

    # print(node)
    edges = []
    for start, end in zip(start_idx, end_idx):
        edges.append({"from": str(start), "to": str(end)})
    # print(edges)

    return render_template("validation/index.html", col_gene_card=col_gene_card, method_names=method_names,
                           tables=[dis_gene_card.to_html(classes='data')], venn_data=venn_data, filename=filename,
                           result_id=result_id, gene_info=gene_info, gene_name_list=gene_name_list,
                           data_available=data_available, node=node, edges=edges)
Example #12
# %%
# .. _stock_market:
#
# Learning a graph structure
# --------------------------
#
# We use sparse inverse covariance estimation to find which quotes are
# correlated conditionally on the others. Specifically, sparse inverse
# covariance gives us a graph, that is, a list of connections. For each
# symbol, the symbols it is connected to are those most useful in
# explaining its fluctuations.

from sklearn import covariance

alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# %%
# Clustering using affinity propagation
# -------------------------------------
#
# We use clustering to group together quotes that behave similarly. Here,
# amongst the :ref:`various clustering techniques <clustering>` available
# in scikit-learn, we use :ref:`affinity_propagation` as it does
# not enforce equal-size clusters, and it can automatically choose the