Пример #1
0
    def __init__(self, datafile, outdir="./"):
        """
        Load the data table and keep only rows and columns with non-zero sums.

        ``outdir`` is created through ``uf.create_dir``; ``datafile`` is read
        with ``get_data`` and the filtered matrix is stored in ``self.data``.
        ``self.cells`` and ``self.genes`` are stored exactly as ``get_data``
        returns them (they are not filtered here). No log file is created by
        this variant; progress is printed to stdout.
        """
        self.tablename = datafile
        self.outdir = outdir
        uf.create_dir(self.outdir)

        print("Reading data from {}".format(datafile))
        matrix, self.cells, self.genes = get_data(datafile)

        # Both masks are taken from the unfiltered matrix: rows whose sum is
        # zero are dropped first, then columns whose sum is zero.
        # NOTE(review): the `[:, mask]` indexing presumes a NumPy-style array
        # rather than a pandas DataFrame - confirm against get_data.
        keep_rows = matrix.sum(axis=1) != 0
        keep_cols = matrix.sum(axis=0) != 0
        self.data = matrix[keep_rows][:, keep_cols]
        print("Finish reading data from {}".format(datafile))
Пример #2
0
 def __init__(self, datafile, outdir="./", log=None):
     """
     Set up the output directory and logger, then load the data table.

     Parameters:
         `datafile` Path handed to ``get_data``; also stored as
                    ``self.tablename`` and used to name the default log.
         `outdir`   Output directory, created through ``uf.create_dir``.
         `log`      Log file name inside ``outdir``. When falsy, the name is
                    ``<basename of datafile>_<uf.now()>.log``.

     ``self.data`` keeps only the rows and columns of the loaded table whose
     sums are non-zero.
     """
     self.outdir = outdir
     self.tablename = datafile
     uf.create_dir(self.outdir)
     if log:
         self.log = os.path.join(self.outdir, log)
     else:
         default_name = "{}_{}.log".format(os.path.basename(datafile),
                                           uf.now())
         self.log = os.path.join(self.outdir, default_name)
     self.logger = uf.create_logger(self.log)
     # The log messages must use `datafile`: there is no `data` name in this
     # scope, so the previous `.format(data, ...)` raised NameError.
     self.logger.info("start reading data from {}, log file is {}".format(
         datafile, self.log))
     df = get_data(datafile)
     # Drop all-zero rows and all-zero columns in one label-based selection.
     self.data = df.loc[df.sum(axis=1) != 0, df.sum(axis=0) != 0]
     self.logger.info("finish reading data from {}".format(datafile))
Пример #3
0
 def __init__(self, data, outdir='./', log=None):
     """
     Prepare the output directory and logger, then load the data table.

     Parameters:
         `data`   Path handed to ``get_data``; also kept as
                  ``self.tablename`` and used to name the default log.
         `outdir` Output directory, created through ``uf.create_dir``.
         `log`    Log file name inside ``outdir``. When falsy, the name is
                  ``<basename of data>_<uf.now()>.log``.

     ``self.data`` holds the loaded table restricted to rows and columns
     whose sums are non-zero; ``self.csn`` starts as None.
     """
     self.tablename = data
     self.outdir = outdir
     uf.create_dir(self.outdir)

     if log:
         log_path = os.path.join(self.outdir, log)
     else:
         log_name = '{}_{}.log'.format(os.path.basename(data), uf.now())
         log_path = os.path.join(self.outdir, log_name)
     self.log = log_path
     self.logger = uf.create_logger(self.log)

     start_msg = 'start reading data from {}, log file is {}'.format(
         data, self.log)
     self.logger.info(start_msg)

     table = get_data(data)
     # Both masks come from the unfiltered table.
     nonzero_rows = table.sum(axis=1) != 0
     nonzero_cols = table.sum(axis=0) != 0
     self.data = table.loc[nonzero_rows, nonzero_cols]
     self.csn = None
     self.logger.info('finish reading data from {}'.format(data))
Пример #4
0
 def __init__(self, data, outdir="./", log=None):
     """
     Prepare the output directory and logger, then load the data table.

     Parameters:
         `data`   Path handed to ``get_data``; also kept as
                  ``self.tablename`` and used to name the default log.
         `outdir` Output directory, created through ``uf.create_dir``.
         `log`    Log file name inside ``outdir``. When falsy, the name is
                  ``<basename of data>_<uf.now()>.log``.

     ``self.data`` holds the loaded table restricted to rows and columns
     whose sums are non-zero; ``self.csn`` starts as None.
     """
     self.tablename = data
     self.outdir = outdir
     uf.create_dir(self.outdir)

     if log:
         log_path = os.path.join(self.outdir, log)
     else:
         log_name = "{}_{}.log".format(os.path.basename(data), uf.now())
         log_path = os.path.join(self.outdir, log_name)
     self.log = log_path
     self.logger = uf.create_logger(self.log)

     start_msg = "start reading data from {}, log file is {}".format(
         data, self.log)
     self.logger.info(start_msg)

     # Gene expression matrix (TPM/FPKM/RPKM/count), rows = genes,
     # columns = cells, or an OTU table.
     table = get_data(data)
     nonzero_rows = table.sum(axis=1) != 0
     nonzero_cols = table.sum(axis=0) != 0
     self.data = table.loc[nonzero_rows, nonzero_cols]
     self.csn = None
     self.logger.info("finish reading data from {}".format(data))
Пример #5
0
    def csnet(self,
              cells=None,
              alpha=0.01,
              boxsize=0.1,
              edgeW=0,
              kk=0,
              dlimit=5,
              to_csv=0,
              average=1,
              *args,
              **kwargs):
        """
        Construct the cell-specific network (CSN) for the specified cells.

        For every row of ``self.data`` a per-cell neighborhood
        ``[lower, upper]`` is derived from the sorted values of that row.
        For each requested cell a 0/1 "in the box" matrix is built, a
        gene-by-gene statistic is computed from it and thresholded at the
        upper ``alpha`` quantile of the standard normal distribution.

            Parameters:
                `cells`   Forwarded to ``self.get_cells``. Documented as: None
                          (Default) builds the CSNs for all cells, otherwise
                          pass a list of cells.
                `alpha`   Significance level (e.g. 0.001, 0.01, 0.05 ...),
                          larger alpha leads to more edges, Default = 0.01
                `boxsize` Size of neighborhood, Default = 0.1
                `edgeW`   Accepted for compatibility; not used by this method.
                `kk`      Number of conditional genes. When kk=0 the method
                          is plain CSN.
                `dlimit`  Minimum degree limitation of conditional genes
                          (forwarded to ``condition_g``).
                `to_csv`  When truthy, each network is written to
                          ``<outdir>/cellnws/<cell>.nw.csv``.
                `average` Truthy: conditional networks are summed and divided
                          by kk, and a gene only counts as "in the box" when
                          the target cell expresses it (> 0). Falsy: the
                          conditional networks are intersected (element-wise
                          product) and the > 0 requirement is dropped.

            Returns:
                None. Results are stored on the instance:
                ``self.csn`` (dict: cell -> gene x gene DataFrame),
                ``self.upper`` and ``self.lower`` (neighborhood bounds).
                Cells whose network has no link are logged and skipped.
            Notes:
                Too many cells or genes may lead to out of memory.
                NumPy indices start from 0.
                Every new cell currently requires recomputing the lower/upper
                bound matrices together with all existing cells (each gene is
                sorted again). With a large enough reference set the stored
                bounds could be reused for a new cell instead.
        """
        self.logger.info('start construction cell-specific network ')
        data = self.data
        nr, nc = data.shape
        values = data.values

        # Neighborhood bounds, computed positionally on plain arrays so that
        # no label lookups are needed inside the loops.
        upper_arr = np.zeros((nr, nc))
        lower_arr = np.zeros((nr, nc))
        for i in range(nr):
            order = np.argsort(values[i], kind='stable')
            s1 = values[i][order]
            n1 = np.sign(s1).sum()
            n0 = nc - n1
            h = int(round(boxsize * np.sqrt(n1)))
            pos = 0
            while pos < nc:
                # s counts how many following sorted values tie with s1[pos];
                # tied cells always share the same bounds.
                s = 0
                while pos + s + 1 < nc and s1[pos + s + 1] == s1[pos]:
                    s += 1
                run = order[pos:pos + s + 1]
                if s >= h:
                    upper_arr[i, run] = s1[pos]
                    lower_arr[i, run] = s1[pos]
                else:
                    upper_arr[i, run] = s1[int(min(nc - 1, pos + s + h))]
                    lower_arr[i, run] = s1[int(
                        max(n0 * (n0 > h), pos - h))]
                pos += s + 1
        upper = pd.DataFrame(upper_arr,
                             columns=data.columns,
                             index=data.index)
        lower = pd.DataFrame(lower_arr,
                             columns=data.columns,
                             index=data.index)

        self.logger.info(
            'finish caculate the neighborhood of each gene for each cell')
        cells = self.get_cells(cells=cells)

        csn = dict()
        p = -stats.norm.ppf(q=alpha, loc=0, scale=1)
        eps = np.spacing(1)
        # Divisor for the averaged network. Kept separate from `kk` so the
        # parameter is never mutated: overwriting kk with 1 inside the loop
        # used to push every later cell into the conditional branch.
        n_cond = kk if kk != 0 else 1

        for cell in cells:
            up_k = upper.loc[:, cell].values[:, None]
            lo_k = lower.loc[:, cell].values[:, None]
            in_box = (values <= up_k) & (values >= lo_k)
            if average:
                in_box &= (data.loc[:, cell].values > 0)[:, None]
            # Float 0/1 matrix: avoids integer overflow in the products below.
            B = in_box.astype(float)

            a = B.sum(axis=1)
            aa = np.outer(a, a)
            stat = (B @ B.T * nc - aa) / np.sqrt(
                aa * np.outer(nc - a, nc - a) / (nc - 1) + eps)
            csnlink = pd.DataFrame((stat > p) * 1,
                                   index=data.index,
                                   columns=data.index)
            if csnlink.sum().sum() == 0:
                self.logger.info('no genes in Cell {} has a link'.format(cell))
                continue

            if kk != 0:
                cond_ids = condition_g(csnlink, kk=kk, dlimit=dlimit)
                acc = np.zeros((nr, nr)) if average else np.ones((nr, nr))
                for m in range(kk):
                    z_row = B[cond_ids[m], :]
                    idc = np.flatnonzero(z_row != 0)
                    # Restrict to the cells in which conditional gene z is
                    # "in the box".
                    B_z = (B * z_row)[:, idc]
                    r = B_z.shape[1]
                    a_z = B_z.sum(axis=1)
                    aa_z = np.outer(a_z, a_z)
                    stat_z = (B_z @ B_z.T * r - aa_z) / np.sqrt(
                        aa_z * np.outer(r - a_z, r - a_z) / (r - 1) + eps)
                    link_z = (stat_z > p) * 1
                    acc = acc + link_z if average else acc * link_z
                # Label the result like the unconditional network; the old
                # RangeIndex accumulator misaligned with gene-labelled frames.
                csnlink = pd.DataFrame(acc,
                                       index=data.index,
                                       columns=data.index)

            csnlink = csnlink / n_cond if average else csnlink
            csn[cell] = csnlink
            if to_csv:
                filename = os.path.join(self.outdir, 'cellnws',
                                        '{}.nw.csv'.format(cell))
                uf.create_dir(self.outdir + '/cellnws')
                csn[cell].to_csv(path_or_buf=filename)
            self.logger.info(
                'Cell {} specific network is completed'.format(cell))

        self.logger.info(
            'Finished constructing all {} cell specific networks'.format(
                len(cells)))
        self.upper = upper
        self.lower = lower
        self.csn = csn
Пример #6
0
    def csnet(self,
              cells=None,
              alpha=0.01,
              boxsize=0.1,
              edgeW=0,
              kk=0,
              dlimit=5,
              to_csv=0,
              *args,
              **kwargs):
        """
        Construct the cell-specific network (CSN) for the specified cells.

        For every row of ``self.data`` a per-cell neighborhood
        ``[lower, upper]`` is derived from the sorted values of that row.
        For each requested cell a 0/1 "in the box" matrix is built, a
        gene-by-gene statistic is computed from it (diagonal set to 0) and
        thresholded at the upper ``alpha`` quantile of the standard normal
        distribution.

            Parameters:
                `cells`   Forwarded to ``self.get_cells``. Documented as: None
                          (Default) builds the CSNs for all cells, otherwise
                          pass a list of cells.
                `alpha`   Significance level (e.g. 0.001, 0.01, 0.05 ...),
                          larger alpha leads to more edges, Default = 0.01
                `boxsize` Size of neighborhood, Default = 0.1
                `edgeW`   Accepted for compatibility; not used by this method.
                `kk`      Number of conditional genes. When kk=0 the method
                          is plain CSN.
                `dlimit`  Minimum degree limitation of conditional genes
                          (forwarded to ``condition_g``).
                `to_csv`  When truthy, each network is written to
                          ``<outdir>/cellnws/<cell>.nw.csv``.

            Returns:
                None. ``self.csn`` is set to a dict mapping each cell to its
                gene x gene DataFrame. Cells whose network has no link are
                logged and skipped.
            Notes:
                Too many cells or genes may lead to out of memory.
                NumPy indices start from 0.
                Every new cell currently requires recomputing the lower/upper
                bound matrices together with all existing cells (each gene is
                sorted again). With a large enough reference set the stored
                bounds could be reused for a new cell instead.
        """
        self.logger.info("start construction cell-specific network ")
        data = self.data
        nr, nc = data.shape
        values = data.values

        # Define the neighborhood of each point, positionally on plain arrays.
        upper_arr = np.zeros((nr, nc))
        lower_arr = np.zeros((nr, nc))
        for i in range(nr):
            order = np.argsort(values[i], kind='stable')
            s1 = values[i][order]
            n1 = np.sign(s1).sum()
            n0 = nc - n1  # the number of 0
            # Radius of the box. It is a product, matching the sibling
            # implementation; the former `boxsize / sqrt(n1)` rounded to 0
            # for the documented default, collapsing every box to a point.
            h = int(round(boxsize * np.sqrt(n1)))
            pos = 0
            while pos < nc:
                # Equal expression values are assigned the same bounds.
                s = 0
                while pos + s + 1 < nc and s1[pos + s + 1] == s1[pos]:
                    s += 1
                run = order[pos:pos + s + 1]
                if s >= h:
                    upper_arr[i, run] = s1[pos]
                    lower_arr[i, run] = s1[pos]
                else:
                    upper_arr[i, run] = s1[int(min(nc - 1, pos + s + h))]
                    lower_arr[i, run] = s1[int(
                        max(n0 * (n0 > h), pos - h))]
                pos += s + 1
        upper = pd.DataFrame(upper_arr,
                             columns=data.columns,
                             index=data.index)
        lower = pd.DataFrame(lower_arr,
                             columns=data.columns,
                             index=data.index)
        self.logger.info(
            "finish caculate the neighborhood of each gene for each cell")

        # Construction of cell-specific networks for each cell.
        cells = self.get_cells(cells=cells)
        csn = dict()

        # p: statistical threshold, the upper-alpha standard normal quantile.
        p = -stats.norm.ppf(q=alpha, loc=0, scale=1)
        eps = np.spacing(1)
        # Divisor for the final network. Kept separate from `kk` so the
        # parameter is never mutated: overwriting kk with 1 inside the loop
        # used to push every later cell into the conditional branch.
        n_cond = kk if kk != 0 else 1

        # Design notes carried over from the original author:
        # - The expression values determine the upper/lower bounds; they could
        #   be stored (e.g. in a dict) and reused for new cells by picking the
        #   box of the closest abundance, then computing the new cell's B.
        # - Cells could be clustered by abundance; samples with similar
        #   abundance patterns are assumed to have similar degree matrices.
        # - Fast degree-vector computation for new cells is left for later.
        # - np.matrix operator overloading is inconsistent ('*' is a matrix
        #   product while '/' is element-wise), so plain arrays with explicit
        #   element-wise products are used below.
        # - When only the k smallest elements are needed, np.partition avoids
        #   a full sort.
        for cell in cells:  # to update for multi process run
            up_k = upper.loc[:, cell].values[:, None]
            lo_k = lower.loc[:, cell].values[:, None]
            expressed = (data.loc[:, cell].values > 0)[:, None]
            # B: 1 where the gene is in the box and non-zero in this cell.
            # Float 0/1 matrix: avoids integer overflow in the products below.
            B = ((values <= up_k) & (values >= lo_k) & expressed).astype(float)

            a = B.sum(axis=1)
            aa = np.outer(a, a)
            # CSN adjacency statistic for this cell's gene-gene network. The
            # denominator is an element-wise product of the two outer
            # products.
            stat = (B @ B.T * nc - aa) / np.sqrt(
                aa * np.outer(nc - a, nc - a) / (nc - 1) + eps)
            np.fill_diagonal(stat, 0)
            # 1: link exists, 0: no link
            csnlink = pd.DataFrame((stat > p) * 1,
                                   index=data.index,
                                   columns=data.index)

            if csnlink.sum().sum() == 0:
                # all genes have no link with each other
                self.logger.info("no genes in Cell {} has a link".format(cell))
                continue

            if kk != 0:
                # Per the original notes, condition_g picks the top kk
                # conditional genes by degree; its code is not visible here.
                cond_ids = condition_g(csnlink, kk=kk, dlimit=dlimit)
                acc = np.zeros((nr, nr))
                for m in range(kk):
                    z_row = B[cond_ids[m], :]
                    # indexes of cells in which gene z is in the box
                    idc = np.flatnonzero(z_row != 0)
                    B_z = (B * z_row)[:, idc]
                    r = B_z.shape[1]  # r: cell numbers
                    a_z = B_z.sum(axis=1)
                    aa_z = np.outer(a_z, a_z)
                    stat_z = (B_z @ B_z.T * r - aa_z) / np.sqrt(
                        aa_z * np.outer(r - a_z, r - a_z) / (r - 1) + eps)
                    np.fill_diagonal(stat_z, 0)
                    acc = acc + (stat_z > p) * 1
                # Label the result like the unconditional network; the old
                # RangeIndex accumulator misaligned with gene-labelled frames.
                csnlink = pd.DataFrame(acc,
                                       index=data.index,
                                       columns=data.index)

            csnlink = csnlink / n_cond
            csn[cell] = csnlink

            if to_csv:
                filename = os.path.join(self.outdir, "cellnws",
                                        "{}.nw.csv".format(cell))
                uf.create_dir(self.outdir + "/cellnws")
                csn[cell].to_csv(path_or_buf=filename)
            self.logger.info(
                'Cell {} specific network is completed'.format(cell))
        self.logger.info('Finished all {} cell specific networks'.format(
            len(cells)))
        self.csn = csn