def __init__(self, datafile, outdir="./"):
    """
    Prepare the output directory and load the input table.

    Parameters:
        `datafile` passed unchanged to ``get_data`` and kept as
                   ``self.tablename``
        `outdir`   output directory, created through ``uf.create_dir``

    Sets ``self.cells`` and ``self.genes`` from the second and third values
    returned by ``get_data``, and ``self.data`` to the loaded table with the
    rows and the columns whose sum is zero removed.
    """
    self.outdir = outdir
    self.tablename = datafile
    uf.create_dir(self.outdir)

    print("Reading data from {}".format(datafile))
    table, self.cells, self.genes = get_data(datafile)

    # Rows are filtered first; the column mask is still computed on the
    # unfiltered table.
    nonzero_rows = table[table.sum(axis=1) != 0]
    self.data = nonzero_rows[:, table.sum(axis=0) != 0]
    print("Finish reading data from {}".format(datafile))
def __init__(self, datafile, outdir="./", log=None):
    """
    Prepare the output directory, the log file and the filtered table.

    Parameters:
        `datafile` passed unchanged to ``get_data`` and kept as
                   ``self.tablename``
        `outdir`   output directory, created through ``uf.create_dir``
        `log`      log file name inside `outdir`; when empty or None the name
                   is "<basename of datafile>_<uf.now()>.log"

    Sets ``self.log`` (log file path), ``self.logger`` and ``self.data``,
    the loaded table without the rows and columns whose sum is zero.
    """
    self.outdir = outdir
    self.tablename = datafile
    uf.create_dir(self.outdir)

    if log:
        log_name = log
    else:
        log_name = "{}_{}.log".format(os.path.basename(datafile), uf.now())
    self.log = os.path.join(self.outdir, log_name)
    self.logger = uf.create_logger(self.log)

    # The messages must format `datafile`: this signature has no `data`
    # name, so referencing it raised NameError.
    self.logger.info("start reading data from {}, log file is {}".format(
        datafile, self.log))
    df = get_data(datafile)
    self.data = df.loc[df.sum(axis=1) != 0, df.sum(axis=0) != 0]
    self.logger.info("finish reading data from {}".format(datafile))
def __init__(self, data, outdir='./', log=None):
    """
    Prepare the output directory, the log file and the filtered table.

    Parameters:
        `data`   passed unchanged to ``get_data`` and kept as
                 ``self.tablename``
        `outdir` output directory, created through ``uf.create_dir``
        `log`    log file name inside `outdir`; when empty or None the name
                 is "<basename of data>_<uf.now()>.log"

    Sets ``self.log``, ``self.logger``, ``self.data`` (the loaded table
    without the rows and columns whose sum is zero) and ``self.csn`` (None
    at this point).
    """
    self.outdir = outdir
    self.tablename = data
    uf.create_dir(self.outdir)

    if log:
        log_name = log
    else:
        log_name = '{}_{}.log'.format(os.path.basename(data), uf.now())
    self.log = os.path.join(self.outdir, log_name)
    self.logger = uf.create_logger(self.log)

    start_message = 'start reading data from {}, log file is {}'.format(
        data, self.log)
    self.logger.info(start_message)

    table = get_data(data)
    keep_rows = table.sum(axis=1) != 0
    keep_cols = table.sum(axis=0) != 0
    self.data = table.loc[keep_rows, keep_cols]
    self.csn = None

    self.logger.info('finish reading data from {}'.format(data))
def __init__(self, data, outdir="./", log=None):
    """
    Prepare the output directory, the log file and the filtered table.

    Parameters:
        `data`   passed unchanged to ``get_data`` and kept as
                 ``self.tablename``
        `outdir` output directory, created through ``uf.create_dir``
        `log`    log file name inside `outdir`; when empty or None the name
                 is "<basename of data>_<uf.now()>.log"

    Sets ``self.log``, ``self.logger``, ``self.data`` (the loaded table
    without the rows and columns whose sum is zero) and ``self.csn`` (None
    at this point).
    """
    self.outdir = outdir
    self.tablename = data
    uf.create_dir(self.outdir)

    if log:
        log_name = log
    else:
        log_name = "{}_{}.log".format(os.path.basename(data), uf.now())
    self.log = os.path.join(self.outdir, log_name)
    self.logger = uf.create_logger(self.log)

    start_message = "start reading data from {}, log file is {}".format(
        data, self.log)
    self.logger.info(start_message)

    # The table is expected to be a gene expression matrix
    # (TPM/FPKM/RPKM/count) with rows = genes and columns = cells, or an
    # OTU table.
    table = get_data(data)
    keep_rows = table.sum(axis=1) != 0
    keep_cols = table.sum(axis=0) != 0
    self.data = table.loc[keep_rows, keep_cols]
    self.csn = None

    self.logger.info("finish reading data from {}".format(data))
def csnet(self, cells=None, alpha=0.01, boxsize=0.1, edgeW=0, kk=0, dlimit=5,
          to_csv=0, average=1, *args, **kwargs):
    """
    Construct the cell-specific network (CSN) of the requested cells.

    Parameters:
        `cells`   cells to process, forwarded to ``self.get_cells``;
                  None (Default) asks it for all cells
        `alpha`   significance level (eg. 0.001, 0.01, 0.05 ...), larger
                  alpha leads to more edges, Default = 0.01
        `boxsize` size of the neighborhood; the box radius of a gene is
                  round(boxsize * sqrt(number of cells expressing it)),
                  Default = 0.1
        `edgeW`   accepted for compatibility, currently unused
        `kk`      number of conditional genes; kk = 0 (Default) is the
                  plain CSN method
        `dlimit`  minimum degree of a conditional gene, forwarded to
                  ``condition_g``
        `to_csv`  when true, write every network to
                  <outdir>/cellnws/<cell>.nw.csv
        `average` when true, a cell must express a gene for the gene to be
                  in its box, and conditional networks are averaged (their
                  sum divided by kk); otherwise the intersection (product)
                  of the conditional networks is kept

    Returns:
        None. Results are stored on the instance:
        ``self.csn``   dict, cell -> genes x genes DataFrame
        ``self.upper`` / ``self.lower`` genes x cells box boundaries

    Raises:
        KeyError when a requested cell is not a column of ``self.data``.

    Notes:
        Too many cells or genes may lead to out of memory.
        NumPy indices start from 0.
        Every new cell needs the lower/upper boundary matrices recomputed
        together with all existing cells, sorting every gene. With a large
        enough reference set the existing boundaries could be reused and
        only the new cell's expression plugged in.
    """
    self.logger.info('start construction cell-specific network ')
    data = self.data
    nr, nc = data.shape
    values = data.to_numpy(dtype=float)

    # Neighborhood (box) of every gene in every cell. Working on the sorted
    # row avoids one label lookup per cell.
    upper_arr = np.zeros((nr, nc))
    lower_arr = np.zeros((nr, nc))
    for i in range(nr):
        order = np.argsort(values[i], kind='stable')
        s1 = values[i, order]
        n1 = np.sign(s1).sum()
        n0 = nc - n1
        h = round(boxsize * np.sqrt(n1))
        k = 0
        while k < nc:
            # s + 1 cells share the expression value s1[k]
            s = 0
            while k + s + 1 < nc and s1[k + s + 1] == s1[k]:
                s += 1
            tied = order[k:k + s + 1]
            if s >= h:
                upper_arr[i, tied] = s1[k]
                lower_arr[i, tied] = s1[k]
            else:
                upper_arr[i, tied] = s1[int(min(nc - 1, k + s + h))]
                lower_arr[i, tied] = s1[int(max(n0 * (n0 > h), k - h))]
            k += s + 1
    upper = pd.DataFrame(upper_arr, columns=data.columns, index=data.index)
    lower = pd.DataFrame(lower_arr, columns=data.columns, index=data.index)
    # If the gene expression matrix is sparse, sparse matrices would
    # accelerate the calculation and reduce the memory footprint.
    self.logger.info(
        'finish caculate the neighborhood of each gene for each cell')

    cells = self.get_cells(cells=cells)
    csn = dict()
    p = -stats.norm.ppf(q=alpha, loc=0, scale=1)
    # `kk` itself must stay untouched: overwriting it with 1 for the first
    # cell would push every later cell into the conditional branch.
    divisor = kk if kk != 0 else 1

    for k in cells:
        upper_k = upper.loc[:, k].to_numpy()[:, None]
        lower_k = lower.loc[:, k].to_numpy()[:, None]
        in_box = (values <= upper_k) & (values >= lower_k)
        if average:
            in_box &= (data.loc[:, k].to_numpy(dtype=float) > 0)[:, None]
        b = in_box.astype(float)  # 1: the gene of that cell is in the box

        a = b.sum(axis=1)
        csnk = (b @ b.T * nc - np.outer(a, a)) / np.sqrt(
            np.outer(a, a) * np.outer(nc - a, nc - a) / (nc - 1)
            + np.spacing(1))
        csnlink = pd.DataFrame((csnk > p) * 1,
                               index=data.index, columns=data.index)
        if not csnlink.to_numpy().any():
            self.logger.info('no genes in Cell {} has a link'.format(k))
            continue

        if kk != 0:
            cond_genes = condition_g(csnlink, kk=kk, dlimit=dlimit)
            # The accumulator carries the gene labels so that every
            # addition / product stays aligned with the per-gene result.
            start = np.zeros((nr, nr)) if average else np.ones((nr, nr))
            csnlink = pd.DataFrame(start,
                                   index=data.index, columns=data.index)
            for m in range(kk):
                z_row = b[cond_genes[m], :]
                # only the cells whose conditional gene is in the box
                with_z = np.flatnonzero(z_row != 0)
                b_z = (b * z_row)[:, with_z]
                r = b_z.shape[1]
                a_z = b_z.sum(axis=1)
                csnk1 = (b_z @ b_z.T * r - np.outer(a_z, a_z)) / np.sqrt(
                    np.outer(a_z, a_z) * np.outer(r - a_z, r - a_z)
                    / (r - 1) + np.spacing(1))
                csnlink1 = (csnk1 > p) * 1
                if average:
                    csnlink = csnlink + csnlink1
                else:
                    csnlink = csnlink * csnlink1

        csn[k] = csnlink / divisor if average else csnlink
        if to_csv:
            filename = os.path.join(self.outdir, 'cellnws',
                                    '{}.nw.csv'.format(k))
            uf.create_dir(self.outdir + '/cellnws')
            csn[k].to_csv(path_or_buf=filename)
        self.logger.info('Cell {} specific network is completed'.format(k))

    self.logger.info(
        'Finished constructing all {} cell specific networks'.format(
            len(cells)))
    self.upper = upper
    self.lower = lower
    self.csn = csn
def csnet(self, cells=None, alpha=0.01, boxsize=0.1, edgeW=0, kk=0, dlimit=5,
          to_csv=0, *args, **kwargs):
    """
    Construct the cell-specific network (CSN) of the requested cells.

    Parameters:
        `cells`   cells to process, forwarded to ``self.get_cells``;
                  None (Default) asks it for all cells
        `alpha`   significance level (eg. 0.001, 0.01, 0.05 ...), larger
                  alpha leads to more edges, Default = 0.01
        `boxsize` size of the neighborhood; the box radius of a gene is
                  round(boxsize * sqrt(number of cells expressing it)),
                  Default = 0.1
        `edgeW`   accepted for compatibility, currently unused
        `kk`      number of conditional genes; kk = 0 (Default) is the
                  plain CSN method
        `dlimit`  minimum degree of a conditional gene, forwarded to
                  ``condition_g``
        `to_csv`  when true, write every network to
                  <outdir>/cellnws/<cell>.nw.csv

    Returns:
        None. ``self.csn`` is set to a dict, cell -> genes x genes
        DataFrame (sum of the conditional networks divided by kk, or the
        0/1 network itself when kk = 0).

    Raises:
        KeyError when a requested cell is not a column of ``self.data``.

    Notes:
        Too many cells or genes may lead to out of memory.
        NumPy indices start from 0.
        Every new cell needs the lower/upper boundary matrices recomputed
        together with all existing cells, sorting every gene. With a large
        enough reference set the existing boundaries could be reused and
        only the new cell's expression plugged in.
    """
    self.logger.info("start construction cell-specific network ")
    data = self.data
    nr, nc = data.shape
    values = data.to_numpy(dtype=float)

    # Define the neighborhood (box) of every gene in every cell.
    upper_arr = np.zeros((nr, nc))
    lower_arr = np.zeros((nr, nc))
    for i in range(nr):
        order = np.argsort(values[i], kind="stable")
        s1 = values[i, order]
        n1 = np.sign(s1).sum()
        n0 = nc - n1  # the number of 0
        # Radius of the box. It has to grow with the number of expressing
        # cells; dividing by the square root rounds to 0 for any realistic
        # input and collapses every box to a single value.
        h = round(boxsize * np.sqrt(n1))
        k = 0
        while k < nc:
            # cells with the same expression get the same boundaries
            s = 0
            while k + s + 1 < nc and s1[k + s + 1] == s1[k]:
                s += 1
            tied = order[k:k + s + 1]
            if s >= h:
                upper_arr[i, tied] = s1[k]
                lower_arr[i, tied] = s1[k]
            else:
                upper_arr[i, tied] = s1[int(min(nc - 1, k + s + h))]
                lower_arr[i, tied] = s1[int(max(n0 * (n0 > h), k - h))]
            k += s + 1
    upper = pd.DataFrame(upper_arr, columns=data.columns, index=data.index)
    lower = pd.DataFrame(lower_arr, columns=data.columns, index=data.index)
    self.logger.info(
        "finish caculate the neighborhood of each gene for each cell")

    # Construction of the cell-specific network of each cell; the graphs
    # can later be classified or clustered (e.g. with a GNN).
    # Ideas kept from the original notes:
    #  - the upper/lower boundaries could be stored and reused for new
    #    cells, taking the box with the closest gene abundance to build the
    #    B matrix of a new cell;
    #  - cells with similar abundance patterns are assumed to have similar
    #    degree matrices, so a stored model could classify a new cell from
    #    its degree vector; fast CSN computation for new cells is still to
    #    be studied;
    #  - np.partition selects the smallest elements without a full sort.
    # TODO: run the cells in several processes.
    cells = self.get_cells(cells=cells)
    csn = dict()
    # p: statistical threshold for significance level alpha
    p = -stats.norm.ppf(q=alpha, loc=0, scale=1)
    # `kk` itself must stay untouched: overwriting it with 1 for the first
    # cell would push every later cell into the conditional branch.
    divisor = kk if kk != 0 else 1

    for k in cells:
        upper_k = upper.loc[:, k].to_numpy()[:, None]
        lower_k = lower.loc[:, k].to_numpy()[:, None]
        expressed = (data.loc[:, k].to_numpy(dtype=float) > 0)[:, None]
        # b[i, j] = 1 when gene i of cell j lies in the box of cell k and
        # cell k expresses gene i
        b = ((values <= upper_k) & (values >= lower_k)
             & expressed).astype(float)

        a = b.sum(axis=1)
        # The statistic needs element-wise products in the denominator;
        # plain arrays are used because `*` on np.matrix is a matrix
        # product.
        csnk = (b @ b.T * nc - np.outer(a, a)) / np.sqrt(
            np.outer(a, a) * np.outer(nc - a, nc - a) / (nc - 1)
            + np.spacing(1))
        np.fill_diagonal(csnk, 0)
        # 1: link exists, 0: no link
        csnlink = pd.DataFrame((csnk > p) * 1,
                               index=data.index, columns=data.index)
        if not csnlink.to_numpy().any():
            self.logger.info("no genes in Cell {} has a link".format(k))
            continue

        if kk != 0:
            # Conditional genes are chosen by condition_g from the 0/1
            # network.
            cond_genes = condition_g(csnlink, kk=kk, dlimit=dlimit)
            # The accumulator carries the gene labels so that the sums
            # stay aligned with the per-gene result.
            csnlink = pd.DataFrame(np.zeros((nr, nr)),
                                   index=data.index, columns=data.index)
            for m in range(kk):
                z_row = b[cond_genes[m], :]
                # only the cells whose conditional gene z is in the box
                with_z = np.flatnonzero(z_row != 0)
                b_z = (b * z_row)[:, with_z]
                r = b_z.shape[1]  # number of those cells
                a_z = b_z.sum(axis=1)
                csnk1 = (b_z @ b_z.T * r - np.outer(a_z, a_z)) / np.sqrt(
                    np.outer(a_z, a_z) * np.outer(r - a_z, r - a_z)
                    / (r - 1) + np.spacing(1))
                np.fill_diagonal(csnk1, 0)
                csnlink = csnlink + (csnk1 > p) * 1

        csn[k] = csnlink / divisor
        if to_csv:
            filename = os.path.join(self.outdir, "cellnws",
                                    "{}.nw.csv".format(k))
            uf.create_dir(self.outdir + "/cellnws")
            csn[k].to_csv(path_or_buf=filename)
        self.logger.info('Cell {} specific network is completed'.format(k))

    self.logger.info('Finished all {} cell specific networks'.format(
        len(cells)))
    self.csn = csn