def processed_X(Xprocessed, conditions, GDM, OGs, Map, MapSpecies):
    """Build one printable string table per dataset from its processed data.

    For dataset ``l`` the table is laid out as::

        header row : 'Genes' | conditions[l]...                  (Map is None)
                     'OGs' | MapSpecies... | conditions[l]...     (otherwise)
        data rows  : identifier column(s) | Xprocessed[l] values

    The identifier rows are the entries of ``OGs`` selected by the boolean
    column ``GDM[:, l]``.  When ``Map`` is given, one extra column per species
    is added; each cell is ``ds.concatenateStrings`` applied to the matching
    ``Map`` entry (presumably joining several gene names into one string --
    confirm against ``ds``).

    Args:
        Xprocessed: sequence of L 2-D numeric datasets; dataset ``l`` must
            have as many rows as ``GDM[:, l]`` has True entries.
        conditions: sequence of L sequences of column labels (list, tuple or
            1-D array), one label per column of the matching dataset.
        GDM: 2-D boolean array indexed as ``[identifier, dataset]``.
        OGs: 1-D numpy array of identifiers, aligned with the rows of GDM.
        Map: ``None`` or a 2-D array indexed as ``[identifier, species]``.
        MapSpecies: species labels; only read when ``Map`` is not ``None``.

    Returns:
        1-D object array of length L whose elements are 2-D ``str`` arrays.
    """
    L = len(Xprocessed)
    res = np.array([None] * L, dtype=object)

    for l in range(L):
        # Normalise to a real list first: `['Genes'] + x` is only list
        # concatenation when x is a list.  A tuple raises TypeError and an
        # ndarray is handed to NumPy's `+`, which is not concatenation.
        labels = list(conditions[l])
        if Map is None:
            header = np.array([['Genes'] + labels])
        else:
            header = np.array([['OGs'] + np.asarray(MapSpecies).tolist() + labels])

        # Identifier column(s) for the genes/OGs present in this dataset
        present = GDM[:, l]
        ids = OGs[present].reshape(-1, 1)
        if Map is None:
            names = ids
        else:
            per_species = [[ds.concatenateStrings(gs) for gs in Map[present][:, sp]]
                           for sp in range(Map.shape[1])]
            names = np.concatenate((ids, np.transpose(per_species)), axis=1)

        # Identifiers to the left of the data, header row on top
        body = np.concatenate((names, np.array(Xprocessed[l])), axis=1)
        table = np.concatenate((header, body), axis=0)
        res[l] = np.array(table, dtype=str)

    return res
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True,
                                  delimGenesInMap='\\W+', OGsIncludedIfAtLeastInDatasets=1):
    """Compute the gene-dataset membership matrix and collapse rows per gene/OG.

    Steps:
      1. Obtain, for every dataset, the identifier of each of its rows:
         the gene names themselves when ``Map`` is ``None``, otherwise
         whatever ``mapGenesToCommonIDs`` returns for them.
      2. Build ``GDMall``, a boolean matrix of shape (identifiers x datasets)
         marking which identifier occurs in which dataset.
      3. Keep only identifiers found in at least
         ``OGsIncludedIfAtLeastInDatasets`` datasets (``GDM``, ``OGs`` and
         ``MapNew`` are filtered consistently).
      4. For every dataset, merge all rows that share an identifier into a
         single row by summation.  If more than 98% of the non-NaN values of
         the dataset have magnitude below 30, the dataset is treated as
         log2-scale: rows are summed as ``log2(sum(2 ** x))``.

    Args:
        X: the datasets; converted with ``ds.listofarrays2arrayofarrays`` and
            then used as 2-D arrays (rows x samples).
        Genes: per-dataset row identifiers, one sequence per dataset.
        Map: optional mapping forwarded to ``mapGenesToCommonIDs``.
        mapheader, OGsFirstColMap, delimGenesInMap: forwarded unchanged to
            ``mapGenesToCommonIDs``; unused when ``Map`` is ``None``.
        OGsIncludedIfAtLeastInDatasets: minimum number of datasets an
            identifier must appear in to be kept.

    Returns:
        Tuple ``(Xnew, GDM, GDMall, OGs, MapNew, MapSpecies)`` where ``Xnew``
        is a 1-D object array holding, per dataset, a float array with one row
        per retained identifier present in that dataset.  ``MapNew`` and
        ``MapSpecies`` are ``None`` when ``Map`` is ``None``.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Genes)  # Number of datasets

    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(
            Genes, Map, mapheader, OGsFirstColMap, delimGenesInMap)

    # GDMall: (number of identifiers) x (L) boolean membership matrix
    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Number of retained identifiers in each dataset

    Xnew = np.array([None] * L, dtype=object)
    for l in range(L):
        data = Xloc[l]
        valid = data[~isnan(data)]  # non-NaN values, computed once

        # Log-scale heuristic: more than 98% of values are below 30.0 in magnitude
        arelogs = np.nansum(abs(valid) < 30) > 0.98 * ds.numel(valid)

        d = data.shape[1]  # Number of dimensions (samples) in this dataset
        merged = np.zeros([Ngs[l], d], dtype=float)

        # TODO: Optimise the loop below by exploiting ds.findArrayInSubArraysOfAnotherArray1D
        for ogi, og in enumerate(OGs[GDM[:, l]]):
            rows = data[np.in1d(OGsDatasets[l], og)]  # all rows mapped to this identifier
            if arelogs:
                # Sum in linear space, then return to log2 space
                merged[ogi] = np.log2(np.sum(np.power(2.0, rows), axis=0))
            else:
                merged[ogi] = np.sum(rows, axis=0)

        Xnew[l] = merged

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
def clusters_genes_OGs(B, OGs, Map, MapSpecies, delim='; '):
    """Lay out cluster memberships as a string table, one column group per cluster.

    Every cluster ``k`` (column ``k`` of ``B``) occupies ``Nsp + 1`` adjacent
    columns, where ``Nsp`` is ``len(MapSpecies)`` or 0 when ``Map`` is
    ``None``:

      * first column: the entries of ``OGs`` selected by ``B[:, k]``;
      * one column per species: ``ds.concatenateStrings(cell, delim)`` for the
        matching cells of ``Map``.

    Two header rows come first: row 0 carries ``'C<k> (<size> genes|OGs)'``
    above the first column of each cluster and is empty above species
    columns; row 1 carries ``'Genes'``/``'OGs'`` and the species labels.
    Clusters smaller than the largest one are padded with empty strings.

    Args:
        B: 2-D boolean array indexed as ``[identifier, cluster]``.
        OGs: 1-D numpy array of identifiers aligned with the rows of ``B``.
        Map: ``None`` or a 2-D array indexed as ``[identifier, species]``.
        MapSpecies: species labels; only read when ``Map`` is not ``None``.
        delim: delimiter forwarded to ``ds.concatenateStrings``.

    Returns:
        2-D ``str`` array with ``2 + largest cluster size`` rows and
        ``(Nsp + 1) * K`` columns.  When there are no clusters, a 1x1 array
        holding an empty string is returned.
    """
    Nsp = 0 if Map is None else len(MapSpecies)  # Number of species
    K = B.shape[1]  # Number of clusters
    if K == 0:
        # Deterministic placeholder.  np.empty() must not be used here: its
        # content is uninitialised memory, which would leak into the output.
        return np.array([['']], dtype=str)

    width = Nsp + 1  # Columns per cluster
    Csizes = np.sum(B, axis=0)  # Clusters' sizes
    maxCsize = np.max(Csizes)  # Largest cluster size

    id_label = 'Genes' if Map is None else 'OGs'
    unit = 'genes' if Map is None else 'OGs'

    # Pre-filled with '' so short clusters and species header cells need no
    # explicit padding afterwards.
    res = np.full([maxCsize, width * K], '', dtype=object)
    header = np.full([2, width * K], '', dtype=object)

    for k in range(K):
        members = B[:, k]
        size = Csizes[k]
        col = k * width

        res[:size, col] = OGs[members]
        header[0, col] = 'C{0} ({1} {2})'.format(k, size, unit)
        header[1, col] = id_label

        # Not entered when Map is None, as Nsp is zero in that case
        for sp in range(Nsp):
            spcol = col + sp + 1
            res[:size, spcol] = [ds.concatenateStrings(gs, delim) for gs in Map[members, sp]]
            header[1, spcol] = MapSpecies[sp]

    return np.array(np.concatenate((header, res), axis=0), dtype=str)