#Standard-library imports used by the functions in this section; project-specific
#helpers (cgWalk, collectSampleMaps, collectMissingSampleMaps, checkIdsAllIn,
#SampleMapNew, ClinicalMatrixNew, IntegrationId, buildSampleDic, process, TCGAUtil,
#flattenEachSampleMap, ...) are assumed to come from the surrounding cgData scripts.
import os
import sys
import copy
import json
import string
import datetime


def convertCAVM(inDir, outD, REALRUN, CAVM, TCGA, MAPID=1):
    bookDic = {}
    sampleMaps = {}
    bookDic = cgWalk(inDir, 0)
    if not os.path.exists(outD):
        os.system("mkdir " + outD)
    if not bookDic:
        print "repo has problem"
        return 0
    sampleMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)
    allMaps = sampleMaps.keys()
    allMaps.extend(missingMaps.keys())
    for sampleMap in allMaps:
        print sampleMap
        outDir = outD + sampleMap + "/"
        if not os.path.exists(outDir):
            os.system("mkdir " + outDir)
        path = bookDic[sampleMap]['path']
        """
        if string.find(os.path.abspath(path), "/inside/home/jzhu/cgDataJing/scripts/data_flatten/") ==-1:
            print "ignore "+path
            continue
        """
        if sampleMap in missingMaps:
            #construct an empty sampleMap
            sMap = SampleMapNew(None, sampleMap)
            #fill sMap with individual nodes, no connection
            changed = checkIdsAllIn(sMap, bookDic)  #build connection
        else:
            name = bookDic[sampleMap]['name']
            fin = open(path, 'r')
            sMap = SampleMapNew(fin, name)
            if not sMap.getName():
                print "Fail to initiate", name
                return 0
            fin.close()

        #cohort sampleMap json
        sMapJ = {}
        fin = open(bookDic[sampleMap]['path'] + ".json", 'r')
        sMapJ = json.loads(fin.read())
        fin.close()

        #integration list
        integrationList = []
        rootDic = {}
        clinFile = ""
        clinMatrix = None

        #cohort
        COHORT = ""
        cohortPath = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1], "/") + "/cohort.json"
        if os.path.exists(cohortPath):
            fin = open(cohortPath, 'r')
            cohortJ = json.loads(fin.read())
            COHORT = cohortJ["name"]

        for name in sampleMaps[sampleMap]:
            obj = bookDic[name]
            if obj['type'] == "clinicalMatrix":
                clinFile = outDir + os.path.basename(obj['path'])
                #JSON
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']
                J["label"] = "Phenotypes"
                if CAVM:
                    J.pop(':sampleMap')
                if J.has_key("dataSubType"):
                    if J.has_key(":dataSubType"):
                        J.pop(':dataSubType')
                else:
                    if J.has_key(":dataSubType"):
                        J["dataSubType"] = J[":dataSubType"]
                        J.pop(':dataSubType')
                fout = open(clinFile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if REALRUN != 0 and REALRUN != 1:
                    continue
                if clinMatrix != None:
                    print "only one clinical matrix is allowed"
                    sys.exit()

                fin = open(obj['path'], 'U')
                fout = open(clinFile, 'w')
                line = fin.readline()
                fout.write(line)
                samples = []
                for line in fin.readlines():
                    if MAPID:
                        sample = string.split(line[:-1], "\t")[-1]
                    else:
                        sample = string.split(line[:-1], "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                        fout.write(sample + "\t")
                        if MAPID:
                            fout.write(string.join(string.split(line[:-1], "\t")[1:], "\t"))
                        else:
                            fout.write(string.join(string.split(line[:-1], "\t")[1:-1], "\t") + "\t" + sample)
                        fout.write("\n")
                fout.close()
                integrationList = copy.deepcopy(samples)

                #clinicalFeature
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    outfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + outfile)
                    os.system("cp " + cFobj['path'] + ".json " + outfile + ".json")

                #sampleMap data mapping information
                #cgData 1
                if not CAVM:
                    os.system("cp " + bookDic[sampleMap]['path'] + " " + outDir + "sampleMap")

                #only expect one clinical matrix
                clinMatrix = ClinicalMatrixNew(clinFile, "clinMatrix")
                break

        for name in sampleMaps[sampleMap]:
            obj = bookDic[name]
            if obj['type'] in ["genomicSegment", "mutationVector"]:
                path = obj['path']
                print path
                outfile = outDir + os.path.basename(obj['path'])
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']
                if CAVM:
                    J.pop(':sampleMap')
                if J.has_key("dataSubType"):
                    if J.has_key(":dataSubType"):
                        J.pop(':dataSubType')
                else:
                    if J.has_key(":dataSubType"):
                        J["dataSubType"] = J[":dataSubType"]
                        J.pop(':dataSubType')
                fout = open(outfile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if REALRUN == 1:
                    fin = open(path, 'r')
                    fout = open(outDir + os.path.basename(path), 'w')
                    for line in fin.readlines():
                        data = string.split(line, '\t')
                        sample = data[0]
                        if rootDic.has_key(sample):
                            root = rootDic[sample]
                        else:
                            root = sMap.getIntegrationId(sample, integrationList)
                            if not root:
                                root = sample
                            rootDic[sample] = root
                        fout.write(root + "\t" + string.join(data[1:], '\t'))
                    fin.close()
                    fout.close()

            if obj['type'] == "genomicMatrix":
                print obj['name']
                #JSON
                outfile = outDir + os.path.basename(obj['path'])
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']
                if CAVM:
                    J.pop(':sampleMap')
                if J.has_key("dataSubType"):
                    if J.has_key(":dataSubType"):
                        J.pop(':dataSubType')
                else:
                    if J.has_key(":dataSubType"):
                        J["dataSubType"] = J[":dataSubType"]
                        J.pop(':dataSubType')
                fout = open(outfile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if J.has_key('anatomical_origin'):
                    sMapJ['anatomical_origin'] = J['anatomical_origin']
                if J.has_key('primary_disease'):
                    sMapJ['primary_disease'] = J['primary_disease']
                if J.has_key('domain'):
                    sMapJ['domain'] = J['domain']
                if J.has_key('sample_type'):
                    sMapJ['sample_type'] = J['sample_type']
                if J.has_key('tags'):
                    sMapJ['tags'] = J['tags']

                if REALRUN != 1 and REALRUN != 0:
                    continue

                # add to clinMatrix the id mappings
                mappingCol = "_GENOMIC_ID_" + obj['name']
                clinMatrix.addOneColWithSameValue(mappingCol, "")

                # need to find out if more than one sample maps to each _INTEGRATION id
                roots = {}
                findDup = 0
                fin = open(obj['path'], 'U')
                samples = string.split(fin.readline()[:-1], "\t")[1:]
                for i in range(0, len(samples)):
                    sample = samples[i]
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    if rootDic.has_key(sample):
                        root = rootDic[sample]
                    else:
                        root = sMap.getIntegrationId(sample, integrationList)
                        if not root:
                            root = sample
                        rootDic[sample] = root
                    genomic_Id = clinMatrix.getDATA(root, mappingCol)
                    if genomic_Id is None or genomic_Id == "":
                        clinMatrix.setDATA(root, mappingCol, sample)
                    else:
                        genomic_Id = string.split(genomic_Id, ",")
                        if sample not in genomic_Id:
                            genomic_Id.append(sample)
                        genomic_Id = string.join(genomic_Id, ',')
                        #print sample, genomic_Id
                        clinMatrix.setDATA(root, mappingCol, genomic_Id)
                    if roots.has_key(root):
                        roots[root].append(i)
                        findDup = 1
                    else:
                        roots[root] = [i]
                fin.close()

                if REALRUN != 1:
                    continue

                #probemap for genomic segment
                #if J.has_key(':genomicSegment'):
                if J.has_key(':probeMap'):
                    if bookDic.has_key(J[':probeMap']):
                        probeMap = bookDic[J[':probeMap']]['path']
                        os.system("cp " + probeMap + " " + outDir + os.path.basename(probeMap))
                        os.system("cp " + probeMap + ".json " + outDir + os.path.basename(probeMap) + ".json")

                #need to figure out if there are duplications in the probe ids
                findDupProbe = []
                process = os.popen("r=$(cut -f 1 " + obj['path'] + " | more +2 | sort |uniq -c | sed -e 's/ *//' -e 's/ /\t/' | sort -n |cut -f 1 |sort -un|tail -n 1); if [ $r -ne \"1\" ]; then echo $r ; fi")
                r = process.read()
                if r:
                    print string.strip(r), obj['path']
                    process = os.popen("cut -f 1 " + obj['path'] + " | more +2 | sort |uniq -c | sed -e 's/ *//' -e 's/ /\t/' | sort -n | grep -vP ^1'\t' | cut -f 2 |sort")
                    r = process.read()
                    list = string.split(r, "\n")
                    print len(list)
                    for probe in list:
                        findDupProbe.append(probe)

                fout = open(outfile, 'w')
                fin = open(obj['path'], 'U')
                #genomic data, no dup
                if findDup == 0 and findDupProbe == []:
                    data = string.split(fin.readline()[:-1], "\t")
                    samples = data[1:]
                    fout.write(data[0])
                    for sample in samples:
                        if rootDic.has_key(sample):
                            root = rootDic[sample]
                        else:
                            root = sMap.getIntegrationId(sample, integrationList)
                            if not root:
                                root = sample
                            rootDic[sample] = root
                        fout.write('\t' + root)
                    fout.write('\n')
                    if TCGA:
                        fin.close()
                        fout.close()
                        os.system("cat " + obj['path'] + " |sed 1d >> " + outfile)
                        #os.system("more +2 "+obj['path']+" >> "+outfile)
                    else:
                        while 1:
                            line = fin.readline()
                            if line == "":
                                break
                            line = string.replace(line, "\tnan\t", "\tNA\t")
                            line = string.replace(line, "\tNAN\t", "\tNA\t")
                            line = string.replace(line, "\tNaN\t", "\tNA\t")
                            fout.write(line)
                        fin.close()
                        fout.close()
                #genomic data with dup
                else:
                    print "genomic with dup", obj['path']
                    data = string.split(fin.readline()[:-1], "\t")
                    fout.write(data[0])
                    for root in roots:
                        fout.write('\t' + root)
                    fout.write('\n')
                    dupDic = {}
                    while 1:
                        duplist = []
                        line = fin.readline()[:-1]
                        if line == "":
                            break
                        data = string.split(line, "\t")
                        if data[0] not in findDupProbe:
                            fout.write(data[0])
                        else:
                            if data[0] not in dupDic:
                                dupDic[data[0]] = []
                        values = data[1:]
                        for root in roots:
                            if len(roots[root]) != 1:
                                total = "NA"
                                n = 0
                                for i in roots[root]:
                                    if values[i] in ["nan", "NAN", "NaN"]:
                                        pass
                                    else:
                                        try:
                                            float(values[i])
                                            if total == "NA":
                                                total = float(values[i])
                                            else:
                                                total = total + float(values[i])
                                            n = n + 1
                                        except:
                                            pass
                                if total != "NA":
                                    average = str(total / n)
                                else:
                                    average = "NA"
                            else:
                                if values[roots[root][0]] in ["nan", "NAN", "NaN"]:
                                    average = "NA"
                                else:
                                    try:
                                        float(values[roots[root][0]])
                                        average = values[roots[root][0]]
                                    except:
                                        average = "NA"
                            if data[0] not in findDupProbe:
                                fout.write('\t' + average)
                            else:
                                duplist.append(average)
                        if data[0] not in findDupProbe:
                            fout.write('\n')
                        else:
                            dupDic[data[0]].append(duplist[:])
                    if dupDic != {}:
                        for probe in dupDic:
                            fout.write(probe)
                            k = len(dupDic[probe][0])
                            valList = []
                            nList = []
                            for i in range(0, k):
                                valList.append("NA")
                                nList.append(0)
                            for list in dupDic[probe]:
                                for i in range(0, k):
                                    try:
                                        float(list[i])
                                        if valList[i] == "NA":
                                            valList[i] = float(list[i])
                                        else:
                                            valList[i] = valList[i] + float(list[i])
                                        nList[i] = nList[i] + 1
                                    except:
                                        pass
                            for i in range(0, k):
                                try:
                                    float(valList[i])
                                    fout.write("\t" + str(float(valList[i]) / nList[i]))
                                except:
                                    fout.write("\tNA")
                            fout.write("\n")
                    fin.close()
                    fout.close()

        #final clinical matrix output
        if REALRUN == 0 or REALRUN == 1:
            fout = open(clinFile, 'w')
            clinMatrix.store(fout)

        #sampleMap json
        #cgData1
        if not CAVM:
            outfile = outDir + "sampleMap.json"
            fout = open(outfile, 'w')
            fout.write(json.dumps(sMapJ, indent=-1))
            fout.close()

        #cohort json cp or create
        cohortPath = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1], "/") + "/cohort.json"
        if os.path.exists(cohortPath):
            os.system("cp " + cohortPath + " " + outDir)
        else:
            outfile = outDir + "cohort.json"
            fout = open(outfile, 'w')
            cohortJ = {}
            cohortJ["type"] = "cohort"
            cohortJ["name"] = sampleMap
            fout.write(json.dumps(cohortJ, indent=-1))
            fout.close()
def runFlatten(inDir, outDir, REALRUN, onlyGenomicSamples, SMAPNAME=None):
    dir = inDir
    bookDic = {}
    sampleMaps = {}
    ignore = 0
    bookDic = cgWalk(dir, ignore)
    if not bookDic:
        print "repo has problem"
        return 0
    sampleMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)
    allMaps = sampleMaps.keys()
    allMaps.extend(missingMaps.keys())
    for sampleMap in allMaps:
        if SMAPNAME and SMAPNAME != sampleMap:
            print "skip", sampleMap
            continue
        print sampleMap
        path = bookDic[sampleMap]['path']
        if os.path.abspath(path) in [
                "/inside/home/jzhu/cgDataJing/scripts/data/public/TCGA/PANCAN/TCGA.PANCAN.sampleMap",
                "/inside/home/jzhu/cgDataJing/scripts/data/public/TCGA/PANCAN12/TCGA.PANCAN12.sampleMap"]:
            print "ignore " + path
            continue
        if sampleMap in missingMaps:
            #construct an empty sampleMap
            sMap = SampleMapNew(None, sampleMap)
            #fill sMap with individual nodes, no connection
            changed = checkIdsAllIn(sMap, bookDic)  #build connection
        else:
            name = bookDic[sampleMap]['name']
            fin = open(path, 'r')
            sMap = SampleMapNew(fin, name)
            if not sMap.getName():
                print "Fail to initiate", name
                return 0
            fin.close()
            changed = checkIdsAllIn(sMap, bookDic)
        if REALRUN in [0, 1]:
            r = flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples)
            if r == False:
                return 0
            finalClinicalMatrix, finalClinicalMatrixJSON, finalClinFeature, finalClinFeatureJSON = r
            if finalClinicalMatrix.getROWnum() != 0:
                outputEachSampleMapRelated(outDir, bookDic, sMap,
                                           finalClinicalMatrix, finalClinicalMatrixJSON,
                                           finalClinFeature, finalClinFeatureJSON, REALRUN)
        if REALRUN == -2:
            finalClinFeature = flattenForClinicalFeature(sMap, bookDic)
            outputForClinFeature(outDir, sMap, finalClinFeature)
        cpGenomicEachSample(REALRUN, outDir, bookDic, sMap)
        cpProbeMaps(REALRUN, outDir, bookDic, sMap)
        #cpCohort if exists
        path = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1], "/") + "/cohort.json"
        if os.path.exists(path):
            dataPackageDir = outDir + sampleMapBaseName(sMap)
            os.system("cp " + path + " " + dataPackageDir + "/")
    return 1
def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    #print status
    print cancer, __name__
    #if cancer in ["PANCAN","PANCAN12"]:
    #    return
    ignore = 1
    bookDic = cgWalk(dir, ignore)
    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)
    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquot uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()

    #missingMaps --- actually this is all the maps
    for map in missingMaps:
        print map
        print missingMaps[map]
        sMap = SampleMapNew(None, map)

        #integration id
        intName = map + ".integrationID"
        if intName in bookDic:
            fin = open(bookDic[intName]["path"], 'r')
            integrationID = IntegrationId(intName, fin)
            fin.close()
        else:
            integrationID = IntegrationId(intName)

        samples = []
        for name in missingMaps[map]:
            if REALRUN != 1:
                continue
            print name
            obj = bookDic[name]
            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            #elif obj['type']=="clinicalMatrix":
            #    cMa = ClinicalMatrixNew(obj['path'],name)
            #    for sample in cMa.getROWs():
            #        if sample not in samples:
            #            samples.append(sample)
            elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                path = obj['path']
                os.system("cut -f 1 " + path + " |sort |uniq > .tmp")
                fin = open('.tmp', 'r')
                fin.readline()
                for line in fin.readlines():
                    #if string.strip(line)=="":
                    #    break
                    sample = string.strip(line)  #string.split(line,'\t')[0]
                    if sample == "":
                        break
                    if sample not in samples:
                        samples.append(sample)
            else:
                continue

        for sample in samples:
            if REALRUN != 1:
                continue
            #TCGA uuid handling
            if sample[0:4] != "TCGA":
                if aliquote_dic.has_key(string.lower(sample)):
                    TCGAbarcode = aliquote_dic[string.lower(sample)]
                else:
                    print sample
                    continue
                parent = TCGAbarcode
                child = sample
                sMap.addLink(parent, string.lower(child))
                sMap.addLink(parent, string.upper(child))
                sample = parent
            #do TCGA barcode trick
            parts = string.split(sample, "-")
            if len(parts) > 3 and len(parts[3]) == 3:
                parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]
            #print parts
            """
            parent = string.join(parts[0:3],"-") #parts[3]
            if len(parts)>3 and len(parts[3])==3:
                child=parent +"-" +parts[3][0:2]
                sMap.addLink(parent,child)
                parent=child
                child=string.join(parts[0:4],"-")
                sMap.addLink(parent,child)
                parent=child
            """
            parent = string.join(parts[0:3], "-")
            for i in range(3, len(parts)):
                if i != 4:
                    child = parent + "-" + parts[i]
                else:
                    child = parent + parts[i]
                #add parent child
                sMap.addLink(parent, child)
                parent = child
            intID = TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)

        #output sampleMap
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        if not os.path.exists(outDir + cancer + "/"):
            os.makedirs(outDir + cancer + "/")
        if REALRUN == 1:
            oHandle = open(outDir + cancer + "/" + map, "w")
            sMap.store(oHandle)

        #output integrationID
        if REALRUN == 1:
            oHandle = open(outDir + cancer + "/integrationID", "w")
            integrationID.store(oHandle)
            oHandle.close()

        #output integrationID json
        oHandle = open(outDir + cancer + "/integrationID.json", "w")
        J = {}
        J['name'] = intName
        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"
        #J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"
        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = datetime.date.today().isoformat()
        oHandle.write(json.dumps(J, indent=-1))
        oHandle.close()

        #output json
        oHandle = open(outDir + cancer + "/" + map + ".json", "w")
        J['name'] = map
        J['type'] = "sampleMap"
        J["version"] = datetime.date.today().isoformat()
        J["cgDataVersion"] = 1
        J[":integrationId"] = intName
        #add info for old clinical data
        if os.path.exists(outDir + cancer + "/oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"
        #special code
        if TCGAUtil.featurePriority.has_key(cancer) and len(TCGAUtil.featurePriority[cancer]) >= 5:
            J["VIS"] = 5
        #blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = [
                "TCGA-FQ-6551", "TCGA-FQ-6552", "TCGA-FQ-6553", "TCGA-FQ-6554",
                "TCGA-FQ-6555", "TCGA-FQ-6558", "TCGA-FQ-6559"]
        oHandle.write(json.dumps(J, indent=-1))
    return
def CAVMid(dir, outDir, cancer, log, REALRUN):
    print cancer, sys._getframe().f_code.co_name
    ignore = 1
    bookDic = cgWalk(dir, ignore)
    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)
    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquot uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.system("mkdir " + outDir)

    for map in missingMaps:
        print map
        sMap = SampleMapNew(None, map)
        for name in missingMaps[map]:
            samples = []
            intDic = {}     #keyed on CAVMid
            sampleDic = {}  #keyed on original sample id
            obj = bookDic[name]
            print obj["name"]
            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")
                fin = open(outfile + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json " + cFoutfile + ".json")
                if REALRUN == -1:
                    continue
                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue
                fin = open(obj['path'], 'r')
                fin.readline()
                for line in fin.readlines():
                    sample = string.split(line, "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                fin = open(obj['path'], 'r')
                fout = open(outfile, 'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data = string.split(line, "\t")
                    sample = data[0]
                    try:
                        fout.write(sampleDic[sample] + "\t")
                        fout.write(string.join(data[1:], "\t"))
                    except:
                        fout.write(line)
                fout.close()
            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)
                fin.close()
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")
                if REALRUN != 1:
                    continue
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)
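
#A minimal driver sketch, not part of the original scripts: it shows how these entry
#points might be chained together.  The directory paths, the REALRUN/CAVM/TCGA flag
#values, and the assumption that runFlatten's output repo is a suitable input for
#convertCAVM are illustrative only.
if __name__ == "__main__":
    repoDir = "./cgData_repo/"   #hypothetical input cgData repository
    flatDir = "./cgData_flat/"   #hypothetical flattened output directory
    cavmDir = "./cavm_out/"      #hypothetical CAVM-converted output directory

    #REALRUN=1 appears to trigger the full data rewrite (0 = metadata/clinical only)
    if runFlatten(repoDir, flatDir, 1, False):
        convertCAVM(flatDir, cavmDir, 1, CAVM=1, TCGA=1)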