def CAVMid(dir, outDir, cancer, log, REALRUN):
    """Copy the datasets found under *dir* into *outDir* with remapped ids.

    The datasets are discovered with ``cgWalk`` and grouped by sample map.
    For every dataset the ``.json`` metadata file is copied next to the
    output file. The data file itself is then handled according to its type:

    * ``clinicalMatrix`` / ``mutationVector``: the first column of every data
      row is replaced by the id found in the dictionary filled in by
      ``buildSampleDic``; rows whose id is not in that dictionary are copied
      unchanged.
    * ``genomicMatrix``: the sample ids are read from the header row and the
      conversion is delegated to ``process``.

    Args:
        dir: directory handed to ``cgWalk``.
        outDir: output location. It is concatenated directly with file names,
            so it is expected to end with a path separator.
        cancer: cohort label; only printed here.
        log: unused in this function.
        REALRUN: ``-1`` copies metadata only; ``0`` additionally rewrites
            clinicalMatrix data (mutationVector data is skipped); ``1`` also
            processes genomicMatrix data.

    Exits the interpreter via ``sys.exit()`` when a genomicMatrix header
    contains an empty sample id.
    """
    print("%s %s" % (cancer, sys._getframe().f_code.co_name))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # Fold the existing maps in: from here on missingMaps holds every map.
    for mapName in existMaps:
        if mapName not in missingMaps:
            missingMaps[mapName] = existMaps[mapName]

    # all aliquot uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # The return value was never used. NOTE(review): the call is kept in case
    # it has side effects -- confirm and drop it if it does not.
    TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    for mapName in missingMaps:
        print(mapName)
        sMap = SampleMapNew(None, mapName)

        for name in missingMaps[mapName]:
            samples = []
            intDic = {}  # keyed on CAVMid
            sampleDic = {}  # keyed on original sample id
            obj = bookDic[name]

            print(obj["name"])

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                # NOTE: shell copy; paths containing spaces or shell
                # metacharacters are not supported.
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                with open(outfile + ".json", 'r') as fin:
                    J = json.load(fin)

                if ":clinicalFeature" in J:
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json "
                              + cFoutfile + ".json")

                if REALRUN == -1:
                    continue
                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                # First pass: collect the distinct, non-empty first-column
                # ids in order of first appearance.
                seen = set()
                with open(obj['path'], 'r') as fin:
                    fin.readline()  # skip the header row
                    for line in fin:
                        sample = line.split("\t")[0]
                        if sample != "" and sample not in seen:
                            seen.add(sample)
                            samples.append(sample)

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                # Second pass: rewrite the first column of every data row.
                with open(obj['path'], 'r') as fin:
                    with open(outfile, 'w') as fout:
                        fout.write(fin.readline())
                        for line in fin:
                            data = line.split("\t")
                            try:
                                row = (sampleDic[data[0]] + "\t"
                                       + "\t".join(data[1:]))
                            except (KeyError, TypeError):
                                # Id without a usable mapping: keep the row
                                # exactly as it was read.
                                row = line
                            fout.write(row)

            if obj['type'] == "genomicMatrix":
                # 'U' = universal-newline mode (Python 2).
                with open(obj['path'], 'U') as fin:
                    header = fin.readline().rstrip("\n")

                for sample in header.split("\t")[1:]:
                    if sample == "":
                        print("%s has bad empty sample id" % (name,))
                        sys.exit()
                    samples.append(sample)

                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)
def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    """Build and store the sample map(s) for the datasets found under *dir*.

    For every sample map name reported by ``collectSampleMaps`` /
    ``collectMissingSampleMaps`` this function

    1. gathers the sample ids of all datasets attached to that map
       (header row of a ``genomicMatrix``, first column of a
       ``mutationVector`` / ``clinicalMatrix``),
    2. adds parent/child links for each id to a ``SampleMapNew`` object and
       registers the id returned by ``TCGAUtil.barcode_IntegrationId`` with
       an ``IntegrationId`` object,
    3. writes ``<outDir><cancer>/<map>``, ``integrationID`` (both only when
       ``REALRUN == 1``), ``integrationID.json`` and ``<map>.json``.

    Args:
        dir: directory handed to ``cgWalk``.
        outDir: output prefix; concatenated directly with *cancer*, so it is
            expected to end with a path separator.
        cancer: cohort code, used for the output directory and as key into
            the ``TCGAUtil`` lookup tables.
        log: unused in this function.
        REALRUN: datasets are only scanned and the map / id files only
            written when this equals ``1``; the two ``.json`` files are
            written regardless.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm the block boundaries, in
    particular that both JSON files are written unconditionally inside the
    per-map loop.
    """
    print("%s %s" % (cancer, __name__))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # Fold the existing maps in: from here on missingMaps is *all* the maps.
    for mapName in existMaps:
        if mapName not in missingMaps:
            missingMaps[mapName] = existMaps[mapName]

    # all aliquot uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # The return value was never used. NOTE(review): the call is kept in case
    # it has side effects -- confirm and drop it if it does not.
    TCGAUtil.uuid_Sample_all()

    for mapName in missingMaps:
        print(mapName)
        print(missingMaps[mapName])
        sMap = SampleMapNew(None, mapName)

        # integration id
        intName = mapName + ".integrationID"
        if intName in bookDic:
            with open(bookDic[intName]["path"], 'r') as fin:
                integrationID = IntegrationId(intName, fin)
        else:
            integrationID = IntegrationId(intName)

        samples = []
        seen = set()  # O(1) membership test alongside the ordered list
        for name in missingMaps[mapName]:
            if REALRUN != 1:
                continue
            print(name)
            obj = bookDic[name]

            if obj['type'] == "genomicMatrix":
                # 'U' = universal-newline mode (Python 2).
                with open(obj['path'], 'U') as fin:
                    header = fin.readline().rstrip("\n")
                candidates = header.split("\t")[1:]
                for sample in candidates:
                    if sample == "":
                        print("%s has bad empty sample id" % (name,))
            elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                # Distinct non-empty first-column values, skipping the real
                # header row. This replaces `cut | sort | uniq > .tmp`, which
                # skipped whichever line happened to sort first (not
                # necessarily the header) and left a .tmp file behind.
                firstColumn = set()
                with open(obj['path'], 'r') as fin:
                    fin.readline()
                    for line in fin:
                        firstColumn.add(line.split("\t")[0].strip())
                firstColumn.discard("")
                candidates = sorted(firstColumn)
            else:
                continue

            for sample in candidates:
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        for sample in samples:
            if REALRUN != 1:
                continue

            # TCGA uuid handling: ids that are not barcodes are looked up in
            # the aliquot table and linked (lower- and upper-case) below the
            # barcode they resolve to.
            if sample[0:4] != "TCGA":
                key = sample.lower()
                if key not in aliquote_dic:
                    print(sample)
                    continue
                TCGAbarcode = aliquote_dic[key]
                sMap.addLink(TCGAbarcode, sample.lower())
                sMap.addLink(TCGAbarcode, sample.upper())
                sample = TCGAbarcode

            # TCGA barcode trick: split a 3-character fourth field into its
            # first two characters and its last character.
            parts = sample.split("-")
            if len(parts) > 3 and len(parts[3]) == 3:
                parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]

            # Link each successively longer prefix to the next one. The piece
            # at index 4 is appended without a dash.
            parent = "-".join(parts[0:3])
            for i in range(3, len(parts)):
                if i != 4:
                    child = parent + "-" + parts[i]
                else:
                    child = parent + parts[i]
                sMap.addLink(parent, child)
                parent = child

            intID = TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)

        # output sampleMap
        cancerDir = outDir + cancer + "/"
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        if not os.path.exists(cancerDir):
            os.makedirs(cancerDir)

        if REALRUN == 1:
            with open(cancerDir + mapName, "w") as oHandle:
                sMap.store(oHandle)
            # output integrationID
            with open(cancerDir + "integrationID", "w") as oHandle:
                integrationID.store(oHandle)

        # output integrationID json
        J = {}
        J['name'] = intName
        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"
        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = datetime.date.today().isoformat()
        with open(cancerDir + "integrationID.json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

        # output sampleMap json: the same dictionary is reused, so the keys
        # set above carry over unless overwritten here.
        J['name'] = mapName
        J['type'] = "sampleMap"
        J["version"] = datetime.date.today().isoformat()
        J["cgDataVersion"] = 1
        J[":integrationId"] = intName

        # add info for old clinical data
        if os.path.exists(cancerDir + "oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"

        # special code
        if (cancer in TCGAUtil.featurePriority
                and len(TCGAUtil.featurePriority[cancer]) >= 5):
            J["VIS"] = 5

        # blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = [
                "TCGA-FQ-6551",
                "TCGA-FQ-6552",
                "TCGA-FQ-6553",
                "TCGA-FQ-6554",
                "TCGA-FQ-6555",
                "TCGA-FQ-6558",
                "TCGA-FQ-6559",
            ]

        with open(cancerDir + mapName + ".json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))
# Top-level script statements: import TCGAUtil and call four of its uuid_*
# functions in sequence.
import TCGAUtil
# Only the first two return values are bound, both to `dic`, so the second
# assignment overwrites the first.
# NOTE(review): presumably these calls are run for their side effects --
# confirm against TCGAUtil.
dic = TCGAUtil.uuid_Aliquot_all()
dic = TCGAUtil.uuid_Sample_all()
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a clinical matrix that assigns *value* to every cohort sample.

    When *REALRUN* is truthy, the genomicMatrix datasets found under *inDir*
    are scanned for sample ids (header row only; mutationVector files are
    not scanned because that takes too long). Each id is converted with
    ``TCGAUtil.barcode_IntegrationId`` and ``<outDir><cancer>/<var>`` is
    written with one ``id<TAB>value`` row per distinct converted id. The
    function returns early, writing nothing, unless exactly one sample map
    is found.

    The accompanying ``<var>.json`` metadata is then written, and when
    *doDerived* is truthy ``doDerivedCancer`` is called for the combined
    cohort that *cancer* belongs to (LUNG, COADREAD or GBMLGG).

    Args:
        var: name of the phenotype column; also the output file name.
        value: string written for every sample.
        inDir: directory handed to ``cgWalk``.
        outDir: output prefix; concatenated directly with *cancer*, so it is
            expected to end with a path separator.
        cancer: cohort code.
        REALRUN: truthy to (re)generate the data file.
        doDerived: truthy to also update the derived cohort.

    Exits the interpreter via ``sys.exit()`` when a genomicMatrix header
    contains an empty sample id.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; the JSON and derived-cohort steps are
    assumed to sit outside the ``if REALRUN`` block -- confirm.
    """
    print(inDir)
    print(outDir)

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # Fold the existing maps in: from here on missingMaps holds them all.
        for mapName in existMaps:
            if mapName not in missingMaps:
                missingMaps[mapName] = existMaps[mapName]

        # all aliquot uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # The return value was never used. NOTE(review): the call is kept in
        # case it has side effects -- confirm and drop it if it does not.
        TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        mapName = next(iter(missingMaps))
        print(mapName)

        samples = []
        seen = set()  # O(1) membership test alongside the ordered list
        for name in missingMaps[mapName]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            # 'U' = universal-newline mode (Python 2).
            with open(obj['path'], 'U') as fin:
                header = fin.readline().rstrip("\n")
            for sample in header.split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        intIds = set()
        for sample in samples:
            # TCGA uuid handling: ids that are not barcodes are looked up in
            # the aliquot table and used unchanged when they are not found.
            TCGAbarcode = sample
            if sample[0:4] != "TCGA":
                key = sample.lower()
                if key in aliquote_dic:
                    TCGAbarcode = aliquote_dic[key]

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:
                # id is on patient level, above integration level
                continue
            intIds.add(intID)

        with open(outDir + cancer + "/" + var, "w") as fout:
            fout.write("sample\t" + var + "\n")
            # Sorted so that the output is reproducible between runs.
            for intID in sorted(intIds):
                fout.write(intID + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                   + " (" + cancer + ")")

    with open(outDir + cancer + "/" + var + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        derivedCohorts = (
            ("LUNG", ("LUAD", "LUSC")),
            ("COADREAD", ("COAD", "READ")),
            ("GBMLGG", ("GBM", "LGG")),
        )
        for derived_cancer, members in derivedCohorts:
            if cancer in members:
                doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
# Top-level script statements: import TCGAUtil and call four of its uuid_*
# functions in sequence.
import TCGAUtil
# Only the first two return values are bound, both to `dic`, so the second
# assignment overwrites the first.
# NOTE(review): presumably these calls are run for their side effects --
# confirm against TCGAUtil.
dic=TCGAUtil.uuid_Aliquot_all()
dic=TCGAUtil.uuid_Sample_all()
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a clinical matrix that assigns *value* to every cohort sample.

    When *REALRUN* is truthy, the genomicMatrix datasets found under *inDir*
    are scanned for sample ids (header row only; mutationVector files are
    not scanned because that takes too long). Each id is converted with
    ``TCGAUtil.barcode_IntegrationId`` and ``<outDir><cancer>/<var>`` is
    written with one ``id<TAB>value`` row per distinct converted id. The
    function returns early, writing nothing, unless exactly one sample map
    is found.

    The accompanying ``<var>.json`` metadata is then written, and when
    *doDerived* is truthy ``doDerivedCancer`` is called for the combined
    cohort that *cancer* belongs to (LUNG, COADREAD or GBMLGG).

    Args:
        var: name of the phenotype column; also the output file name.
        value: string written for every sample.
        inDir: directory handed to ``cgWalk``.
        outDir: output prefix; concatenated directly with *cancer*, so it is
            expected to end with a path separator.
        cancer: cohort code.
        REALRUN: truthy to (re)generate the data file.
        doDerived: truthy to also update the derived cohort.

    Exits the interpreter via ``sys.exit()`` when a genomicMatrix header
    contains an empty sample id.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; the JSON and derived-cohort steps are
    assumed to sit outside the ``if REALRUN`` block -- confirm.
    """
    print(inDir)
    print(outDir)

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # Fold the existing maps in: from here on missingMaps holds them all.
        for mapName in existMaps:
            if mapName not in missingMaps:
                missingMaps[mapName] = existMaps[mapName]

        # all aliquot uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # The return value was never used. NOTE(review): the call is kept in
        # case it has side effects -- confirm and drop it if it does not.
        TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        mapName = next(iter(missingMaps))
        print(mapName)

        samples = []
        seen = set()  # O(1) membership test alongside the ordered list
        for name in missingMaps[mapName]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            # 'U' = universal-newline mode (Python 2).
            with open(obj['path'], 'U') as fin:
                header = fin.readline().rstrip("\n")
            for sample in header.split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        intIds = set()
        for sample in samples:
            # TCGA uuid handling: ids that are not barcodes are looked up in
            # the aliquot table and used unchanged when they are not found.
            TCGAbarcode = sample
            if sample[0:4] != "TCGA":
                key = sample.lower()
                if key in aliquote_dic:
                    TCGAbarcode = aliquote_dic[key]

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:
                # id is on patient level, above integration level
                continue
            intIds.add(intID)

        with open(outDir + cancer + "/" + var, "w") as fout:
            fout.write("sample\t" + var + "\n")
            # Sorted so that the output is reproducible between runs.
            for intID in sorted(intIds):
                fout.write(intID + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                   + " (" + cancer + ")")

    with open(outDir + cancer + "/" + var + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        derivedCohorts = (
            ("LUNG", ("LUAD", "LUSC")),
            ("COADREAD", ("COAD", "READ")),
            ("GBMLGG", ("GBM", "LGG")),
        )
        for derived_cancer, members in derivedCohorts:
            if cancer in members:
                doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
def CAVMid(dir, outDir, cancer, log, REALRUN):
    """Copy the datasets found under *dir* into *outDir* with remapped ids.

    The datasets are discovered with ``cgWalk`` and grouped by sample map.
    For every dataset the ``.json`` metadata file is copied next to the
    output file. The data file itself is then handled according to its type:

    * ``clinicalMatrix`` / ``mutationVector``: the first column of every data
      row is replaced by the id found in the dictionary filled in by
      ``buildSampleDic``; rows whose id is not in that dictionary are copied
      unchanged.
    * ``genomicMatrix``: the sample ids are read from the header row and the
      conversion is delegated to ``process``.

    Args:
        dir: directory handed to ``cgWalk``.
        outDir: output location. It is concatenated directly with file names,
            so it is expected to end with a path separator.
        cancer: cohort label; only printed here.
        log: unused in this function.
        REALRUN: ``-1`` copies metadata only; ``0`` additionally rewrites
            clinicalMatrix data (mutationVector data is skipped); ``1`` also
            processes genomicMatrix data.

    Exits the interpreter via ``sys.exit()`` when a genomicMatrix header
    contains an empty sample id.
    """
    print("%s %s" % (cancer, sys._getframe().f_code.co_name))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # Fold the existing maps in: from here on missingMaps holds every map.
    for mapName in existMaps:
        if mapName not in missingMaps:
            missingMaps[mapName] = existMaps[mapName]

    # all aliquot uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # The return value was never used. NOTE(review): the call is kept in case
    # it has side effects -- confirm and drop it if it does not.
    TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    for mapName in missingMaps:
        print(mapName)
        sMap = SampleMapNew(None, mapName)

        for name in missingMaps[mapName]:
            samples = []
            intDic = {}  # keyed on CAVMid
            sampleDic = {}  # keyed on original sample id
            obj = bookDic[name]

            print(obj["name"])

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                # NOTE: shell copy; paths containing spaces or shell
                # metacharacters are not supported.
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                with open(outfile + ".json", 'r') as fin:
                    J = json.load(fin)

                if ":clinicalFeature" in J:
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json "
                              + cFoutfile + ".json")

                if REALRUN == -1:
                    continue
                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                # First pass: collect the distinct, non-empty first-column
                # ids in order of first appearance.
                seen = set()
                with open(obj['path'], 'r') as fin:
                    fin.readline()  # skip the header row
                    for line in fin:
                        sample = line.split("\t")[0]
                        if sample != "" and sample not in seen:
                            seen.add(sample)
                            samples.append(sample)

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                # Second pass: rewrite the first column of every data row.
                with open(obj['path'], 'r') as fin:
                    with open(outfile, 'w') as fout:
                        fout.write(fin.readline())
                        for line in fin:
                            data = line.split("\t")
                            try:
                                row = (sampleDic[data[0]] + "\t"
                                       + "\t".join(data[1:]))
                            except (KeyError, TypeError):
                                # Id without a usable mapping: keep the row
                                # exactly as it was read.
                                row = line
                            fout.write(row)

            if obj['type'] == "genomicMatrix":
                # 'U' = universal-newline mode (Python 2).
                with open(obj['path'], 'U') as fin:
                    header = fin.readline().rstrip("\n")

                for sample in header.split("\t")[1:]:
                    if sample == "":
                        print("%s has bad empty sample id" % (name,))
                        sys.exit()
                    samples.append(sample)

                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)