def add_col_PseudoSample(clinMatrix, col):
    """Copy the values of column ``col`` onto the integration-id rows.

    For every row of ``clinMatrix`` that holds a non-empty value in ``col``,
    the row id is converted with ``TCGAUtil.barcode_IntegrationId`` and the
    value is stored on the row carrying that integration id; the row is
    created when the matrix does not have it yet. The matrix is validated
    afterwards and a message is printed when validation returns ``False``.

    Args:
        clinMatrix: clinical matrix object providing ``getROWs``, ``getDATA``,
            ``hasRow``, ``setDATA``, ``addNewRows`` and ``validate``.
            Row ids are assumed to be TCGA barcodes.
        col: name of the column to propagate.
    """
    # Iterate over a snapshot: rows may be appended to the matrix in the loop.
    for row in list(clinMatrix.getROWs()):
        st = clinMatrix.getDATA(row, col)
        if st is None or st == "":
            continue

        # assuming sample ids are TCGA barcode
        integration_id = TCGAUtil.barcode_IntegrationId(row)
        if integration_id is None:
            # No integration id could be derived for this row id (presumably
            # an id above the integration level -- TODO confirm); writing to
            # a row named None would only corrupt the matrix.
            continue

        if clinMatrix.hasRow(integration_id):
            clinMatrix.setDATA(integration_id, col, st)
        else:
            clinMatrix.addNewRows([integration_id], {col: st})

    r = clinMatrix.validate()
    # Kept as an equality test so that a None result is not reported as a
    # failure.
    if r == False:
        print("add pseudoSample clinical infor %s fail" % (col,))
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Link every sample into the barcode hierarchy and index it by integration id.

    Ids that do not start with ``TCGA`` are treated as aliquot UUIDs and
    translated to a barcode through ``aliquote_dic`` (keyed by the lower-case
    UUID). A UUID missing from ``aliquote_dic`` is printed and skipped.

    For each remaining sample the chain
    ``patient -> sample type -> sample+vial -> ... -> full barcode`` is
    registered in ``sMap`` with ``addLink(parent, child)``.

    Args:
        samples: iterable of sample ids (TCGA barcodes or aliquot UUIDs).
        sMap: sample map object providing ``addLink(parent, child)``.
        intDic: dict updated in place, integration id -> list of original ids.
        sampleDic: dict updated in place, original id -> integration id.
        aliquote_dic: dict, lower-case aliquot UUID -> TCGA barcode.
    """
    for sample in samples:
        # Original id as given by the caller; this is what the output
        # dictionaries store, even when it gets translated to a barcode below.
        uuid = sample

        # TCGA uuid handling
        if sample[0:4] != "TCGA":
            key = sample.lower()
            if key not in aliquote_dic:
                # Unknown id: report it and move on. Falling through would
                # read a barcode that is either undefined or left over from
                # a previous sample.
                print(sample)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, sample)
            sample = barcode

        # do TCGA barcode trick
        parts = sample.split("-")
        parent = "-".join(parts[0:3])
        if len(parts) > 3 and len(parts[3]) == 3:
            # Three-character 4th field (e.g. "01A"): insert a level for its
            # first two characters before the full field.
            child = parent + "-" + parts[3][0:2]
            sMap.addLink(parent, child)
            parent = child

        # For an id with three fields or fewer, child equals parent here and
        # the link is a self-link, as in the original code.
        child = "-".join(parts[0:4])
        sMap.addLink(parent, child)
        parent = child

        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(sample)
        intDic.setdefault(intID, []).append(uuid)
        sampleDic[uuid] = intID
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Link every sample into the barcode hierarchy and index it by integration id.

    Ids that do not start with ``TCGA`` are treated as aliquot UUIDs and
    translated to a barcode through ``aliquote_dic`` (keyed by the lower-case
    UUID). A UUID missing from ``aliquote_dic`` is printed and skipped.

    For each remaining sample the chain
    ``patient -> sample type -> sample+vial -> ... -> full barcode`` is
    registered in ``sMap`` with ``addLink(parent, child)``.

    Args:
        samples: iterable of sample ids (TCGA barcodes or aliquot UUIDs).
        sMap: sample map object providing ``addLink(parent, child)``.
        intDic: dict updated in place, integration id -> list of original ids.
        sampleDic: dict updated in place, original id -> integration id.
        aliquote_dic: dict, lower-case aliquot UUID -> TCGA barcode.
    """
    for sample in samples:
        # Original id as given by the caller; this is what the output
        # dictionaries store, even when it gets translated to a barcode below.
        uuid = sample

        # TCGA uuid handling
        if sample[0:4] != "TCGA":
            key = sample.lower()
            if key not in aliquote_dic:
                # Unknown id: report it and move on. Falling through would
                # read a barcode that is either undefined or left over from
                # a previous sample.
                print(sample)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, sample)
            sample = barcode

        # do TCGA barcode trick
        parts = sample.split("-")
        parent = "-".join(parts[0:3])
        if len(parts) > 3 and len(parts[3]) == 3:
            # Three-character 4th field (e.g. "01A"): insert a level for its
            # first two characters before the full field.
            child = parent + "-" + parts[3][0:2]
            sMap.addLink(parent, child)
            parent = child

        # For an id with three fields or fewer, child equals parent here and
        # the link is a self-link, as in the original code.
        child = "-".join(parts[0:4])
        sMap.addLink(parent, child)
        parent = child

        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(sample)
        intDic.setdefault(intID, []).append(uuid)
        sampleDic[uuid] = intID
def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    """Build and write the sample map(s) and integration-id list for ``cancer``.

    For every sample map name collected from the datasets found under
    ``dir``, the sample ids of the member datasets are gathered, each id is
    linked into the barcode hierarchy of a ``SampleMapNew`` object and its
    integration id is recorded. Results go to ``outDir + cancer + "/"``:

    * ``<map>`` and ``integrationID`` (only when ``REALRUN == 1``),
    * ``integrationID.json`` and ``<map>.json`` (always).

    Args:
        dir: root directory handed to ``cgWalk``.
        outDir: output root. Paths are built by plain concatenation, so it
            is expected to end with a path separator.
        cancer: cancer code; names the output sub-directory and is used as a
            key into the ``TCGAUtil`` lookup tables.
        log: not used by this function.
        REALRUN: when equal to 1 the datasets are read and the data files are
            written; otherwise only the ``.json`` metadata is written.
    """
    # print status
    print("%s %s" % (cancer, __name__))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # Fold the existing maps in: from here on missingMaps holds all the maps.
    for map_name in existMaps:
        if map_name not in missingMaps:
            missingMaps[map_name] = existMaps[map_name]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # NOTE(review): sample_dic is never read below; the call is kept because
    # whether it has side effects cannot be seen from here.
    sample_dic = TCGAUtil.uuid_Sample_all()

    cancer_dir = outDir + cancer + "/"

    for map_name in missingMaps:
        print(map_name)
        print(missingMaps[map_name])
        sMap = SampleMapNew(None, map_name)

        # integration id list, loaded from the existing file when there is one
        intName = map_name + ".integrationID"
        if intName in bookDic:
            with open(bookDic[intName]["path"], 'r') as fin:
                integrationID = IntegrationId(intName, fin)
        else:
            integrationID = IntegrationId(intName)

        # Collect the distinct sample ids of the member datasets, in first
        # seen order. Nothing is read unless this is a real run.
        samples = []
        seen = set()
        names = missingMaps[map_name] if REALRUN == 1 else []
        for name in names:
            print(name)
            obj = bookDic[name]

            if obj['type'] == "genomicMatrix":
                # Sample ids are the header columns after the first one.
                with open(obj['path'], 'U') as fin:
                    header = fin.readline()
                # rstrip instead of [:-1]: a header without a trailing
                # newline must not lose the last character of its last id.
                for sample in header.rstrip("\n").split("\t")[1:]:
                    if sample == "":
                        print("%s has bad empty sample id" % (name,))
                    if sample not in seen:
                        seen.add(sample)
                        samples.append(sample)

            elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                # Sample ids are the distinct values of the first column.
                # NOTE(review): the path is pasted unquoted into a shell
                # command and ".tmp" lives in the current directory.
                path = obj['path']
                os.system("cut -f 1 " + path + " |sort |uniq > .tmp")
                with open('.tmp', 'r') as fin:
                    # NOTE(review): the first line is dropped as if it were
                    # the header, but the lines have been sorted, so it is
                    # not guaranteed to be the header row.
                    fin.readline()
                    for line in fin.readlines():
                        sample = line.strip()
                        if sample == "":
                            break
                        if sample not in seen:
                            seen.add(sample)
                            samples.append(sample)

        for sample in samples:
            # TCGA uuid handling
            if sample[0:4] != "TCGA":
                key = sample.lower()
                if key not in aliquote_dic:
                    # unknown id: report and skip
                    print(sample)
                    continue
                barcode = aliquote_dic[key]
                # register both spellings of the uuid under its barcode
                sMap.addLink(barcode, sample.lower())
                sMap.addLink(barcode, sample.upper())
                sample = barcode

            # do TCGA barcode trick: a three-character 4th field ("01A") is
            # split into sample type ("01") and vial ("A") levels
            parts = sample.split("-")
            if len(parts) > 3 and len(parts[3]) == 3:
                parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]

            parent = "-".join(parts[0:3])
            for i in range(3, len(parts)):
                # Index 4 is appended without a dash so that type + vial
                # reads "01A" again.
                # NOTE(review): this also applies when the 4th field was not
                # split above; confirm such barcodes never have a 5th field.
                if i != 4:
                    child = parent + "-" + parts[i]
                else:
                    child = parent + parts[i]
                # add parent child
                sMap.addLink(parent, child)
                parent = child

            integrationID.addId(TCGAUtil.barcode_IntegrationId(sample))

        # output directories
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        if not os.path.exists(cancer_dir):
            os.makedirs(cancer_dir)

        if REALRUN == 1:
            # output sampleMap
            with open(cancer_dir + map_name, "w") as oHandle:
                sMap.store(oHandle)
            # output integrationID
            with open(cancer_dir + "integrationID", "w") as oHandle:
                integrationID.store(oHandle)

        # output integrationID json
        J = {}
        J['name'] = intName
        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"
        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = datetime.date.today().isoformat()
        with open(cancer_dir + "integrationID.json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

        # output sample map json: the same dictionary is reused, so the
        # fields not overwritten here carry over from the integration id json
        J['name'] = map_name
        J['type'] = "sampleMap"
        J["version"] = datetime.date.today().isoformat()
        J["cgDataVersion"] = 1
        J[":integrationId"] = intName

        # add info for old clinical data
        if os.path.exists(cancer_dir + "oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"

        # special code
        if (cancer in TCGAUtil.featurePriority
                and len(TCGAUtil.featurePriority[cancer]) >= 5):
            J["VIS"] = 5

        # blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = [
                "TCGA-FQ-6551",
                "TCGA-FQ-6552",
                "TCGA-FQ-6553",
                "TCGA-FQ-6554",
                "TCGA-FQ-6555",
                "TCGA-FQ-6558",
                "TCGA-FQ-6559"]

        with open(cancer_dir + map_name + ".json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a clinical matrix giving every sample of the cohort the same ``value``.

    On a real run the sample ids are read from the header line of every
    ``genomicMatrix`` dataset of the (single) sample map found under
    ``inDir``, converted to integration ids, and written to
    ``outDir + cancer + "/" + var`` as ``<integration id>\\t<value>`` rows.
    The ``.json`` metadata of that file is written whether or not this is a
    real run.

    The function returns without writing anything when a real run does not
    find exactly one sample map. It terminates the process with
    ``sys.exit()`` when a dataset header contains an empty sample id.

    Args:
        var: name of the clinical variable; also the output file name.
        value: string written for every sample (concatenated as is).
        inDir: root directory handed to ``cgWalk``.
        outDir: output root. Paths are built by plain concatenation, so it
            is expected to end with a path separator.
        cancer: cancer code; names the output sub-directory.
        REALRUN: when true the datasets are read and the data file is written.
        doDerived: when true, ``doDerivedCancer`` is also called for the
            combined cohort that ``cancer`` belongs to, if any.
    """
    print(inDir)
    print(outDir)

    outfile = outDir + cancer + "/" + var

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # Fold the existing maps in: missingMaps then holds all the maps.
        for map_name in existMaps:
            if map_name not in missingMaps:
                missingMaps[map_name] = existMaps[map_name]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): sample_dic is never read below; the call is kept
        # because whether it has side effects cannot be seen from here.
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        map_name = next(iter(missingMaps))
        print(map_name)

        # Distinct sample ids of the genomic matrices, in first seen order.
        samples = []
        seen = set()
        for name in missingMaps[map_name]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            # Sample ids are the header columns after the first one.
            with open(obj['path'], 'U') as fin:
                header = fin.readline()
            # rstrip instead of [:-1]: a header without a trailing newline
            # must not lose the last character of its last id.
            for sample in header.rstrip("\n").split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        # Integration ids, de-duplicated. A dict with dummy values is kept
        # (rather than a set) so the row order of the output is unchanged.
        intDic = {}
        for sample in samples:
            # TCGA uuid handling: a known aliquot uuid is translated to its
            # barcode, any other id is passed on unchanged
            if sample[0:4] != "TCGA":
                TCGAbarcode = aliquote_dic.get(sample.lower(), sample)
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:
                # ids is on patient level above integration level
                continue
            intDic[intID] = ""

        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                   + " (" + cancer + ")")
    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # combined cohorts and the cancers they are made of
        derived_groups = (
            ("LUNG", ["LUAD", "LUSC"]),
            ("COADREAD", ["COAD", "READ"]),
            ("GBMLGG", ["GBM", "LGG"]),
        )
        for derived_cancer, members in derived_groups:
            if cancer in members:
                doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a clinical matrix giving every sample of the cohort the same ``value``.

    On a real run the sample ids are read from the header line of every
    ``genomicMatrix`` dataset of the (single) sample map found under
    ``inDir``, converted to integration ids, and written to
    ``outDir + cancer + "/" + var`` as ``<integration id>\\t<value>`` rows.
    The ``.json`` metadata of that file is written whether or not this is a
    real run.

    The function returns without writing anything when a real run does not
    find exactly one sample map. It terminates the process with
    ``sys.exit()`` when a dataset header contains an empty sample id.

    Args:
        var: name of the clinical variable; also the output file name.
        value: string written for every sample (concatenated as is).
        inDir: root directory handed to ``cgWalk``.
        outDir: output root. Paths are built by plain concatenation, so it
            is expected to end with a path separator.
        cancer: cancer code; names the output sub-directory.
        REALRUN: when true the datasets are read and the data file is written.
        doDerived: when true, ``doDerivedCancer`` is also called for the
            combined cohort that ``cancer`` belongs to, if any.
    """
    print(inDir)
    print(outDir)

    outfile = outDir + cancer + "/" + var

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # Fold the existing maps in: missingMaps then holds all the maps.
        for map_name in existMaps:
            if map_name not in missingMaps:
                missingMaps[map_name] = existMaps[map_name]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): sample_dic is never read below; the call is kept
        # because whether it has side effects cannot be seen from here.
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        map_name = next(iter(missingMaps))
        print(map_name)

        # Distinct sample ids of the genomic matrices, in first seen order.
        samples = []
        seen = set()
        for name in missingMaps[map_name]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            # Sample ids are the header columns after the first one.
            with open(obj['path'], 'U') as fin:
                header = fin.readline()
            # rstrip instead of [:-1]: a header without a trailing newline
            # must not lose the last character of its last id.
            for sample in header.rstrip("\n").split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        # Integration ids, de-duplicated. A dict with dummy values is kept
        # (rather than a set) so the row order of the output is unchanged.
        intDic = {}
        for sample in samples:
            # TCGA uuid handling: a known aliquot uuid is translated to its
            # barcode, any other id is passed on unchanged
            if sample[0:4] != "TCGA":
                TCGAbarcode = aliquote_dic.get(sample.lower(), sample)
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:
                # ids is on patient level above integration level
                continue
            intDic[intID] = ""

        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                   + " (" + cancer + ")")
    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # combined cohorts and the cancers they are made of
        derived_groups = (
            ("LUNG", ["LUAD", "LUSC"]),
            ("COADREAD", ["COAD", "READ"]),
            ("GBMLGG", ["GBM", "LGG"]),
        )
        for derived_cancer, members in derived_groups:
            if cancer in members:
                doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)