def flattenForClinicalFeature(sMap, bookDic):
    """Merge every clinicalFeature attached to a sample map into one object.

    For each dataset of type "clinicalMatrix" that belongs to the sample map
    and references a ':clinicalFeature' entry, a ClinicalFeatureNew object is
    built.  All of them are serialized one after another into the scratch file
    ".tmp" (via the scratch file ".tmptmp", both in the current working
    directory) and the concatenation is parsed back as a single
    ClinicalFeatureNew named "<sampleMapBaseName>_clinicalFeature" (after
    trackName_fix).

    Args:
        sMap: sample map object; getName() is used as the key into bookDic.
        bookDic: mapping from dataset name to its JSON-like metadata dict
            ('type', 'path', 'name', optionally ':clinicalFeature'); the
            sample map's own entry may carry a "VIS" value.

    Returns:
        The merged ClinicalFeatureNew object, or 0 when the merged ".tmp"
        file does not pass isValid().
    """
    # NOTE(review): this file defines flattenForClinicalFeature more than
    # once; the later definition shadows the earlier one.  Consider keeping
    # a single copy.
    sampleMap = sMap.getName()

    clinFeatures = []
    for name in collectNamesBelongToSampleMap(bookDic, sampleMap):
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureJSON = bookDic[obj[':clinicalFeature']]
            # Fix: the feature's own name used to be stored in a misspelled
            # variable ("neme"), so the matrix dataset name was passed here.
            clinFeature = ClinicalFeatureNew(featureJSON['path'],
                                             featureJSON['name'])

        # The matrix object itself is not used afterwards; the call is kept
        # because the feature object is handed to the constructor.
        # NOTE(review): presumably this validates/updates clinFeature -
        # confirm before removing.
        ClinicalMatrixNew(obj['path'], obj['name'], False, clinFeature)

        if clinFeature:
            clinFeatures.append(clinFeature)

    # Concatenate the serialized features into ".tmp".  Opening with 'w'
    # truncates the file, so it exists (empty) even without any feature.
    # This replaces a shell call to `cat`, whose exit status was ignored.
    with open(".tmp", 'w') as merged:
        for clinF in clinFeatures:
            with open(".tmptmp", 'w') as part:
                clinF.store(part)
            with open(".tmptmp", 'r') as part:
                merged.write(part.read())

    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
    # The handle is closed on every path, including the invalid-file return.
    with open(".tmp", 'r') as fin:
        finalClinFeature = ClinicalFeatureNew(fin, jsonName)
        valid = finalClinFeature.isValid()
    if not valid:
        print("final clinFeature file .tmp is invalid")
        return 0

    # vis exceptions: the sample map's bookDic entry may override the
    # default visibility limit of 4.
    VIS_limit = 4
    if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
        VIS_limit = bookDic[sampleMap]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)

    finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureValueType("_PATIENT", "category")

    finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    return finalClinFeature
def flattenForClinicalFeature(sMap, bookDic):
    """Merge every clinicalFeature attached to a sample map into one object.

    For each dataset of type "clinicalMatrix" that belongs to the sample map
    and references a ':clinicalFeature' entry, a ClinicalFeatureNew object is
    built.  All of them are serialized one after another into the scratch file
    ".tmp" (via the scratch file ".tmptmp", both in the current working
    directory) and the concatenation is parsed back as a single
    ClinicalFeatureNew named "<sampleMapBaseName>_clinicalFeature" (after
    trackName_fix).

    Args:
        sMap: sample map object; getName() is used as the key into bookDic.
        bookDic: mapping from dataset name to its JSON-like metadata dict
            ('type', 'path', 'name', optionally ':clinicalFeature'); the
            sample map's own entry may carry a "VIS" value.

    Returns:
        The merged ClinicalFeatureNew object, or 0 when the merged ".tmp"
        file does not pass isValid().
    """
    # NOTE(review): this file defines flattenForClinicalFeature more than
    # once; the later definition shadows the earlier one.  Consider keeping
    # a single copy.
    sampleMap = sMap.getName()

    clinFeatures = []
    for name in collectNamesBelongToSampleMap(bookDic, sampleMap):
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureJSON = bookDic[obj[':clinicalFeature']]
            # Fix: the feature's own name used to be stored in a misspelled
            # variable ("neme"), so the matrix dataset name was passed here.
            clinFeature = ClinicalFeatureNew(featureJSON['path'],
                                             featureJSON['name'])

        # The matrix object itself is not used afterwards; the call is kept
        # because the feature object is handed to the constructor.
        # NOTE(review): presumably this validates/updates clinFeature -
        # confirm before removing.
        ClinicalMatrixNew(obj['path'], obj['name'], False, clinFeature)

        if clinFeature:
            clinFeatures.append(clinFeature)

    # Concatenate the serialized features into ".tmp".  Opening with 'w'
    # truncates the file, so it exists (empty) even without any feature.
    # This replaces a shell call to `cat`, whose exit status was ignored.
    with open(".tmp", 'w') as merged:
        for clinF in clinFeatures:
            with open(".tmptmp", 'w') as part:
                clinF.store(part)
            with open(".tmptmp", 'r') as part:
                merged.write(part.read())

    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
    # The handle is closed on every path, including the invalid-file return.
    with open(".tmp", 'r') as fin:
        finalClinFeature = ClinicalFeatureNew(fin, jsonName)
        valid = finalClinFeature.isValid()
    if not valid:
        print("final clinFeature file .tmp is invalid")
        return 0

    # vis exceptions: the sample map's bookDic entry may override the
    # default visibility limit of 4.
    VIS_limit = 4
    if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
        VIS_limit = bookDic[sampleMap]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)

    finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureValueType("_PATIENT", "category")

    finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    return finalClinFeature
def flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples):
    """Flatten all clinical matrices of one sample map into a single matrix.

    The function
      1. creates an empty matrix holding every node of the sample map,
      2. merges the columns of each clinicalMatrix dataset into it, in a
         priority order derived from the 'upToDate' / 'outOfDate' keys,
      3. merges the attached clinicalFeature objects through the scratch
         files ".tmp" / ".tmptmp" in the current working directory,
      4. derives the _TIME_TO_EVENT / _EVENT survival columns from features
         whose "sameAs" points at them,
      5. pushes clinical data down from the roots of the sample map,
      6. optionally keeps only samples with genomic data, then adds empty
         rows for genomic samples lacking clinical data and validates,
      7. removes blacklisted samples and the columns reported by
         findBadColsNotRemove(),
      8. adds the _PATIENT and _INTEGRATION columns and completes the
         clinicalFeature metadata.

    Args:
        sMap: sample map object (getName, getNodes, allRoots are used).
        bookDic: mapping from dataset name to its JSON-like metadata dict.
        onlyGenomicSamples: when true, rows without genomic data are dropped.

    Returns:
        On success the tuple (finalClinMatrix, finalClinMatrixJSON,
        finalClinFeature, finalClinFeatureJSON).  On failure a falsy value:
        False when merging a matrix fails, 0 for every other failure.
    """
    # NOTE(review): this file defines flattenEachSampleMap more than once;
    # the later definition shadows the earlier one.  Consider keeping a
    # single copy.
    sampleMap = sMap.getName()
    matrixJsonName = trackName_fix(
        sampleMapBaseName(sMap) + "_clinicalMatrix")

    finalClinMatrix = ClinicalMatrixNew(None, matrixJsonName)
    finalClinMatrixJSON = {
        "name": matrixJsonName,
        "type": "clinicalMatrix",
        "path": "",
        ":sampleMap": sampleMap,
    }

    clinFeatures = []
    finalClinFeatureJSON = None
    finalClinFeature = None

    # add all ids to sMap (the return value was never used)
    checkIdsAllIn(sMap, bookDic)

    # build initial clinical matrix with sampleMap ids, all with empty data
    if not finalClinMatrix.addNewRows(sMap.getNodes(), {}):
        print("fail to add all initial ids from sampleMap")

    # Processing order of the clinicalMatrix datasets:
    #   'upToDate' datasets first (highest version first), then datasets
    #   with neither key (reverse discovery order), then datasets whose
    #   'outOfDate' is yes/Yes/YES (discovery order).
    # A dataset with any other 'outOfDate' value and no 'upToDate' key is
    # skipped, exactly as before.
    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    datasetsOrdered = []
    upToDateSets = {}
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue
        if 'outOfDate' in obj and obj['outOfDate'] in ["yes", "Yes", "YES"]:
            datasetsOrdered.append(name)
        elif 'outOfDate' not in obj and 'upToDate' not in obj:
            datasetsOrdered.insert(0, name)
        if 'upToDate' in obj:
            upToDateSets[obj['upToDate']] = name
    for version in sorted(upToDateSets):
        datasetsOrdered.insert(0, upToDateSets[version])

    for name in datasetsOrdered:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        featureJSON = None
        if ':clinicalFeature' in obj:
            featureJSON = bookDic[obj[':clinicalFeature']]
            # Fix: the feature's own name used to be stored in a misspelled
            # variable ("neme"), so the matrix dataset name was passed here.
            clinFeature = ClinicalFeatureNew(featureJSON['path'],
                                             featureJSON['name'])

        cMatrix = ClinicalMatrixNew(obj['path'], obj['name'], False,
                                    clinFeature)

        # merge final and cMatrix.  The former "== None" fallbacks were
        # unreachable because both the final matrix and its JSON are
        # created before this loop.
        if finalClinMatrix != cMatrix:
            print("name= %s" % (cMatrix.getName(),))
            r = finalClinMatrix.addNewCols(cMatrix, validation=True)
            # Compared against True on purpose: only True (or 1) counts as
            # success, as in the original code.
            if r != True:
                print("Fail to merge")
                return False

        # add clinFeature
        if clinFeature:
            clinFeatures.append(clinFeature)

        # merge finalClinMatrixJSON with new json
        if finalClinMatrixJSON != obj:
            finalClinMatrixJSON = cgDataMergeJSON(finalClinMatrixJSON, obj,
                                                  matrixJsonName)

        # final ClinFeature json
        if clinFeature:
            if finalClinFeatureJSON is None:
                # NOTE(review): this aliases the bookDic entry, so the
                # branch below rewrites that entry in place once a second
                # feature shows up - confirm this is intended.
                finalClinFeatureJSON = featureJSON
            else:
                finalClinFeatureJSON["version"] = \
                    datetime.date.today().isoformat()
                finalClinFeatureJSON["type"] = "clinicalFeature"
                finalClinFeatureJSON["name"] = trackName_fix(
                    sampleMapBaseName(sMap) + "_clinicalFeature")

    # final clinicalFeature: concatenate all serialized features into
    # ".tmp" and parse the result as one object.  This replaces a shell
    # call to `cat`, whose exit status was ignored.
    if finalClinFeatureJSON:
        with open(".tmp", 'w') as merged:
            for clinF in clinFeatures:
                with open(".tmptmp", 'w') as part:
                    clinF.store(part)
                with open(".tmptmp", 'r') as part:
                    merged.write(part.read())

        # The handle is closed on every path, including the early return.
        with open(".tmp", 'r') as fin:
            finalClinFeature = ClinicalFeatureNew(
                fin, finalClinFeatureJSON['name'])
            valid = finalClinFeature.isValid()
        if not valid:
            print("final clinFeature file .tmp is invalid")
            return 0

    # SURVIVAL analysis data
    foundE = False
    foundT = False
    if finalClinFeature:
        for feature in finalClinFeature.getFeatures():
            sameAs = finalClinFeature.getFeatureSameAs(feature)

            if sameAs == "_TIME_TO_EVENT":
                # only one feature may be mapped to _TIME_TO_EVENT
                if foundT:
                    print("ERROR there is already _TIME_TO_EVENT")
                    continue
                # the matrix must not already have a _TIME_TO_EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _TIME_TO_EVENT in matrix")
                    continue
                # data check: the parent column has to be float typed
                if finalClinMatrix.isTypeFloat(feature) != True:
                    print("ERROR _TIME_TO_EVENT parent feature values are "
                          "not correct %s"
                          % (finalClinMatrix.getColStates(feature),))
                    continue
                foundT = True
                finalClinMatrix.addNewColfromOld(sameAs, feature)
                finalClinFeature.setFeatureValueType(sameAs, "float")

            elif sameAs == "_EVENT":
                # only one feature may be mapped to _EVENT
                if foundE:
                    print("ERROR there is already _EVENT")
                    continue
                # the matrix must not already have an _EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _EVENT in matrix")
                    continue
                # data check: two states, or three when exactly one of
                # them is the empty string
                states = finalClinMatrix.getColStates(feature)
                good = len(states) in [2, 3]
                if len(states) == 3 and states.count('') != 1:
                    good = False
                if good:
                    foundE = True
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "category")

    # clinical data push down
    for root in sMap.allRoots():
        r = finalClinMatrix.pushToChildren(root, sMap)
        if r != True:
            print("Fail to push down")
            return 0
    print("after clinical push down %s %s"
          % (sampleMap, finalClinMatrix.getROWnum()))

    # collect all genomic data
    keepSamples = getAllGenomicIds(sMap, bookDic)

    # removing rows without genomic data from clinical data matrix due to
    # mysql enum limitation.  Should be removed once the display handles
    # per-dataset value ranges: currently the cgb feature control panel
    # shows the full range of clinical data without checking whether the
    # specific track/dataset has the full value range.
    if onlyGenomicSamples:
        print("genomic sample count %s" % (len(keepSamples),))
        if not finalClinMatrix.onlyKeepRows(keepSamples):
            print("fail to remove extra rows")
        else:
            print("after keeping sample with genomic data %s"
                  % (finalClinMatrix.getROWnum(),))

    # add to the clinical matrix any samples with genomic data but no
    # clinical data
    emptyData = {}
    for col in finalClinMatrix.getCOLs():
        emptyData[col] = ""
    if not finalClinMatrix.addNewRows(keepSamples, emptyData):
        print("fail to add new roows")
    else:
        print("after adding all genomic data %s"
              % (finalClinMatrix.getROWnum(),))

    if finalClinMatrix.validate() != True:
        # Fix: this path used to read an undefined name (oldCMatrix) and
        # raised NameError instead of reporting the failure.
        print("Fail to validate")
        return 0
    # end of collecting all genomic data

    # remove blacklist samples and all their descendants
    badList = badListSelfAndDescendants(sMap, bookDic)
    if badList != []:
        finalClinMatrix.removeRows(badList, True)
        print("after remove badList %s" % (finalClinMatrix.getROWnum(),))

    # identify empty features and remove them from the matrix
    badFeatures = finalClinMatrix.findBadColsNotRemove()
    print("emptye features: %s" % (badFeatures,))
    finalBadFeatures = badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print("remove features %s" % (finalBadFeatures,))

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) is None:
        print("Fail to addColRoot")
        return 0

    # add _INTEGRATION col
    intList = []
    if sampleMap in bookDic and ":integrationId" in bookDic[sampleMap]:
        intName = bookDic[sampleMap][":integrationId"]
        # The id file used to be left open; it is now closed once the list
        # has been read.
        with open(bookDic[intName]["path"], "r") as fin:
            intList = IntegrationId(intName, fin).getList()
    finalClinMatrix.addColIntegration(sMap, intList)

    # final ClinFeature json: no dataset supplied a clinicalFeature, so
    # start from an empty one
    if finalClinFeatureJSON is None:
        finalClinFeatureJSON = {
            "version": datetime.date.today().isoformat(),
            "type": "clinicalFeature",
            "name": trackName_fix(
                sampleMapBaseName(sMap) + "_clinicalFeature"),
            "path": "",
        }
        finalClinFeature = ClinicalFeatureNew(None,
                                              finalClinFeatureJSON["name"])

    # final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)

        # fill in value types, missing features, short and long titles
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        finalClinFeature.fillInTitles()

        # fill in priority visibility; the sample map's bookDic entry may
        # override the default limit of 4 ("vis exceptions")
        VIS_limit = 4
        if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
            VIS_limit = bookDic[sampleMap]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)

        finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT", "category")

        finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    print("%s %s" % (sampleMap, finalClinMatrix.getROWnum()))
    return (finalClinMatrix, finalClinMatrixJSON,
            finalClinFeature, finalClinFeatureJSON)
def flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples):
    """Flatten all clinical matrices of one sample map into a single matrix.

    The function
      1. creates an empty matrix holding every node of the sample map,
      2. merges the columns of each clinicalMatrix dataset into it, in a
         priority order derived from the 'upToDate' / 'outOfDate' keys,
      3. merges the attached clinicalFeature objects through the scratch
         files ".tmp" / ".tmptmp" in the current working directory,
      4. derives the _TIME_TO_EVENT / _EVENT survival columns from features
         whose "sameAs" points at them,
      5. pushes clinical data down from the roots of the sample map,
      6. optionally keeps only samples with genomic data, then adds empty
         rows for genomic samples lacking clinical data and validates,
      7. removes blacklisted samples and the columns reported by
         findBadColsNotRemove(),
      8. adds the _PATIENT and _INTEGRATION columns and completes the
         clinicalFeature metadata.

    Args:
        sMap: sample map object (getName, getNodes, allRoots are used).
        bookDic: mapping from dataset name to its JSON-like metadata dict.
        onlyGenomicSamples: when true, rows without genomic data are dropped.

    Returns:
        On success the tuple (finalClinMatrix, finalClinMatrixJSON,
        finalClinFeature, finalClinFeatureJSON).  On failure a falsy value:
        False when merging a matrix fails, 0 for every other failure.
    """
    # NOTE(review): this file defines flattenEachSampleMap more than once;
    # the later definition shadows the earlier one.  Consider keeping a
    # single copy.
    sampleMap = sMap.getName()
    matrixJsonName = trackName_fix(
        sampleMapBaseName(sMap) + "_clinicalMatrix")

    finalClinMatrix = ClinicalMatrixNew(None, matrixJsonName)
    finalClinMatrixJSON = {
        "name": matrixJsonName,
        "type": "clinicalMatrix",
        "path": "",
        ":sampleMap": sampleMap,
    }

    clinFeatures = []
    finalClinFeatureJSON = None
    finalClinFeature = None

    # add all ids to sMap (the return value was never used)
    checkIdsAllIn(sMap, bookDic)

    # build initial clinical matrix with sampleMap ids, all with empty data
    if not finalClinMatrix.addNewRows(sMap.getNodes(), {}):
        print("fail to add all initial ids from sampleMap")

    # Processing order of the clinicalMatrix datasets:
    #   'upToDate' datasets first (highest version first), then datasets
    #   with neither key (reverse discovery order), then datasets whose
    #   'outOfDate' is yes/Yes/YES (discovery order).
    # A dataset with any other 'outOfDate' value and no 'upToDate' key is
    # skipped, exactly as before.
    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    datasetsOrdered = []
    upToDateSets = {}
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue
        if 'outOfDate' in obj and obj['outOfDate'] in ["yes", "Yes", "YES"]:
            datasetsOrdered.append(name)
        elif 'outOfDate' not in obj and 'upToDate' not in obj:
            datasetsOrdered.insert(0, name)
        if 'upToDate' in obj:
            upToDateSets[obj['upToDate']] = name
    for version in sorted(upToDateSets):
        datasetsOrdered.insert(0, upToDateSets[version])

    for name in datasetsOrdered:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        featureJSON = None
        if ':clinicalFeature' in obj:
            featureJSON = bookDic[obj[':clinicalFeature']]
            # Fix: the feature's own name used to be stored in a misspelled
            # variable ("neme"), so the matrix dataset name was passed here.
            clinFeature = ClinicalFeatureNew(featureJSON['path'],
                                             featureJSON['name'])

        cMatrix = ClinicalMatrixNew(obj['path'], obj['name'], False,
                                    clinFeature)

        # merge final and cMatrix.  The former "== None" fallbacks were
        # unreachable because both the final matrix and its JSON are
        # created before this loop.
        if finalClinMatrix != cMatrix:
            print("name= %s" % (cMatrix.getName(),))
            r = finalClinMatrix.addNewCols(cMatrix, validation=True)
            # Compared against True on purpose: only True (or 1) counts as
            # success, as in the original code.
            if r != True:
                print("Fail to merge")
                return False

        # add clinFeature
        if clinFeature:
            clinFeatures.append(clinFeature)

        # merge finalClinMatrixJSON with new json
        if finalClinMatrixJSON != obj:
            finalClinMatrixJSON = cgDataMergeJSON(finalClinMatrixJSON, obj,
                                                  matrixJsonName)

        # final ClinFeature json
        if clinFeature:
            if finalClinFeatureJSON is None:
                # NOTE(review): this aliases the bookDic entry, so the
                # branch below rewrites that entry in place once a second
                # feature shows up - confirm this is intended.
                finalClinFeatureJSON = featureJSON
            else:
                finalClinFeatureJSON["version"] = \
                    datetime.date.today().isoformat()
                finalClinFeatureJSON["type"] = "clinicalFeature"
                finalClinFeatureJSON["name"] = trackName_fix(
                    sampleMapBaseName(sMap) + "_clinicalFeature")

    # final clinicalFeature: concatenate all serialized features into
    # ".tmp" and parse the result as one object.  This replaces a shell
    # call to `cat`, whose exit status was ignored.
    if finalClinFeatureJSON:
        with open(".tmp", 'w') as merged:
            for clinF in clinFeatures:
                with open(".tmptmp", 'w') as part:
                    clinF.store(part)
                with open(".tmptmp", 'r') as part:
                    merged.write(part.read())

        # The handle is closed on every path, including the early return.
        with open(".tmp", 'r') as fin:
            finalClinFeature = ClinicalFeatureNew(
                fin, finalClinFeatureJSON['name'])
            valid = finalClinFeature.isValid()
        if not valid:
            print("final clinFeature file .tmp is invalid")
            return 0

    # SURVIVAL analysis data
    foundE = False
    foundT = False
    if finalClinFeature:
        for feature in finalClinFeature.getFeatures():
            sameAs = finalClinFeature.getFeatureSameAs(feature)

            if sameAs == "_TIME_TO_EVENT":
                # only one feature may be mapped to _TIME_TO_EVENT
                if foundT:
                    print("ERROR there is already _TIME_TO_EVENT")
                    continue
                # the matrix must not already have a _TIME_TO_EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _TIME_TO_EVENT in matrix")
                    continue
                # data check: the parent column has to be float typed
                if finalClinMatrix.isTypeFloat(feature) != True:
                    print("ERROR _TIME_TO_EVENT parent feature values are "
                          "not correct %s"
                          % (finalClinMatrix.getColStates(feature),))
                    continue
                foundT = True
                finalClinMatrix.addNewColfromOld(sameAs, feature)
                finalClinFeature.setFeatureValueType(sameAs, "float")

            elif sameAs == "_EVENT":
                # only one feature may be mapped to _EVENT
                if foundE:
                    print("ERROR there is already _EVENT")
                    continue
                # the matrix must not already have an _EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _EVENT in matrix")
                    continue
                # data check: two states, or three when exactly one of
                # them is the empty string
                states = finalClinMatrix.getColStates(feature)
                good = len(states) in [2, 3]
                if len(states) == 3 and states.count('') != 1:
                    good = False
                if good:
                    foundE = True
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "category")

    # clinical data push down
    for root in sMap.allRoots():
        r = finalClinMatrix.pushToChildren(root, sMap)
        if r != True:
            print("Fail to push down")
            return 0
    print("after clinical push down %s %s"
          % (sampleMap, finalClinMatrix.getROWnum()))

    # collect all genomic data
    keepSamples = getAllGenomicIds(sMap, bookDic)

    # removing rows without genomic data from clinical data matrix due to
    # mysql enum limitation.  Should be removed once the display handles
    # per-dataset value ranges: currently the cgb feature control panel
    # shows the full range of clinical data without checking whether the
    # specific track/dataset has the full value range.
    if onlyGenomicSamples:
        print("genomic sample count %s" % (len(keepSamples),))
        if not finalClinMatrix.onlyKeepRows(keepSamples):
            print("fail to remove extra rows")
        else:
            print("after keeping sample with genomic data %s"
                  % (finalClinMatrix.getROWnum(),))

    # add to the clinical matrix any samples with genomic data but no
    # clinical data
    emptyData = {}
    for col in finalClinMatrix.getCOLs():
        emptyData[col] = ""
    if not finalClinMatrix.addNewRows(keepSamples, emptyData):
        print("fail to add new roows")
    else:
        print("after adding all genomic data %s"
              % (finalClinMatrix.getROWnum(),))

    if finalClinMatrix.validate() != True:
        # Fix: this path used to read an undefined name (oldCMatrix) and
        # raised NameError instead of reporting the failure.
        print("Fail to validate")
        return 0
    # end of collecting all genomic data

    # remove blacklist samples and all their descendants
    badList = badListSelfAndDescendants(sMap, bookDic)
    if badList != []:
        finalClinMatrix.removeRows(badList, True)
        print("after remove badList %s" % (finalClinMatrix.getROWnum(),))

    # identify empty features and remove them from the matrix
    badFeatures = finalClinMatrix.findBadColsNotRemove()
    print("emptye features: %s" % (badFeatures,))
    finalBadFeatures = badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print("remove features %s" % (finalBadFeatures,))

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) is None:
        print("Fail to addColRoot")
        return 0

    # add _INTEGRATION col
    intList = []
    if sampleMap in bookDic and ":integrationId" in bookDic[sampleMap]:
        intName = bookDic[sampleMap][":integrationId"]
        # The id file used to be left open; it is now closed once the list
        # has been read.
        with open(bookDic[intName]["path"], "r") as fin:
            intList = IntegrationId(intName, fin).getList()
    finalClinMatrix.addColIntegration(sMap, intList)

    # final ClinFeature json: no dataset supplied a clinicalFeature, so
    # start from an empty one
    if finalClinFeatureJSON is None:
        finalClinFeatureJSON = {
            "version": datetime.date.today().isoformat(),
            "type": "clinicalFeature",
            "name": trackName_fix(
                sampleMapBaseName(sMap) + "_clinicalFeature"),
            "path": "",
        }
        finalClinFeature = ClinicalFeatureNew(None,
                                              finalClinFeatureJSON["name"])

    # final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)

        # fill in value types, missing features, short and long titles
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        finalClinFeature.fillInTitles()

        # fill in priority visibility; the sample map's bookDic entry may
        # override the default limit of 4 ("vis exceptions")
        VIS_limit = 4
        if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
            VIS_limit = bookDic[sampleMap]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)

        finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT", "category")

        finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    print("%s %s" % (sampleMap, finalClinMatrix.getROWnum()))
    return (finalClinMatrix, finalClinMatrixJSON,
            finalClinFeature, finalClinFeatureJSON)