def process_steps(self, beddf, newPath, parentId, databaseSynId, seq_assay_id):
    """Process a bed file, sync the bed database, and write the result out.

    Args:
        beddf: Bed dataframe
        newPath: Path to new bed file
        parentId: Synapse id to store gene panel file
        databaseSynId: Synapse id of bed database
        seq_assay_id: GENIE seq assay id

    Returns:
        string: Path to new bed file
    """
    processed_beddf = self._process(beddf, seq_assay_id, newPath, parentId)
    # Replace this assay's rows in the bed database with the fresh data
    process_functions.updateData(
        self.syn,
        databaseSynId,
        processed_beddf,
        seq_assay_id,
        filterByColumn="SEQ_ASSAY_ID",
        toDelete=True,
    )
    processed_beddf.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, filePath, databaseToSynIdMappingDf):
    """Store the file in its Synapse folder and upsert its metadata table.

    Args:
        filePath: Path to the file being processed
        databaseToSynIdMappingDf: Mapping dataframe with 'Database' and
            'Id' columns linking database names to Synapse ids

    Returns:
        string: the input file path, unchanged
    """
    logger.debug("Performing process_steps for {}".format(self._fileType))
    # Use positional .iloc[0]: label-based [0] on the filtered Series
    # raises KeyError whenever the matching row is not the mapping
    # table's first row (its original index label is kept after masking)
    folder_id = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == self._fileType].iloc[0]
    table_id = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] ==
        f"{self._fileType}_table"].iloc[0]
    logger.debug(f"Storing file at {folder_id}")
    # Annotate the stored entity so it can be found by center/file type
    f = self.syn.store(
        synapseclient.File(filePath,
                           parent=folder_id,
                           annotations=dict(center=self.center,
                                            fileType=self._fileType)),
        forceVersion=False)
    # Add information about assay to the table
    data = self._get_dataframe(filePath)
    data['entity_id'] = f.id
    process_functions.updateData(syn=self.syn,
                                 databaseSynId=table_id,
                                 newData=data,
                                 filterBy=self.center,
                                 filterByColumn="center",
                                 col=self._required_columns,
                                 toDelete=True)
    return filePath
def process_steps(self, patientCountsDf, newPath, oncotreeLink, databaseSynId):
    """Process patient counts and push them to the counts database.

    Args:
        patientCountsDf: Patient counts dataframe
        newPath: Path to write the processed file
        oncotreeLink: Link to oncotree
        databaseSynId: Synapse id of the patient counts database

    Returns:
        Path to the processed patient counts file
    """
    processed_df = self._process(patientCountsDf, oncotreeLink)
    # Update this center's rows in the patient counts database
    process_functions.updateData(
        self.syn, databaseSynId, processed_df, self.center
    )
    processed_df.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, mutationInCis, newPath, databaseSynId):
    """Upload mutations-in-cis data to its database and write it to disk.

    Args:
        mutationInCis: Mutations-in-cis dataframe
        newPath: Path to write the processed file
        databaseSynId: Synapse id of the mutations-in-cis database

    Returns:
        Path to the processed file
    """
    # Update this center's rows (note: filter column is 'Center' here)
    process_functions.updateData(
        self.syn,
        databaseSynId,
        mutationInCis,
        self.center,
        filterByColumn="Center",
    )
    mutationInCis.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, fusion, databaseSynId, newPath, test):
    """Process fusion data, update the fusion database, and write it out.

    Args:
        fusion: Fusion dataframe
        databaseSynId: Synapse id of the fusion database
        newPath: Path to write the processed file
        test: Flag forwarded to ``_process``

    Returns:
        Path to the processed fusion file
    """
    processed_fusion = self._process(fusion, test)
    # Replace this center's rows in the fusion database
    process_functions.updateData(
        self.syn, databaseSynId, processed_fusion, self.center, toDelete=True
    )
    processed_fusion.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, seg, newPath, databaseSynId):
    """Process seg data, update the seg database, and write it out.

    Args:
        seg: Seg dataframe
        newPath: Path to write the processed file
        databaseSynId: Synapse id of the seg database

    Returns:
        Path to the processed seg file
    """
    processed_seg = self._process(seg)
    # Replace this center's rows in the seg database
    process_functions.updateData(
        self.syn, databaseSynId, processed_seg, self.center, toDelete=True
    )
    processed_seg.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, filePath, **kwargs):
    """Read, process and upload a clinical SP file.

    Args:
        filePath: Path to clinical SP file
        **kwargs: Must contain 'newPath' and 'databaseSynId'

    Returns:
        Path to the processed clinical SP file
    """
    logger.info('PROCESSING %s' % filePath)
    new_path = kwargs['newPath']
    database_synid = kwargs['databaseSynId']
    # Comment lines (#) in the tsv are skipped
    clinical_sp = pd.read_csv(filePath, sep="\t", comment="#")
    clinical_sp = self._process(clinical_sp)
    process_functions.updateData(
        self.syn, database_synid, clinical_sp, self.center
    )
    clinical_sp.to_csv(new_path, sep="\t", index=False)
    return new_path
def process_steps(self, filePath, **kwargs):
    """Read, process and upload a vital status file.

    Args:
        filePath: Path to vital status file
        **kwargs: Must contain 'databaseSynId' and 'newPath'

    Returns:
        Path to the processed vital status file
    """
    logger.info('PROCESSING %s' % filePath)
    database_synid = kwargs['databaseSynId']
    new_path = kwargs['newPath']
    # Comment lines (#) in the tsv are skipped
    vital_status = pd.read_csv(filePath, sep="\t", comment="#")
    vital_status = self._process(vital_status)
    process_functions.updateData(
        self.syn, database_synid, vital_status, self.center
    )
    vital_status.to_csv(new_path, sep="\t", index=False)
    return new_path
def storeProcessedMaf(self, filePath, mafSynId, centerMafSynId,
                      isNarrow=False):
    """Store SP maf to database.

    Args:
        filePath: Path to maf file
        mafSynId: Synapse id of the maf database table
        centerMafSynId: Not referenced by this body; kept for
            interface compatibility
        isNarrow: Not referenced by this body; kept for
            interface compatibility

    Returns:
        Path to the maf file
    """
    logger.info('STORING %s' % filePath)
    maf_df = pd.read_csv(filePath, sep="\t")
    # Replace this center's rows ('Center' column) in the maf table
    process_functions.updateData(self.syn, mafSynId, maf_df, self.center,
                                 filterByColumn="Center", toDelete=True)
    return filePath
def process_steps(self, deleteSamples, fileSynId, databaseSynId, newPath):
    """Process a sample-retraction dataframe and update its database.

    Args:
        deleteSamples: Dataframe of samples to retract
        fileSynId: Synapse id of the retraction file (for its timestamp)
        databaseSynId: Synapse id of the retraction database
        newPath: Path returned to the caller (file is not rewritten)

    Returns:
        newPath, unchanged
    """
    entity = self.syn.get(fileSynId, downloadFile=False)
    # Strip the fractional seconds from the modifiedOn timestamp
    modified_on = entity.modifiedOn.split(".")[0]
    processed_df = self._process(deleteSamples, modified_on)
    process_functions.updateData(self.syn, databaseSynId, processed_df,
                                 self.center, filterByColumn="center",
                                 toDelete=True)
    return newPath
def process_steps(self, filePath, **kwargs):
    """Read and upload a mutations-in-cis file.

    Args:
        filePath: Path to mutations-in-cis file
        **kwargs: Must contain 'newPath' and 'databaseSynId'

    Returns:
        Path to the processed file
    """
    logger.info('PROCESSING %s' % filePath)
    new_path = kwargs['newPath']
    database_synid = kwargs['databaseSynId']
    # Default csv separator; comment lines (#) are skipped
    mutation_in_cis = pd.read_csv(filePath, comment="#")
    # Update this center's rows (note: filter column is 'Center' here)
    process_functions.updateData(self.syn, database_synid, mutation_in_cis,
                                 self.center, filterByColumn="Center")
    mutation_in_cis.to_csv(new_path, sep="\t", index=False)
    return new_path
def process_steps(self, filePath, **kwargs):
    """Read, process and upload a seg (or CBS) file.

    Args:
        filePath: Path to seg file
        **kwargs: Contains 'databaseSynId', and either 'path' (CBS
            files, rewritten in place) or 'newPath'

    Returns:
        Path to the processed seg file
    """
    # CBS files pass a 'path' kwarg and are written back in place
    if kwargs.get("path") is not None:
        filePath = kwargs['path']
        new_path = filePath
    else:
        new_path = kwargs['newPath']
    logger.info('PROCESSING %s' % filePath)
    database_synid = kwargs['databaseSynId']
    seg_df = pd.read_csv(filePath, sep="\t")
    seg_df = self._process(seg_df)
    # Replace this center's rows in the seg database
    process_functions.updateData(self.syn, database_synid, seg_df,
                                 self.center, toDelete=True)
    seg_df.to_csv(new_path, sep="\t", index=False)
    return new_path
def process_steps(self, filePath, **kwargs):
    """Read, process and upload a fusion file.

    Args:
        filePath: Path to fusion file
        **kwargs: Must contain 'databaseSynId', 'newPath' and 'test'

    Returns:
        Path to the processed fusion file
    """
    logger.info('PROCESSING %s' % filePath)
    database_synid = kwargs['databaseSynId']
    new_path = kwargs['newPath']
    test_flag = kwargs['test']
    # Comment lines (#) in the tsv are skipped
    fusion_df = pd.read_csv(filePath, sep="\t", comment="#")
    fusion_df = self._process(fusion_df, test_flag)
    # Replace this center's rows in the fusion database
    process_functions.updateData(self.syn, database_synid, fusion_df,
                                 self.center, toDelete=True)
    fusion_df.to_csv(new_path, sep="\t", index=False)
    return new_path
def process_steps(self, filePath, **kwargs):
    """Read a sample-retraction file and update its database.

    Args:
        filePath: Path to file listing samples to delete (no header)
        **kwargs: Must contain 'fileSynId', 'databaseSynId' and 'newPath'

    Returns:
        The 'newPath' kwarg (file is not rewritten)
    """
    logger.info('PROCESSING %s' % filePath)
    file_synid = kwargs['fileSynId']
    database_synid = kwargs['databaseSynId']
    new_path = kwargs['newPath']
    entity = self.syn.get(file_synid, downloadFile=False)
    samples_df = pd.read_csv(filePath, header=None)
    # Strip the fractional seconds from the modifiedOn timestamp
    samples_df = self._process(samples_df, entity.modifiedOn.split(".")[0])
    process_functions.updateData(self.syn, database_synid, samples_df,
                                 self.center, filterByColumn="center",
                                 toDelete=True)
    return new_path
def process_steps(self, filePath, **kwargs):
    """Read, process and upload a bed file.

    Args:
        filePath: Path to bed file (basename determines the SEQ_ASSAY_ID)
        **kwargs: Must contain 'newPath', 'parentId' and 'databaseSynId'

    Returns:
        Path to the processed bed file
    """
    new_path = kwargs['newPath']
    parent_id = kwargs['parentId']
    database_synid = kwargs['databaseSynId']
    logger.info('PROCESSING %s' % filePath)
    # Standardize all SEQ_ASSAY_IDs: filename without '.bed', uppercased
    seq_assay_id = os.path.basename(filePath).replace(".bed", "").upper()
    raw_bed = pd.read_csv(filePath, sep="\t", header=None)
    processed_bed = self._process(raw_bed, seq_assay_id, new_path, parent_id)
    # Replace this assay's rows in the bed database
    process_functions.updateData(self.syn, database_synid, processed_bed,
                                 seq_assay_id,
                                 filterByColumn="SEQ_ASSAY_ID",
                                 toDelete=True)
    processed_bed.to_csv(new_path, sep="\t", index=False)
    return new_path
def process_steps(self, filePath, newPath, databaseSynId):
    """Read an assay information file and update the assay database.

    Args:
        filePath: Path to assay information file
        newPath: Not referenced by this body; kept for interface
            compatibility
        databaseSynId: Synapse id of the assay information database

    Returns:
        Path to the original assay information file
    """
    logger.info('PROCESSING %s' % filePath)
    # _get_dataframe expects a list of paths
    assay_infodf = self._get_dataframe([filePath])
    processed_assay_infodf = self._process(assay_infodf)
    keep_cols = ['SEQ_ASSAY_ID', 'is_paired_end', 'library_selection',
                 'library_strategy', 'platform', 'read_length',
                 'instrument_model', 'gene_padding', 'number_of_genes',
                 'variant_classifications', 'CENTER']
    # Replace this center's rows, restricted to the columns above
    process_functions.updateData(self.syn, databaseSynId,
                                 processed_assay_infodf, self.center,
                                 col=keep_cols, filterByColumn="CENTER",
                                 toDelete=True)
    return filePath
def process_steps(self, assay_info_df, newPath, databaseSynId):
    """Process an assay information dataframe and update the database.

    Args:
        assay_info_df: Assay information dataframe
        newPath: Path to write the processed file
        databaseSynId: Synapse id of the assay information database

    Returns:
        Path to the processed assay information file
    """
    processed_assay_infodf = self._process(assay_info_df)
    keep_cols = ['SEQ_ASSAY_ID', 'is_paired_end', 'library_selection',
                 'library_strategy', 'platform', 'read_length',
                 'instrument_model', 'gene_padding', 'number_of_genes',
                 'variant_classifications', 'CENTER']
    # Replace this center's rows, restricted to the columns above
    process_functions.updateData(self.syn, databaseSynId,
                                 processed_assay_infodf, self.center,
                                 col=keep_cols, filterByColumn="CENTER",
                                 toDelete=True)
    processed_assay_infodf.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                  parentId):
    """Read the file, tag it with the center, and upsert its table.

    Args:
        filePath: Path to the file being processed
        databaseToSynIdMappingDf: Mapping dataframe with 'Database' and
            'Id' columns linking database names to Synapse ids
        newPath: Path to write the processed file
        parentId: Not referenced by this body; kept for interface
            compatibility

    Returns:
        Path to the processed file
    """
    data = self._get_dataframe(filePath)
    # Use positional .iloc[0]: label-based [0] on the filtered Series
    # raises KeyError whenever the matching row is not the mapping
    # table's first row (its original index label is kept after masking)
    table_id = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == self._fileType].iloc[0]
    data['center'] = self.center
    logger.debug(f"Updating {self._fileType} data in table {table_id}.")
    process_functions.updateData(syn=self.syn,
                                 databaseSynId=table_id,
                                 newData=data,
                                 filterBy=self.center,
                                 filterByColumn="center",
                                 col=None,
                                 toDelete=True)
    data.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, assay_info_df, newPath, databaseSynId):
    """
    Process assay information and update the assay information database

    Args:
        assay_info_df: Assay information dataframe
        newPath: Path to processed assay information
        databaseSynId: assay information database synapse id

    Returns:
        path to assay information dataframe
    """
    processed_df = self._process(assay_info_df)
    # Replace this center's rows in the assay information database
    process_functions.updateData(
        self.syn,
        databaseSynId,
        processed_df,
        self.center,
        filterByColumn="CENTER",
        toDelete=True,
    )
    processed_df.to_csv(newPath, sep="\t", index=False)
    return newPath
def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                  parentId, oncotreeLink):
    """Process a clinical file and upsert the patient/sample databases.

    Whether the file is treated as patient data, sample data, or both is
    decided from the filename ('patient' / 'sample' substring); samples
    with oncotree codes not found in the oncotree mapping are dropped.

    Args:
        filePath: Path to clinical file
        databaseToSynIdMappingDf: Mapping dataframe with 'Database' and
            'Id' columns linking database names to Synapse ids
        newPath: Path to write the processed clinical file
        parentId: Synapse id passed to uploadMissingData
        oncotreeLink: Link to oncotree used for code validation

    Returns:
        Path to the processed clinical file
    """
    # NOTE(review): label-based [0] assumes the matching mapping row has
    # index label 0 — confirm the mapping df always has a fresh index
    patientSynId = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == "patient"][0]
    sampleSynId = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == "sample"][0]
    clinicalDf = pd.read_csv(filePath, sep="\t", comment="#")
    patient = False
    sample = False
    # These synapse ids for the clinical tier release scope is
    # hardcoded because it never changes
    patientColsTable = self.syn.tableQuery(
        'select fieldName from syn8545211 where patient is '
        'True and inClinicalDb is True')
    patientCols = patientColsTable.asDataFrame()['fieldName'].tolist()
    sampleColsTable = self.syn.tableQuery(
        'select fieldName from syn8545211 where sample is True '
        'and inClinicalDb is True')
    sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()
    # The filename decides which template(s) the file is validated
    # against; a combined file gets both patient and sample columns
    if "patient" in filePath.lower():
        clinicalTemplate = pd.DataFrame(columns=patientCols)
        patient = True
    elif "sample" in filePath.lower():
        clinicalTemplate = pd.DataFrame(columns=sampleCols)
        sample = True
    else:
        clinicalTemplate = pd.DataFrame(columns=set(patientCols +
                                                    sampleCols))
        sample = True
        patient = True
    newClinicalDf = self._process(clinicalDf, clinicalTemplate)
    if patient:
        # One row per patient; first occurrence wins
        patientClinical = newClinicalDf[patientCols].drop_duplicates(
            "PATIENT_ID")
        self.uploadMissingData(patientClinical, "PATIENT_ID",
                               patientSynId, parentId)
        # retractedPatientSynId)
        process_functions.updateData(self.syn, patientSynId,
                                     patientClinical, self.center,
                                     col=patientCols, toDelete=True)
    if sample:
        if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
            logger.error("There are duplicated samples, "
                         "and the duplicates are removed")
        sampleClinical = newClinicalDf[sampleCols].drop_duplicates(
            "SAMPLE_ID")
        # Exclude all clinical samples with wrong oncotree codes
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        # Add in unknown key for oncotree code
        oncotree_mapping_dict['UNKNOWN'] = {}
        oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()
        # Make oncotree codes uppercase (SpCC/SPCC)
        sampleClinical['ONCOTREE_CODE'] = sampleClinical[
            'ONCOTREE_CODE'].astype(str).str.upper()
        sampleClinical = sampleClinical[
            sampleClinical['ONCOTREE_CODE'].isin(
                oncotree_mapping['ONCOTREE_CODE'])]
        self.uploadMissingData(sampleClinical, "SAMPLE_ID", sampleSynId,
                               parentId)
        # ,retractedSampleSynId)
        process_functions.updateData(self.syn, sampleSynId,
                                     sampleClinical, self.center,
                                     col=sampleCols, toDelete=True)
    newClinicalDf.to_csv(newPath, sep="\t", index=False)
    return (newPath)
def process_steps(self, clinicalSPDf, newPath, databaseSynId):
    """Process clinical SP data, upload it, and write it to disk.

    Args:
        clinicalSPDf: Clinical SP dataframe
        newPath: Path to write the processed file
        databaseSynId: Synapse id of the clinical SP database

    Returns:
        Path to the processed clinical SP file
    """
    processed_df = self._process(clinicalSPDf)
    # Update this center's rows in the clinical SP database
    process_functions.updateData(
        self.syn, databaseSynId, processed_df, self.center
    )
    processed_df.to_csv(newPath, sep="\t", index=False)
    return newPath
def run(self):
    """
    This function runs the redcap export to export all files

    End-to-end sponsored-project export: rebuilds the local output
    directory, maps REDCap fields to cBioPortal headers, writes the
    clinical/timeline/maf/CNA/fusion/seg/gene-panel files, stages them
    in Synapse (skipped when self.staging is set), generates case
    lists, and finally runs the cBioPortal validator.
    """
    # Start from a clean output directory; keep case_lists subfolder
    if not os.path.exists(self._SPONSORED_PROJECT):
        os.mkdir(self._SPONSORED_PROJECT)
    else:
        filelists = os.listdir(self._SPONSORED_PROJECT)
        for file in filelists:
            if file != "case_lists":
                os.remove(os.path.join(self._SPONSORED_PROJECT, file))
    # Create full mapping table to get the values of the data model
    mapping = self.syn.tableQuery(
        "select genie_field_name,instrument from {} where "
        "{} is true and phi is false".format(
            self._DATA_ELEMENT_SYN_ID, self._SPONSORED_PROJECT.lower()))
    mappingDf = mapping.asDataFrame()
    newMappingDf = pd.DataFrame()
    for field, instrument in zip(mappingDf.genie_field_name,
                                 mappingDf.instrument):
        # Do not want to append the # values
        if "#" in field:
            # find fields with # and replace with however many times
            # it should loop through
            newfields = [
                field.replace("#", str(count))
                for count in list(range(1, self._NUM_COUNTS + 1))
            ]
            newDataFrame = pd.DataFrame({
                "genie_field_name": newfields,
                "instrument": [instrument] * len(newfields)
            })
        else:
            newDataFrame = pd.DataFrame(
                {
                    "genie_field_name": field,
                    "instrument": instrument
                },
                index=[0])
        # NOTE(review): DataFrame.append is deprecated (removed in
        # pandas 2.0) — verify the pinned pandas version
        newMappingDf = newMappingDf.append(newDataFrame, sort=False)
    # If there are ever missing fields, they must be added in
    # or else the script will fail
    # missingFields= ['her_status_sample','sample_seq_yn']
    # missingFieldType = ['sample_information']*2
    # newMappingDf = newMappingDf.append(pd.DataFrame({
    #     "genie_field_name": missingFields,
    #     "instrument": missingFieldType}))
    # Extract patient/sample/treatment columns
    patientCols = extractColumns(newMappingDf, [
        "patient_information", "treatment_information_general",
        "diagnosis_information"
    ], [
        'errors_patient_info_yn', 'patient_info_errors',
        'errors_dx_info_yn', 'dx_info_errors', 'so_yn'
    ])
    sampleCols = extractColumns(newMappingDf, ["sample_information"], [
        "test_sample", "fgfr4_variant", "errors_sample_info_yn",
        "sample_info_errors"
    ])
    treatmentCols = extractColumns(newMappingDf,
                                   ["treatment_information_detailed"], [])
    # Labelled export carries values; unlabelled export carries the
    # machine-readable header row, which is grafted onto the labelled df
    unlabelledEnt = self.syn.get(self._UNLABELLED_SYN_ID)
    labelledEnt = self.syn.get(self._LABELLED_SYN_ID)
    unlabeledDf = pd.read_csv(unlabelledEnt.path)
    labeledDf = pd.read_csv(labelledEnt.path)
    # Add on CENTER column for all three file formats
    patientCols.append("redcap_data_access_group")
    sampleCols.append("redcap_data_access_group")
    treatmentCols.append("redcap_data_access_group")
    labeledDf.columns = unlabeledDf.columns
    # Normalize center codes ("hop" -> JHU, everything uppercased)
    labeledDf['redcap_data_access_group'][
        labeledDf['redcap_data_access_group'] == "hop"] = "JHU"
    labeledDf['redcap_data_access_group'] = \
        labeledDf['redcap_data_access_group'].apply(lambda x: x.upper())
    # Patient rows are the non-repeating instrument rows; sample rows
    # are the "Sample Information" repeat rows
    patientDf = labeledDf[patientCols]
    patientRows = labeledDf.redcap_repeat_instrument.isnull()
    patientDf = patientDf[patientRows]
    sampleDf = labeledDf[sampleCols]
    sampleRows = labeledDf.redcap_repeat_instrument == "Sample Information"
    sampleDf = sampleDf[sampleRows]
    # Red cap header to cbio header Table mapping
    redCapToCbioMapping = self.syn.tableQuery(
        "SELECT * FROM %s" % self._REDCAP_TO_CBIOMAPPING_SYNID)
    redCapToCbioMappingDf = redCapToCbioMapping.asDataFrame()
    # Get all the samples/patients that should be uploaded to SP projects
    # Hard coded clinical database
    genie_clinicalDb = self.syn.tableQuery(
        'select SAMPLE_ID, PATIENT_ID, ONCOTREE_CODE, SEQ_ASSAY_ID '
        'from syn7517674')
    genie_clinicalDf = genie_clinicalDb.asDataFrame()
    # Hard coded clinicalSP database
    # nonGenie_clinicalDb = self.syn.tableQuery(
    #     'SELECT * FROM syn11492579')
    # nonGenie_clinicalDf = nonGenie_clinicalDb.asDataFrame()
    # genie_clinicalDf = genie_clinicalDf.append(nonGenie_clinicalDf)
    # Only patients and samples that exist in the
    # sponsored project uploads are going to be pulled into the SP project
    finalPatientDf = self.configureClinicalDf(patientDf,
                                              redCapToCbioMappingDf)
    # Day-based columns (suffix INT, plus OS_MONTHS) get converted to
    # months; keep a pre-conversion copy for the timeline/specimen files
    patient_date_col = [
        col for col in finalPatientDf.columns if col.endswith("INT")
    ]
    patient_date_col.append("OS_MONTHS")
    final_patientdf_datesdays = finalPatientDf.copy()
    finalPatientDf[patient_date_col] = \
        finalPatientDf[patient_date_col].applymap(change_days_to_months)
    subsetPatientDf = finalPatientDf[finalPatientDf['PATIENT_ID'].isin(
        genie_clinicalDf['PATIENT_ID'])]
    del subsetPatientDf['SP']
    # Remove CENTER and ONCOTREE_CODE from patient because you
    # cannot have these columns in both sample and patient Df,
    # it will fail validation for cbioportal
    del subsetPatientDf['CENTER']
    del subsetPatientDf['ONCOTREE_CODE']
    patientPath = self.writeClinicalFile(subsetPatientDf,
                                         redCapToCbioMappingDf, "patient")
    finalSampleDf = self.configureClinicalDf(sampleDf,
                                             redCapToCbioMappingDf)
    sample_date_cols = ['SAMPLE_DATE_INT', 'AGE_AT_SEQ_REPORT']
    final_sampledf_datesdays = finalSampleDf.copy()
    finalSampleDf[sample_date_cols] = \
        finalSampleDf[sample_date_cols].applymap(change_days_to_months)
    # Fill in ONCOTREE_CODE
    finalSampleDf['ONCOTREE_CODE'] = [
        genie_clinicalDf['ONCOTREE_CODE'][genie_clinicalDf['SAMPLE_ID'] ==
                                          sample].values[0]
        if sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else
        float('nan') for sample in finalSampleDf['SAMPLE_ID']
    ]
    # Fill in SEQ_ASSAY_ID
    finalSampleDf['SEQ_ASSAY_ID'] = [
        genie_clinicalDf['SEQ_ASSAY_ID'][genie_clinicalDf['SAMPLE_ID'] ==
                                         sample].values[0]
        if sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else
        float('nan') for sample in finalSampleDf['SAMPLE_ID']
    ]
    subsetSampleDf = finalSampleDf[finalSampleDf['SAMPLE_ID'].isin(
        genie_clinicalDf['SAMPLE_ID'])]
    del subsetSampleDf['SP']
    samplePath = self.writeClinicalFile(subsetSampleDf,
                                        redCapToCbioMappingDf, "sample")
    # Remove oncotree code here, because no longer need it
    mergedClinicalDf = subsetSampleDf.merge(subsetPatientDf,
                                            on="PATIENT_ID",
                                            how="outer")
    # Report samples that are missing from the main GENIE clinical db
    if mergedClinicalDf.get("SAMPLE_ID") is not None:
        print("Samples not in GENIE clinical databases (SP and normal)")
        notFoundSamples = mergedClinicalDf[
            'SAMPLE_ID'][~mergedClinicalDf['SAMPLE_ID'].isin(
                genie_clinicalDf['SAMPLE_ID'])]
        if not notFoundSamples.empty:
            print(notFoundSamples[~notFoundSamples.isnull()])
            notFoundSamples.to_csv("notfoundsamples.csv", header=False)
            if not self.staging:
                self.syn.store(
                    synapseclient.File(
                        "notfoundsamples.csv",
                        parent=self._SP_REDCAP_EXPORTS_SYNID))
    # Hard coded most up to date oncotree version
    oncotreeLink = self.syn.get("syn13890902").externalURL
    # Use the old oncotree link for now
    # (the line above is intentionally overwritten)
    oncotreeLink = 'http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21'
    oncotreeDict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    # Annotate merged clinical rows with oncotree-derived cancer types
    mergedClinicalDf['CANCER_TYPE'] = [
        oncotreeDict[code.upper()].get("CANCER_TYPE", float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['CANCER_TYPE_DETAILED'] = [
        oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",
                                       float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['ONCOTREE_PRIMARY_NODE'] = [
        oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",
                                       float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['ONCOTREE_SECONDARY_NODE'] = [
        oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",
                                       float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf.to_csv("%s/data_clinical.txt" %
                            self._SPONSORED_PROJECT,
                            index=False, sep="\t")
    # Stage clinical tables/files in Synapse (skipped when staging)
    if not self.staging:
        process_functions.updateData(self.syn, "syn17010637",
                                     finalPatientDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP",
                                     toDelete=True)
        patientFileEnt = File(patientPath, parent=self._SP_SYN_ID)
        patientEnt = self.syn.store(patientFileEnt,
                                    used=labelledEnt.id,
                                    executed=self._GITHUB_REPO)
        process_functions.updateData(self.syn, "syn17010638",
                                     finalSampleDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP",
                                     toDelete=True)
        sampleFileEnt = File(samplePath, parent=self._SP_SYN_ID)
        sampleEnt = self.syn.store(sampleFileEnt,
                                   used=labelledEnt.id,
                                   executed=self._GITHUB_REPO)
    # Treatment rows feed the timeline file
    treatmentDf = labeledDf[treatmentCols]
    treatmentRows = labeledDf.redcap_repeat_instrument == \
        "Treatment Information Detailed"
    treatmentDf = treatmentDf[treatmentRows]
    finalTimelineDf = self.makeTimeLineDf(treatmentDf,
                                          final_patientdf_datesdays)
    finalTimelineDf.PATIENT_ID = finalTimelineDf.apply(
        lambda x: process_functions.checkGenieId(x['PATIENT_ID'],
                                                 x['CENTER']),
        axis=1)
    if not self.staging:
        process_functions.updateData(self.syn, "syn17011214",
                                     finalTimelineDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP",
                                     toDelete=True)
    # METASTATIC DIAGNOSIS (append to timeline)
    metaDiagnosisDf = self.createMetaDiagnosisDf(finalTimelineDf)
    # Maintain ordering of timeline
    ordering = finalTimelineDf.columns.tolist()
    # Two extra timeline columns from specimen file
    ordering.extend(["SAMPLE_ID", "SAMPLE_NOTES"])
    finalTimelineDf = finalTimelineDf.append(metaDiagnosisDf, sort=False)
    # Create specimen file to append to timeline file too
    specimenDf = self.createSpecimenDf(final_sampledf_datesdays,
                                       final_patientdf_datesdays)
    specimenDf = specimenDf[specimenDf['SAMPLE_ID'].isin(
        genie_clinicalDf['SAMPLE_ID'])]
    # dates = ['START_DATE', 'STOP_DATE', 'LINE_START']
    finalTimelineDf = finalTimelineDf.append(specimenDf, sort=False)
    # No need to convert timeline dates to months
    # finalTimelineDf[dates] = \
    #     finalTimelineDf[dates].applymap(change_days_to_months)
    finalTimelineDf = finalTimelineDf[ordering]
    finalTimelineDf = finalTimelineDf[finalTimelineDf['PATIENT_ID'].isin(
        genie_clinicalDf['PATIENT_ID'])]
    finalTimelineDf['AGENT'][finalTimelineDf['AGENT'].isnull()] = "Unknown"
    timelineText = finalTimelineDf.to_csv(index=False, sep="\t")
    timelineText = replace0(timelineText)
    timeline_path = "%s/data_timeline.txt" % self._SPONSORED_PROJECT
    with open(timeline_path, 'w') as timelineFile:
        timelineFile.write(timelineText)
    if not self.staging:
        fileEnt = File(timeline_path, parent=self._SP_SYN_ID)
        self.syn.store(fileEnt, used=labelledEnt.id,
                       executed=self._GITHUB_REPO)
    # Get database to synapse id mapping table so no need to
    # hardcode synapse ids
    databaseToSynIdMapping = \
        self.syn.tableQuery('SELECT * FROM syn10967259')
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
    centerMafFileViewSynId = databaseToSynIdMappingDf['Id'][
        databaseToSynIdMappingDf['Database'] == "centerMafView"][0]
    centerMafSynIds = self.syn.tableQuery(
        "select id from {} where name like '%mutation%'".format(
            centerMafFileViewSynId))
    centerMafSynIdsDf = centerMafSynIds.asDataFrame()
    # This value must be set outside here because the first maf file might
    # Not be part of the centers
    index = 0
    mafpath = "{}/data_mutations_extended.txt".format(
        self._SPONSORED_PROJECT)
    # Merge per-center maf files, keeping only SP sample rows
    for mafSynId in centerMafSynIdsDf.id:
        mafEnt = self.syn.get(mafSynId, downloadFile=False)
        # assumes maf filenames are underscore-delimited with the center
        # in position 3 — TODO confirm naming convention
        mafcenter = mafEnt.name.split("_")[3]
        if mafcenter in finalSampleDf['CENTER'].tolist():
            mafEnt = self.syn.get(mafSynId)
            print("running", mafEnt.name)
            with open(mafEnt.path, "r") as mafFile:
                header = mafFile.readline()
                headers = header.replace("\n", "").split("\t")
                # Write the header only once, from the first center used
                if index == 0:
                    with open(mafpath, 'w') as f:
                        f.write(header)
                    index += 1
                for row in mafFile:
                    rowArray = row.replace("\n", "").split("\t")
                    center = rowArray[headers.index('Center')]
                    newMergedRow = configureMafRow(
                        rowArray, headers, finalSampleDf['SAMPLE_ID'])
                    if newMergedRow is not None:
                        with open(mafpath, 'a') as f:
                            f.write(newMergedRow)
    # No longer need to pulling from non genie db
    fileEnt = File(mafpath, parent=self._SP_SYN_ID)
    if not self.staging:
        self.syn.store(fileEnt, used=centerMafSynIdsDf.id.tolist(),
                       executed=self._GITHUB_REPO)
    CNA_PATH = "%s/data_CNA.txt" % self._SPONSORED_PROJECT
    CNA_CENTER_PATH = self._SPONSORED_PROJECT + "/data_CNA_%s.txt"
    centerCNASynIds = self.syn.tableQuery(
        "select id from {} where name like 'data_CNA%'".format(
            centerMafFileViewSynId))
    centerCNASynIdsDf = centerCNASynIds.asDataFrame()
    # Grab all unique symbols and form cnaTemplate
    allSymbols = set()
    for cnaSynId in centerCNASynIdsDf.id:
        cnaEnt = self.syn.get(cnaSynId)
        with open(cnaEnt.path, "r") as cnaFile:
            # Read first line first to get all the samples
            cnaFile.readline()
            # Get all hugo symbols
            allSymbols = allSymbols.union(
                set(line.split("\t")[0] for line in cnaFile))
    cnaTemplate = pd.DataFrame({"Hugo_Symbol": list(allSymbols)})
    cnaTemplate.sort_values("Hugo_Symbol", inplace=True)
    cnaTemplate.to_csv(CNA_PATH, sep="\t", index=False)
    # Columns kept in the merged CNA matrix: Hugo_Symbol + SP samples
    withMergedHugoSymbol = pd.Series("Hugo_Symbol")
    withMergedHugoSymbol = \
        withMergedHugoSymbol.append(pd.Series(finalSampleDf['SAMPLE_ID']))
    cnaSamples = []
    for cnaSynId in centerCNASynIdsDf.id:
        cnaEnt = self.syn.get(cnaSynId)
        center = cnaEnt.name.replace("data_CNA_", "").replace(".txt", "")
        print(cnaEnt.path)
        # if center in CENTER_MAPPING_DF.center.tolist():
        centerCNA = pd.read_csv(cnaEnt.path, sep="\t")
        merged = cnaTemplate.merge(centerCNA, on="Hugo_Symbol",
                                   how="outer")
        merged.sort_values("Hugo_Symbol", inplace=True)
        # This is to remove more samples for the final cna file
        merged = merged[merged.columns[merged.columns.isin(
            withMergedHugoSymbol)]]
        cnaText = process_functions.removePandasDfFloat(merged)
        # Must do this replace twice because \t\t\t ->
        # \tNA\t\t -> \tNA\tNA\t
        cnaText = cnaText.replace("\t\t", "\tNA\t").replace(
            "\t\t", "\tNA\t").replace('\t\n', "\tNA\n")
        with open(CNA_CENTER_PATH % center, "w") as cnaFile:
            cnaFile.write(cnaText)
        cnaSamples.extend(merged.columns[1:].tolist())
        # Join CNA file
        # NOTE(review): shells out to POSIX `join`; relies on a
        # unix-like environment being available
        joinCommand = ["join", CNA_PATH, CNA_CENTER_PATH % center]
        output = subprocess.check_output(joinCommand)
        with open(CNA_PATH, "w") as cnaFile:
            cnaFile.write(output.decode("utf-8").replace(" ", "\t"))
    fileEnt = File(CNA_PATH, parent=self._SP_SYN_ID)
    if not self.staging:
        self.syn.store(fileEnt, used=centerCNASynIdsDf.id.tolist(),
                       executed=self._GITHUB_REPO)
    self.createGeneMatrixDf(finalSampleDf, cnaSamples, labelledEnt)
    # Fusions for the SP samples, renamed to cBioPortal headers
    fusion = self.syn.tableQuery("SELECT * FROM syn7893268 where "
                                 "TUMOR_SAMPLE_BARCODE in ('{}')".format(
                                     "','".join(
                                         finalSampleDf['SAMPLE_ID'])))
    fusions_df = fusion.asDataFrame()
    if not fusions_df.empty:
        fusions_df = fusions_df.rename(
            columns={
                'HUGO_SYMBOL': 'Hugo_Symbol',
                'ENTREZ_GENE_ID': 'Entrez_Gene_Id',
                'CENTER': 'Center',
                'TUMOR_SAMPLE_BARCODE': 'Tumor_Sample_Barcode',
                'FUSION': 'Fusion',
                'DNA_SUPPORT': 'DNA_support',
                'RNA_SUPPORT': 'RNA_support',
                'METHOD': 'Method',
                'FRAME': 'Frame',
                'COMMENTS': 'Comments'
            })
        # NOTE(review): pd.np was removed in pandas >= 1.0 — this line
        # breaks on modern pandas; verify the pinned pandas version
        fusions_df.Entrez_Gene_Id[fusions_df.Entrez_Gene_Id ==
                                  0] = pd.np.nan
        fusionText = fusions_df.to_csv(sep="\t", index=False)
        fusionText = replace0(fusionText)
        fusion_path = "%s/data_fusions.txt" % self._SPONSORED_PROJECT
        with open(fusion_path, "w") as fusionFile:
            fusionFile.write(fusionText)
        fileEnt = File(fusion_path, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, used=fusion.tableId,
                           executed=self._GITHUB_REPO)
    # Seg data for the SP samples, renamed to cBioPortal seg headers
    seg = self.syn.tableQuery(
        "SELECT ID, CHROM, LOCSTART, LOCEND, NUMMARK, SEGMEAN "
        "FROM syn7893341 where ID in ('{}')".format("','".join(
            finalSampleDf['SAMPLE_ID'])))
    seg_df = seg.asDataFrame()
    if not seg_df.empty:
        seg_df.rename(columns={
            "CHROM": "chrom",
            "LOCSTART": "loc.start",
            "LOCEND": "loc.end",
            "NUMMARK": "num.mark",
            "SEGMEAN": "seg.mean"
        }, inplace=True)
        segText = replace0(seg_df.to_csv(sep="\t", index=False))
        segpath = "{}/genie_{}_data_cna_hg19.seg".format(
            self._SPONSORED_PROJECT, self._SPONSORED_PROJECT.lower())
        with open(segpath, 'w') as segFile:
            segFile.write(segText)
        fileEnt = File(segpath, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, used=seg.tableId,
                           executed=self._GITHUB_REPO)
    # Create case lists
    if not os.path.exists(self._CASE_LIST_PATH):
        os.mkdir(self._CASE_LIST_PATH)
    else:
        caselists = os.listdir(self._CASE_LIST_PATH)
        for caselist in caselists:
            os.remove(os.path.join(self._CASE_LIST_PATH, caselist))
    # Write out cases sequenced so people can tell
    # which samples were sequenced
    create_case_lists.main(
        "%s/data_clinical.txt" % self._SPONSORED_PROJECT,
        "%s/data_gene_matrix.txt" % self._SPONSORED_PROJECT,
        self._CASE_LIST_PATH,
        "genie_{}".format(self._SPONSORED_PROJECT.lower()))
    caseListFiles = os.listdir(self._CASE_LIST_PATH)
    for casePath in caseListFiles:
        casePath = os.path.join(self._CASE_LIST_PATH, casePath)
        fileEnt = File(casePath, parent=self._CASE_LIST_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=[patientEnt.id, sampleEnt.id],
                           executed=self._GITHUB_REPO)
    # Build gene panel files from the bed databases (GENIE + non-GENIE)
    seq_assays = "','".join(set(finalSampleDf['SEQ_ASSAY_ID']))
    bed = self.syn.tableQuery(
        "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn8457748 where "
        "SEQ_ASSAY_ID in ('{}') and "
        "Feature_Type = 'exon' and "
        "Hugo_Symbol is not null and "
        "includeInPanel is true".format(seq_assays))
    beddf = bed.asDataFrame()
    bed = self.syn.tableQuery(
        "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn11516678 where "
        "SEQ_ASSAY_ID in ('{}') and "
        "Feature_Type = 'exon' and "
        "Hugo_Symbol is not null and "
        "includeInPanel is true".format(seq_assays))
    non_genie_beddf = bed.asDataFrame()
    beddf = beddf.append(non_genie_beddf)
    seq_assay_groups = beddf.groupby('SEQ_ASSAY_ID')
    for seq_assay_id, seqdf in seq_assay_groups:
        unique_genes = seqdf.Hugo_Symbol.unique()
        gene_panel_text = ("stable_id: {seq_assay_id}\n"
                           "description: {seq_assay_id}, "
                           "Number of Genes - {num_genes}\n"
                           "gene_list:\t{genelist}".format(
                               seq_assay_id=seq_assay_id,
                               num_genes=len(unique_genes),
                               genelist="\t".join(unique_genes)))
        gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"
        gene_panel_path = os.path.join(self._SPONSORED_PROJECT,
                                       gene_panel_name)
        with open(gene_panel_path, "w+") as f:
            f.write(gene_panel_text)
        fileEnt = File(gene_panel_path, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, executed=self._GITHUB_REPO)
    # Make sure to re download all the metadata files again
    self.reviseMetadataFiles()
    # Validate the assembled study with the cBioPortal validator
    cmd = [
        'python',
        os.path.join(self.cbioPath,
                     "core/src/main/scripts/importer/validateData.py"),
        "-s", self._SPONSORED_PROJECT, "-n"
    ]
    subprocess.call(cmd)
def process_steps(
    self,
    clinicalDf,
    databaseToSynIdMappingDf,
    newPath,
    parentId,
    oncotree_link,
    clinicalTemplate,
    sample,
    patient,
    patientCols,
    sampleCols,
):
    """Process clinical file, redact PHI values, upload to clinical database

    Args:
        clinicalDf: Clinical dataframe
        databaseToSynIdMappingDf: Mapping dataframe with 'Database' and
            'Id' columns linking database names to Synapse ids
        newPath: Path to write the processed clinical file
        parentId: Synapse id passed to uploadMissingData
        oncotree_link: Link to oncotree used for code validation
        clinicalTemplate: Template dataframe forwarded to _process
        sample: True if sample rows should be processed/uploaded
        patient: True if patient rows should be processed/uploaded
        patientCols: Expected patient column names
        sampleCols: Expected sample column names

    Returns:
        Path to the processed clinical file
    """
    # NOTE(review): label-based [0] assumes the matching mapping row has
    # index label 0 — confirm the mapping df always has a fresh index
    patientdb_idx = databaseToSynIdMappingDf["Database"] == "patient"
    patient_synid = databaseToSynIdMappingDf.Id[patientdb_idx][0]
    sampledb_idx = databaseToSynIdMappingDf["Database"] == "sample"
    sample_synid = databaseToSynIdMappingDf.Id[sampledb_idx][0]
    newClinicalDf = self._process(clinicalDf, clinicalTemplate)
    # PHI redaction happens before any upload or file write
    newClinicalDf = redact_phi(newClinicalDf)
    if patient:
        # Keep only the expected patient columns actually present
        cols = newClinicalDf.columns[newClinicalDf.columns.isin(patientCols)]
        patientClinical = newClinicalDf[cols].drop_duplicates("PATIENT_ID")
        self.uploadMissingData(
            patientClinical, "PATIENT_ID", patient_synid, parentId
        )
        process_functions.updateData(
            self.syn,
            patient_synid,
            patientClinical,
            self.center,
            col=cols.tolist(),
            toDelete=True,
        )
    if sample:
        # Keep only the expected sample columns actually present
        cols = newClinicalDf.columns[newClinicalDf.columns.isin(sampleCols)]
        if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
            logger.error(
                "There are duplicated samples, "
                "and the duplicates are removed"
            )
        sampleClinical = newClinicalDf[cols].drop_duplicates("SAMPLE_ID")
        # Exclude all clinical samples with wrong oncotree codes
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
            oncotree_link
        )
        # Add in unknown key for oncotree code
        oncotree_mapping_dict["UNKNOWN"] = {}
        oncotree_mapping["ONCOTREE_CODE"] = list(oncotree_mapping_dict.keys())
        # Make oncotree codes uppercase (SpCC/SPCC)
        sampleClinical["ONCOTREE_CODE"] = (
            sampleClinical["ONCOTREE_CODE"].astype(str).str.upper()
        )
        sampleClinical = sampleClinical[
            sampleClinical["ONCOTREE_CODE"].isin(
                oncotree_mapping["ONCOTREE_CODE"]
            )
        ]
        self.uploadMissingData(
            sampleClinical, "SAMPLE_ID", sample_synid, parentId
        )
        # ,retractedSampleSynId)
        process_functions.updateData(
            self.syn,
            sample_synid,
            sampleClinical,
            self.center,
            col=cols.tolist(),
            toDelete=True,
        )
    newClinicalDf.to_csv(newPath, sep="\t", index=False)
    return newPath