def test_get_oncotree_code_mappings():
    """get_oncotree_code_mappings fetches the oncotree URL once and parses it.

    Patches the network fetch so the test is offline; compares the parsed
    mapping against the expected module-level fixture.
    """
    from genie import process_functions
    with mock.patch(
        "genie.process_functions.retry_get_url", return_value=fake_oncotree
    ) as retry_get_url:
        onco_mapping = \
            process_functions.get_oncotree_code_mappings(json_oncotreeurl)
    # Fix: the original used `retry_get_url.called_once_with(...)`, which is
    # NOT a Mock assertion method — attribute access on a Mock just creates a
    # child mock, so that line always "passed" silently without checking
    # anything.  `assert_called_once_with` performs the real verification.
    retry_get_url.assert_called_once_with(json_oncotreeurl)
    assert onco_mapping == expected_onco_mapping
def _process(self, patientCountsDf, oncotreeLink):
    """Annotate the patient counts dataframe in place.

    Adds a CENTER column (this center's abbreviation) and a PRIMARY_CODE
    column holding each ONCOTREE_CODE's primary oncotree node.

    Args:
        patientCountsDf: Patient counts dataframe with an ONCOTREE_CODE column
        oncotreeLink: Link to oncotree

    Returns:
        The annotated patient counts dataframe
    """
    patientCountsDf['CENTER'] = self.center
    code_mappings = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    primary_nodes = []
    for code in patientCountsDf.ONCOTREE_CODE:
        primary_nodes.append(
            code_mappings[code.upper()]['ONCOTREE_PRIMARY_NODE'])
    patientCountsDf['PRIMARY_CODE'] = primary_nodes
    return patientCountsDf
def _validate(self, patCountsDf, oncotreeLink):
    """Validate the patient counts file.

    Verifies that ONCOTREE_CODE exists, has no duplicates, and maps into the
    oncotree, and that NUM_PATIENTS_PD1_PDL1 exists and holds only integers.

    Args:
        patCountsDf: Patient counts dataframe
        oncotreeLink: Link to oncotree

    Returns:
        Tuple of error string and warning string
    """
    total_error = ""
    warning = ""
    oncotree_mapping = pd.DataFrame()
    oncotree_mapping_dict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()

    has_oncotree_col = \
        process_functions.checkColExist(patCountsDf, "ONCOTREE_CODE")
    if not has_oncotree_col:
        total_error += (
            "Patient Counts: File must have ONCOTREE_CODE column.\n")
    else:
        if patCountsDf['ONCOTREE_CODE'].duplicated().sum() > 0:
            total_error += (
                "Patient Counts: "
                "Must not have any duplicated ONCOTREE CODES.\n")
        mapped = patCountsDf['ONCOTREE_CODE'].isin(
            oncotree_mapping['ONCOTREE_CODE'])
        if not mapped.all():
            unmapped_oncotrees = patCountsDf['ONCOTREE_CODE'][~mapped]
            total_error += (
                "Patient Counts: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} codes "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(len(unmapped_oncotrees),
                                         ",".join(set(unmapped_oncotrees))))

    has_pd1_col = process_functions.checkColExist(patCountsDf,
                                                  "NUM_PATIENTS_PD1_PDL1")
    if not has_pd1_col:
        total_error += ("Patient Counts: File must have "
                        "NUM_PATIENTS_PD1_PDL1 column.\n")
    else:
        # NOTE(review): isinstance(..., int) is False for numpy integer
        # types — kept as-is to preserve behavior; confirm dtype upstream.
        all_ints = all(
            isinstance(count, int)
            for count in patCountsDf['NUM_PATIENTS_PD1_PDL1'])
        if not all_ints:
            total_error += (
                "Patient Counts: Must not have any null values, "
                "and must be all integers.\n")
    return (total_error, warning)
def main():
    """
    Parses a clinical file with a ONCOTREE_CODE column and add/update the
    CANCER_TYPE and CANCER_TYPE_DETAILED columns inplace with values
    from an oncotree instance.

    Exits with status 2 if the clinical file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--oncotree-url', action='store',
                        dest='oncotree_url', required=True,
                        help='The url of the raw oncotree text file')
    parser.add_argument('-c', '--clinical-file', action='store',
                        dest='clinical_file', required=True,
                        help='Path to the clinical file')
    parser.add_argument('-j', '--json', action='store_true', dest='json',
                        help='If oncotree url is json format')
    args = parser.parse_args()
    oncotree_url = args.oncotree_url
    clinical_filename = args.clinical_file
    # Fix: previously bound to a local named `json`, which shadowed the
    # stdlib `json` module.  NOTE(review): this flag is currently unused —
    # the format is decided below by whether get_oncotree_codes returned an
    # empty frame; confirm whether --json should drive that branch instead.
    use_json = args.json

    if not os.path.exists(clinical_filename):
        print('clinical file cannot be found ' + clinical_filename)
        sys.exit(2)

    oncotree = process.get_oncotree_codes(oncotree_url)
    if oncotree.empty:
        # Fall back to the JSON oncotree endpoint
        oncotree = process.get_oncotree_code_mappings(oncotree_url)
        process_clinical_file_json(oncotree, clinical_filename)
    else:
        oncotree = get_oncotree(oncotree_url)
        # Header columns whose names contain "level" describe the hierarchy
        spreadsheet_fields = [
            i for i in oncotree[0].split("\t") if "level" in i
        ]
        spreadsheet_fields.reverse()
        process_clinical_file(oncotree, clinical_filename,
                              spreadsheet_fields)
    report_failed_matches()
def _validate(self, clinicaldf, oncotree_link):
    """
    This function validates the clinical file to make sure it adhere
    to the clinical SOP.

    Args:
        clinicaldf: Merged clinical file with patient and sample information
        oncotree_link: Link to oncotree

    Returns:
        Tuple of (error message string, warning message string)
    """
    total_error = StringIO()
    warning = StringIO()
    clinicaldf.columns = [col.upper() for col in clinicaldf.columns]

    # CHECK: for empty rows
    empty_rows = clinicaldf.isnull().values.all(axis=1)
    if empty_rows.any():
        total_error.write("Clinical file(s): No empty rows allowed.\n")
        # Remove completely empty rows to speed up processing
        clinicaldf = clinicaldf[~empty_rows]

    clinicaldf = clinicaldf.fillna("")
    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotree_link
    )
    oncotree_mapping = pd.DataFrame(
        {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
    )
    # Controlled-vocabulary mapping tables pulled from Synapse
    sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sample_id = "SAMPLE_ID"
    haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

    if not haveSampleColumn:
        total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
    else:
        if sum(clinicaldf[sample_id].duplicated()) > 0:
            total_error.write(
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n"
            )

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # #CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

    if not havePatientColumn:
        total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
        clinicaldf[patientId] = clinicaldf[patientId].astype(str)
        # Every SAMPLE_ID must contain its PATIENT_ID as a substring
        if not all(
            [
                patient in sample
                for sample, patient in zip(
                    clinicaldf[sample_id], clinicaldf[patientId]
                )
            ]
        ):
            total_error.write(
                "Sample Clinical File: PATIENT_ID's much be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n"
            )
        # #CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicaldf[patientId] != ""):
            total_error.write(
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                    )
                )
            )
        # CHECK: All patients should have associated sample data
        if not all(clinicaldf[sample_id] != ""):
            # ## MAKE WARNING FOR NOW###
            warning.write(
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(
                    ", ".join(
                        clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                    )
                )
            )

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicaldf, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = clinicaldf[
            ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
        ]
        # age_seq_report_df[age] = \
        #     remove_greaterthan_lessthan_str(age_seq_report_df[age])
        if not all([process_functions.checkInt(i) for i in age_seq_report_df[age]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                "'>32485', '<6570'.\n"
            )
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            median_age = age_seq_report_df[age].median()
            # Median under 100 strongly suggests years were reported
            # instead of days
            if median_age < 100:
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n"
                )
    else:
        total_error.write(
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
        )

    # CHECK: ONCOTREE_CODE
    haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )
        oncotree_codes = clinicaldf["ONCOTREE_CODE"][
            clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
        ]
        if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
            ]
            total_error.write(
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees)),
                )
            )
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
            and "oncotree_mapping_dict" in locals()
            and havePatientColumn
            and haveSampleColumn
        ):
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(
                clinicaldf["ONCOTREE_CODE"],
                clinicaldf["PATIENT_ID"],
                clinicaldf["SAMPLE_ID"],
            ):
                if (
                    oncotree_mapping_dict.get(code) is not None
                    and sum(clinicaldf["PATIENT_ID"] == patient) > 0
                ):
                    primaryCode = oncotree_mapping_dict[code][
                        "ONCOTREE_PRIMARY_NODE"
                    ]
                    sex = clinicaldf["SEX"][
                        clinicaldf["PATIENT_ID"] == patient
                    ].values[0]
                    sex = float("nan") if sex == "" else float(sex)
                    # NOTE(review): SEX appears to be coded numerically
                    # (1.0 = male, 2.0 = female given the comparisons
                    # below) — confirm against the sex mapping table.
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in maleOncoCodes
                        and sex != 1.0
                    ):
                        wrongCodeSamples.append(sample)
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in womenOncoCodes
                        and sex != 2.0
                    ):
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning.write(
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)
                    )
                )
    else:
        total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

    # CHECK: SAMPLE_TYPE against the controlled vocabulary
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SAMPLE_TYPE",
        sampletype_mapping["CODE"].tolist(),
        "Sample Clinical File",
        required=True,
    )
    total_error.write(error)

    # CHECK: SEQ_ASSAY_ID
    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicaldf["SEQ_ASSAY_ID"]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n"
            )
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        empty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
        seqassay_ids = clinicaldf.SEQ_ASSAY_ID[empty_seq_idx]
        uniq_seqassay_ids = seqassay_ids.unique()
        invalid_seqassay = []
        for seqassay in uniq_seqassay_ids:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                invalid_seqassay.append(seqassay)
        if invalid_seqassay:
            total_error.write(
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
            )
    else:
        total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

    # CHECK: SEQ_DATE — quarter labels (Jan/Apr/Jul/Oct-YEAR) or 'Release'
    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n"
    )
    if haveColumn:
        # Normalize capitalization ("jan-2017" -> "Jan-2017")
        clinicaldf["SEQ_DATE"] = [
            i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
        ]
        seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
        if sum(clinicaldf["SEQ_DATE"] == "") > 0:
            total_error.write(
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n"
            )
        try:
            if not seqdate.empty:
                # Parse as %b-%Y to reject malformed month/year strings
                seqdate.apply(
                    lambda date: datetime.datetime.strptime(date, "%b-%Y")
                )
                if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                    total_error.write(seq_date_error)
        except ValueError:
            total_error.write(seq_date_error)
    else:
        total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

    # CHECK: BIRTH_YEAR
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="BIRTH_YEAR",
        filename="Patient Clinical File",
        allowed_string_values=["Unknown", ">89", "<18"],
    )
    total_error.write(error)

    # CHECK: YEAR DEATH
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_DEATH",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Applicable",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: YEAR CONTACT
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_CONTACT",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: INT CONTACT
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_CONTACT
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

    # INT DOD
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_DOD
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Applicable",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected', 'Not Released' or "
                "'Not Applicable'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

    # CHECK: DEAD
    haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all(
            [
                str(i).upper() in ["TRUE", "FALSE"]
                for i in clinicaldf.DEAD
                if i not in ["Unknown", "Not Collected", "Not Released"]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown', "
                "'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have DEAD column.\n")

    # CHECK: contact vital status value consistency
    contact_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_CONTACT", "INT_CONTACT"],
        string_vals=["Not Collected", "Unknown", "Not Released"],
    )
    total_error.write(contact_error)

    # CHECK: death vital status value consistency
    death_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_DEATH", "INT_DOD"],
        string_vals=[
            "Not Collected",
            "Unknown",
            "Not Applicable",
            "Not Released",
        ],
    )
    total_error.write(death_error)

    death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
    total_error.write(death_error)

    # CHECK: SAMPLE_CLASS is optional attribute
    have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
    if have_column:
        sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
        if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
            total_error.write(
                "Sample Clinical File: SAMPLE_CLASS column must "
                "be 'Tumor', or 'cfDNA'\n"
            )

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "PRIMARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SECONDARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "TERTIARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SEX",
        sex_mapping["CODE"].tolist(),
        "Patient Clinical File",
        required=True,
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "ETHNICITY",
        ethnicity_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    return total_error.getvalue(), warning.getvalue()
def process_steps(
    self,
    clinicalDf,
    databaseToSynIdMappingDf,
    newPath,
    parentId,
    oncotree_link,
    clinicalTemplate,
    sample,
    patient,
    patientCols,
    sampleCols,
):
    """Process clincial file, redact PHI values, upload to clinical database

    Args:
        clinicalDf: Raw clinical dataframe
        databaseToSynIdMappingDf: Mapping with 'Database' and 'Id' columns,
            used to locate the patient and sample Synapse tables
        newPath: Path to write the processed clinical file to
        parentId: Synapse parent for uploads of missing data
        oncotree_link: Link to oncotree
        clinicalTemplate: Template passed through to self._process
        sample: If True, upload the sample portion
        patient: If True, upload the patient portion
        patientCols: Columns belonging to the patient table
        sampleCols: Columns belonging to the sample table

    Returns:
        Path of the written clinical file (newPath)
    """
    # NOTE(review): label-based [0] lookup assumes the mapping dataframe
    # keeps its default 0..n integer index — verify upstream.
    patientdb_idx = databaseToSynIdMappingDf["Database"] == "patient"
    patient_synid = databaseToSynIdMappingDf.Id[patientdb_idx][0]
    sampledb_idx = databaseToSynIdMappingDf["Database"] == "sample"
    sample_synid = databaseToSynIdMappingDf.Id[sampledb_idx][0]

    newClinicalDf = self._process(clinicalDf, clinicalTemplate)
    # Remove PHI before anything is uploaded
    newClinicalDf = redact_phi(newClinicalDf)

    if patient:
        cols = newClinicalDf.columns[newClinicalDf.columns.isin(patientCols)]
        patientClinical = newClinicalDf[cols].drop_duplicates("PATIENT_ID")
        self.uploadMissingData(
            patientClinical, "PATIENT_ID", patient_synid, parentId
        )

        process_functions.updateData(
            self.syn,
            patient_synid,
            patientClinical,
            self.center,
            col=cols.tolist(),
            toDelete=True,
        )
    if sample:
        cols = newClinicalDf.columns[newClinicalDf.columns.isin(sampleCols)]
        if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
            logger.error(
                "There are duplicated samples, "
                "and the duplicates are removed"
            )
        sampleClinical = newClinicalDf[cols].drop_duplicates("SAMPLE_ID")
        # Exclude all clinical samples with wrong oncotree codes
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
            oncotree_link
        )
        # Add in unknown key for oncotree code
        oncotree_mapping_dict["UNKNOWN"] = {}
        oncotree_mapping["ONCOTREE_CODE"] = list(oncotree_mapping_dict.keys())
        # Make oncotree codes uppercase (SpCC/SPCC)
        sampleClinical["ONCOTREE_CODE"] = (
            sampleClinical["ONCOTREE_CODE"].astype(str).str.upper()
        )
        sampleClinical = sampleClinical[
            sampleClinical["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])
        ]
        self.uploadMissingData(sampleClinical, "SAMPLE_ID", sample_synid, parentId)
        # ,retractedSampleSynId)
        process_functions.updateData(
            self.syn,
            sample_synid,
            sampleClinical,
            self.center,
            col=cols.tolist(),
            toDelete=True,
        )

    newClinicalDf.to_csv(newPath, sep="\t", index=False)
    return newPath
def run(self):
    """
    This function runs the redcap export to export all files.

    Pipeline phases (all outputs written under self._SPONSORED_PROJECT and,
    unless self.staging is set, stored back to Synapse):
    clinical patient/sample files, timeline, mutations (MAF), CNA, fusions,
    seg file, case lists, gene panels, then cbioportal validation.
    """
    # Start from a clean output directory (keep case_lists subdir)
    if not os.path.exists(self._SPONSORED_PROJECT):
        os.mkdir(self._SPONSORED_PROJECT)
    else:
        filelists = os.listdir(self._SPONSORED_PROJECT)
        for file in filelists:
            if file != "case_lists":
                os.remove(os.path.join(self._SPONSORED_PROJECT, file))

    # Create full mapping table to get the values of the data model
    mapping = self.syn.tableQuery(
        "select genie_field_name,instrument from {} where "
        "{} is true and phi is false".format(
            self._DATA_ELEMENT_SYN_ID, self._SPONSORED_PROJECT.lower()))
    mappingDf = mapping.asDataFrame()

    newMappingDf = pd.DataFrame()
    for field, instrument in zip(mappingDf.genie_field_name,
                                 mappingDf.instrument):
        # Do not want to append the # values
        if "#" in field:
            # find fields with # and replace with however many times
            # it should loop through
            newfields = [
                field.replace("#", str(count))
                for count in list(range(1, self._NUM_COUNTS + 1))
            ]
            newDataFrame = pd.DataFrame({
                "genie_field_name": newfields,
                "instrument": [instrument] * len(newfields)
            })
        else:
            newDataFrame = pd.DataFrame(
                {
                    "genie_field_name": field,
                    "instrument": instrument
                }, index=[0])
        newMappingDf = newMappingDf.append(newDataFrame, sort=False)

    # If there are ever missing fields, they must be added in
    # or else the script will fail
    # missingFields= ['her_status_sample','sample_seq_yn']
    # missingFieldType = ['sample_information']*2
    # newMappingDf = newMappingDf.append(pd.DataFrame({
    #     "genie_field_name": missingFields,
    #     "instrument": missingFieldType}))

    # Extract patient/sample/treatment columns
    patientCols = extractColumns(newMappingDf, [
        "patient_information", "treatment_information_general",
        "diagnosis_information"
    ], [
        'errors_patient_info_yn', 'patient_info_errors',
        'errors_dx_info_yn', 'dx_info_errors', 'so_yn'
    ])
    sampleCols = extractColumns(newMappingDf, ["sample_information"], [
        "test_sample", "fgfr4_variant", "errors_sample_info_yn",
        "sample_info_errors"
    ])
    treatmentCols = extractColumns(newMappingDf,
                                   ["treatment_information_detailed"], [])

    unlabelledEnt = self.syn.get(self._UNLABELLED_SYN_ID)
    labelledEnt = self.syn.get(self._LABELLED_SYN_ID)
    unlabeledDf = pd.read_csv(unlabelledEnt.path)
    labeledDf = pd.read_csv(labelledEnt.path)

    # Add on CENTER column for all three file formats
    patientCols.append("redcap_data_access_group")
    sampleCols.append("redcap_data_access_group")
    treatmentCols.append("redcap_data_access_group")

    labeledDf.columns = unlabeledDf.columns
    # NOTE(review): chained-assignment write below may not propagate on
    # newer pandas versions — verify the "hop" -> "JHU" fix still applies.
    labeledDf['redcap_data_access_group'][
        labeledDf['redcap_data_access_group'] == "hop"] = "JHU"
    labeledDf['redcap_data_access_group'] = \
        labeledDf['redcap_data_access_group'].apply(lambda x: x.upper())

    # Patient rows are the ones with no repeat instrument
    patientDf = labeledDf[patientCols]
    patientRows = labeledDf.redcap_repeat_instrument.isnull()
    patientDf = patientDf[patientRows]

    sampleDf = labeledDf[sampleCols]
    sampleRows = labeledDf.redcap_repeat_instrument == "Sample Information"
    sampleDf = sampleDf[sampleRows]

    # Red cap header to cbio header Table mapping
    redCapToCbioMapping = self.syn.tableQuery(
        "SELECT * FROM %s" % self._REDCAP_TO_CBIOMAPPING_SYNID)
    redCapToCbioMappingDf = redCapToCbioMapping.asDataFrame()

    # Get all the samples/patients that should be uploaded to SP projects
    # Hard coded clinical database
    genie_clinicalDb = self.syn.tableQuery(
        'select SAMPLE_ID, PATIENT_ID, ONCOTREE_CODE, SEQ_ASSAY_ID '
        'from syn7517674')
    genie_clinicalDf = genie_clinicalDb.asDataFrame()
    # Hard coded clinicalSP database
    # nonGenie_clinicalDb = self.syn.tableQuery(
    #     'SELECT * FROM syn11492579')
    # nonGenie_clinicalDf = nonGenie_clinicalDb.asDataFrame()
    # genie_clinicalDf = genie_clinicalDf.append(nonGenie_clinicalDf)

    # Only patients and samples that exist in the
    # sponsored project uploads are going to be pulled into the SP project
    finalPatientDf = self.configureClinicalDf(patientDf,
                                              redCapToCbioMappingDf)

    # Columns ending in INT hold day counts; convert to months for cbio
    patient_date_col = [
        col for col in finalPatientDf.columns if col.endswith("INT")
    ]
    patient_date_col.append("OS_MONTHS")
    # Keep a day-based copy for the timeline/specimen files below
    final_patientdf_datesdays = finalPatientDf.copy()
    finalPatientDf[patient_date_col] = \
        finalPatientDf[patient_date_col].applymap(change_days_to_months)

    subsetPatientDf = finalPatientDf[finalPatientDf['PATIENT_ID'].isin(
        genie_clinicalDf['PATIENT_ID'])]
    del subsetPatientDf['SP']
    # Remove CENTER and ONCOTREE_CODE from patient because you
    # cannot have these columns in both sample and patient Df,
    # it will fail validation for cbioportal
    del subsetPatientDf['CENTER']
    del subsetPatientDf['ONCOTREE_CODE']

    patientPath = self.writeClinicalFile(subsetPatientDf,
                                         redCapToCbioMappingDf, "patient")

    finalSampleDf = self.configureClinicalDf(sampleDf, redCapToCbioMappingDf)

    sample_date_cols = ['SAMPLE_DATE_INT', 'AGE_AT_SEQ_REPORT']
    final_sampledf_datesdays = finalSampleDf.copy()
    finalSampleDf[sample_date_cols] = \
        finalSampleDf[sample_date_cols].applymap(change_days_to_months)

    # Fill in ONCOTREE_CODE
    finalSampleDf['ONCOTREE_CODE'] = [
        genie_clinicalDf['ONCOTREE_CODE'][genie_clinicalDf['SAMPLE_ID'] ==
                                          sample].values[0]
        if sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
        for sample in finalSampleDf['SAMPLE_ID']
    ]
    # Fill in SEQ_ASSAY_ID
    finalSampleDf['SEQ_ASSAY_ID'] = [
        genie_clinicalDf['SEQ_ASSAY_ID'][genie_clinicalDf['SAMPLE_ID'] ==
                                         sample].values[0]
        if sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
        for sample in finalSampleDf['SAMPLE_ID']
    ]
    subsetSampleDf = finalSampleDf[finalSampleDf['SAMPLE_ID'].isin(
        genie_clinicalDf['SAMPLE_ID'])]
    del subsetSampleDf['SP']
    samplePath = self.writeClinicalFile(subsetSampleDf,
                                        redCapToCbioMappingDf, "sample")

    # Remove oncotree code here, because no longer need it
    mergedClinicalDf = subsetSampleDf.merge(subsetPatientDf,
                                            on="PATIENT_ID", how="outer")
    if mergedClinicalDf.get("SAMPLE_ID") is not None:
        print("Samples not in GENIE clinical databases (SP and normal)")
        notFoundSamples = mergedClinicalDf[
            'SAMPLE_ID'][~mergedClinicalDf['SAMPLE_ID'].
                         isin(genie_clinicalDf['SAMPLE_ID'])]
        if not notFoundSamples.empty:
            print(notFoundSamples[~notFoundSamples.isnull()])
            notFoundSamples.to_csv("notfoundsamples.csv", header=False)
            if not self.staging:
                self.syn.store(
                    synapseclient.File(
                        "notfoundsamples.csv",
                        parent=self._SP_REDCAP_EXPORTS_SYNID))

    # Hard coded most up to date oncotree version
    oncotreeLink = self.syn.get("syn13890902").externalURL
    # Use the old oncotree link for now
    oncotreeLink = 'http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21'
    oncotreeDict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)

    mergedClinicalDf['CANCER_TYPE'] = [
        oncotreeDict[code.upper()].get("CANCER_TYPE", float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['CANCER_TYPE_DETAILED'] = [
        oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED", float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['ONCOTREE_PRIMARY_NODE'] = [
        oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE", float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf['ONCOTREE_SECONDARY_NODE'] = [
        oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE", float('nan'))
        for code in mergedClinicalDf['ONCOTREE_CODE']
    ]
    mergedClinicalDf.to_csv("%s/data_clinical.txt" % self._SPONSORED_PROJECT,
                            index=False, sep="\t")

    # Store patient/sample tables and files (skipped when staging)
    if not self.staging:
        process_functions.updateData(self.syn, "syn17010637",
                                     finalPatientDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP", toDelete=True)
        patientFileEnt = File(patientPath, parent=self._SP_SYN_ID)
        patientEnt = self.syn.store(patientFileEnt,
                                    used=labelledEnt.id,
                                    executed=self._GITHUB_REPO)
        process_functions.updateData(self.syn, "syn17010638", finalSampleDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP", toDelete=True)
        sampleFileEnt = File(samplePath, parent=self._SP_SYN_ID)
        sampleEnt = self.syn.store(sampleFileEnt,
                                   used=labelledEnt.id,
                                   executed=self._GITHUB_REPO)

    treatmentDf = labeledDf[treatmentCols]
    treatmentRows = labeledDf.redcap_repeat_instrument == \
        "Treatment Information Detailed"
    treatmentDf = treatmentDf[treatmentRows]

    finalTimelineDf = self.makeTimeLineDf(treatmentDf,
                                          final_patientdf_datesdays)
    finalTimelineDf.PATIENT_ID = finalTimelineDf.apply(
        lambda x: process_functions.checkGenieId(x['PATIENT_ID'],
                                                 x['CENTER']), axis=1)
    if not self.staging:
        process_functions.updateData(self.syn, "syn17011214",
                                     finalTimelineDf,
                                     self._SPONSORED_PROJECT,
                                     filterByColumn="SP", toDelete=True)

    # METASTATIC DIAGNOSIS (append to timeline)
    metaDiagnosisDf = self.createMetaDiagnosisDf(finalTimelineDf)
    # Maintain ordering of timeline
    ordering = finalTimelineDf.columns.tolist()
    # Two extra timeline columns from specimen file
    ordering.extend(["SAMPLE_ID", "SAMPLE_NOTES"])
    finalTimelineDf = finalTimelineDf.append(metaDiagnosisDf, sort=False)

    # Create specimen file to append to timeline file too
    specimenDf = self.createSpecimenDf(final_sampledf_datesdays,
                                       final_patientdf_datesdays)
    specimenDf = specimenDf[specimenDf['SAMPLE_ID'].isin(
        genie_clinicalDf['SAMPLE_ID'])]
    # dates = ['START_DATE', 'STOP_DATE', 'LINE_START']
    finalTimelineDf = finalTimelineDf.append(specimenDf, sort=False)
    # No need to convert timeline dates to months
    # finalTimelineDf[dates] = \
    #     finalTimelineDf[dates].applymap(change_days_to_months)
    finalTimelineDf = finalTimelineDf[ordering]
    finalTimelineDf = finalTimelineDf[finalTimelineDf['PATIENT_ID'].isin(
        genie_clinicalDf['PATIENT_ID'])]
    finalTimelineDf['AGENT'][finalTimelineDf['AGENT'].isnull()] = "Unknown"
    timelineText = finalTimelineDf.to_csv(index=False, sep="\t")
    timelineText = replace0(timelineText)
    timeline_path = "%s/data_timeline.txt" % self._SPONSORED_PROJECT
    with open(timeline_path, 'w') as timelineFile:
        timelineFile.write(timelineText)
    if not self.staging:
        fileEnt = File(timeline_path, parent=self._SP_SYN_ID)
        self.syn.store(fileEnt, used=labelledEnt.id,
                       executed=self._GITHUB_REPO)

    # Get database to synapse id mapping table so no need to
    # hardcode synapse ids
    databaseToSynIdMapping = \
        self.syn.tableQuery('SELECT * FROM syn10967259')
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
    centerMafFileViewSynId = databaseToSynIdMappingDf['Id'][
        databaseToSynIdMappingDf['Database'] == "centerMafView"][0]
    centerMafSynIds = self.syn.tableQuery(
        "select id from {} where name like '%mutation%'".format(
            centerMafFileViewSynId))
    centerMafSynIdsDf = centerMafSynIds.asDataFrame()
    # This value must be set outside here because the first maf file might
    # Not be part of the centers
    index = 0
    mafpath = "{}/data_mutations_extended.txt".format(
        self._SPONSORED_PROJECT)
    for mafSynId in centerMafSynIdsDf.id:
        mafEnt = self.syn.get(mafSynId, downloadFile=False)
        mafcenter = mafEnt.name.split("_")[3]
        if mafcenter in finalSampleDf['CENTER'].tolist():
            mafEnt = self.syn.get(mafSynId)
            print("running", mafEnt.name)
            with open(mafEnt.path, "r") as mafFile:
                header = mafFile.readline()
                headers = header.replace("\n", "").split("\t")
                # Write the header only once, from the first matching maf
                if index == 0:
                    with open(mafpath, 'w') as f:
                        f.write(header)
                    index += 1
                for row in mafFile:
                    rowArray = row.replace("\n", "").split("\t")
                    center = rowArray[headers.index('Center')]
                    newMergedRow = configureMafRow(
                        rowArray, headers, finalSampleDf['SAMPLE_ID'])
                    if newMergedRow is not None:
                        with open(mafpath, 'a') as f:
                            f.write(newMergedRow)
    # No longer need to pulling from non genie db
    fileEnt = File(mafpath, parent=self._SP_SYN_ID)
    if not self.staging:
        self.syn.store(fileEnt, used=centerMafSynIdsDf.id.tolist(),
                       executed=self._GITHUB_REPO)

    CNA_PATH = "%s/data_CNA.txt" % self._SPONSORED_PROJECT
    CNA_CENTER_PATH = self._SPONSORED_PROJECT + "/data_CNA_%s.txt"
    centerCNASynIds = self.syn.tableQuery(
        "select id from {} where name like 'data_CNA%'".format(
            centerMafFileViewSynId))
    centerCNASynIdsDf = centerCNASynIds.asDataFrame()
    # Grab all unique symbols and form cnaTemplate
    allSymbols = set()
    for cnaSynId in centerCNASynIdsDf.id:
        cnaEnt = self.syn.get(cnaSynId)
        with open(cnaEnt.path, "r") as cnaFile:
            # Read first line first to get all the samples
            cnaFile.readline()
            # Get all hugo symbols
            allSymbols = allSymbols.union(
                set(line.split("\t")[0] for line in cnaFile))
    cnaTemplate = pd.DataFrame({"Hugo_Symbol": list(allSymbols)})
    cnaTemplate.sort_values("Hugo_Symbol", inplace=True)
    cnaTemplate.to_csv(CNA_PATH, sep="\t", index=False)
    withMergedHugoSymbol = pd.Series("Hugo_Symbol")
    withMergedHugoSymbol = \
        withMergedHugoSymbol.append(pd.Series(finalSampleDf['SAMPLE_ID']))
    cnaSamples = []
    for cnaSynId in centerCNASynIdsDf.id:
        cnaEnt = self.syn.get(cnaSynId)
        center = cnaEnt.name.replace("data_CNA_", "").replace(".txt", "")
        print(cnaEnt.path)
        # if center in CENTER_MAPPING_DF.center.tolist():
        centerCNA = pd.read_csv(cnaEnt.path, sep="\t")
        merged = cnaTemplate.merge(centerCNA, on="Hugo_Symbol",
                                   how="outer")
        merged.sort_values("Hugo_Symbol", inplace=True)
        # This is to remove more samples for the final cna file
        merged = merged[merged.columns[merged.columns.isin(
            withMergedHugoSymbol)]]
        cnaText = process_functions.removePandasDfFloat(merged)
        # Must do this replace twice because \t\t\t ->
        # \tNA\t\t -> \tNA\tNA\t
        cnaText = cnaText.replace("\t\t", "\tNA\t").replace(
            "\t\t", "\tNA\t").replace('\t\n', "\tNA\n")
        with open(CNA_CENTER_PATH % center, "w") as cnaFile:
            cnaFile.write(cnaText)
        cnaSamples.extend(merged.columns[1:].tolist())
        # Join CNA file
        joinCommand = ["join", CNA_PATH, CNA_CENTER_PATH % center]
        output = subprocess.check_output(joinCommand)
        with open(CNA_PATH, "w") as cnaFile:
            cnaFile.write(output.decode("utf-8").replace(" ", "\t"))
    fileEnt = File(CNA_PATH, parent=self._SP_SYN_ID)
    if not self.staging:
        self.syn.store(fileEnt, used=centerCNASynIdsDf.id.tolist(),
                       executed=self._GITHUB_REPO)
    self.createGeneMatrixDf(finalSampleDf, cnaSamples, labelledEnt)

    fusion = self.syn.tableQuery("SELECT * FROM syn7893268 where "
                                 "TUMOR_SAMPLE_BARCODE in ('{}')".format(
                                     "','".join(
                                         finalSampleDf['SAMPLE_ID'])))
    fusions_df = fusion.asDataFrame()
    if not fusions_df.empty:
        fusions_df = fusions_df.rename(
            columns={
                'HUGO_SYMBOL': 'Hugo_Symbol',
                'ENTREZ_GENE_ID': 'Entrez_Gene_Id',
                'CENTER': 'Center',
                'TUMOR_SAMPLE_BARCODE': 'Tumor_Sample_Barcode',
                'FUSION': 'Fusion',
                'DNA_SUPPORT': 'DNA_support',
                'RNA_SUPPORT': 'RNA_support',
                'METHOD': 'Method',
                'FRAME': 'Frame',
                'COMMENTS': 'Comments'
            })
        # NOTE(review): pd.np is deprecated/removed in newer pandas —
        # verify the pinned pandas version still exposes it.
        fusions_df.Entrez_Gene_Id[fusions_df.Entrez_Gene_Id == 0] = pd.np.nan
        fusionText = fusions_df.to_csv(sep="\t", index=False)
        fusionText = replace0(fusionText)
        fusion_path = "%s/data_fusions.txt" % self._SPONSORED_PROJECT
        with open(fusion_path, "w") as fusionFile:
            fusionFile.write(fusionText)
        fileEnt = File(fusion_path, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, used=fusion.tableId,
                           executed=self._GITHUB_REPO)

    seg = self.syn.tableQuery(
        "SELECT ID, CHROM, LOCSTART, LOCEND, NUMMARK, SEGMEAN "
        "FROM syn7893341 where ID in ('{}')".format("','".join(
            finalSampleDf['SAMPLE_ID'])))
    seg_df = seg.asDataFrame()
    if not seg_df.empty:
        seg_df.rename(columns={
            "CHROM": "chrom",
            "LOCSTART": "loc.start",
            "LOCEND": "loc.end",
            "NUMMARK": "num.mark",
            "SEGMEAN": "seg.mean"
        }, inplace=True)
        segText = replace0(seg_df.to_csv(sep="\t", index=False))
        segpath = "{}/genie_{}_data_cna_hg19.seg".format(
            self._SPONSORED_PROJECT, self._SPONSORED_PROJECT.lower())
        with open(segpath, 'w') as segFile:
            segFile.write(segText)
        fileEnt = File(segpath, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, used=seg.tableId,
                           executed=self._GITHUB_REPO)

    # Create case lists
    if not os.path.exists(self._CASE_LIST_PATH):
        os.mkdir(self._CASE_LIST_PATH)
    else:
        caselists = os.listdir(self._CASE_LIST_PATH)
        for caselist in caselists:
            os.remove(os.path.join(self._CASE_LIST_PATH, caselist))
    # Write out cases sequenced so people can tell
    # which samples were sequenced
    create_case_lists.main(
        "%s/data_clinical.txt" % self._SPONSORED_PROJECT,
        "%s/data_gene_matrix.txt" % self._SPONSORED_PROJECT,
        self._CASE_LIST_PATH,
        "genie_{}".format(self._SPONSORED_PROJECT.lower()))

    caseListFiles = os.listdir(self._CASE_LIST_PATH)
    for casePath in caseListFiles:
        casePath = os.path.join(self._CASE_LIST_PATH, casePath)
        fileEnt = File(casePath, parent=self._CASE_LIST_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=[patientEnt.id, sampleEnt.id],
                           executed=self._GITHUB_REPO)

    # Build one gene panel file per SEQ_ASSAY_ID from the bed tables
    seq_assays = "','".join(set(finalSampleDf['SEQ_ASSAY_ID']))
    bed = self.syn.tableQuery(
        "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn8457748 where "
        "SEQ_ASSAY_ID in ('{}') and "
        "Feature_Type = 'exon' and "
        "Hugo_Symbol is not null and "
        "includeInPanel is true".format(seq_assays))
    beddf = bed.asDataFrame()
    bed = self.syn.tableQuery(
        "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn11516678 where "
        "SEQ_ASSAY_ID in ('{}') and "
        "Feature_Type = 'exon' and "
        "Hugo_Symbol is not null and "
        "includeInPanel is true".format(seq_assays))
    non_genie_beddf = bed.asDataFrame()
    beddf = beddf.append(non_genie_beddf)
    seq_assay_groups = beddf.groupby('SEQ_ASSAY_ID')
    for seq_assay_id, seqdf in seq_assay_groups:
        unique_genes = seqdf.Hugo_Symbol.unique()
        gene_panel_text = ("stable_id: {seq_assay_id}\n"
                           "description: {seq_assay_id}, "
                           "Number of Genes - {num_genes}\n"
                           "gene_list:\t{genelist}".format(
                               seq_assay_id=seq_assay_id,
                               num_genes=len(unique_genes),
                               genelist="\t".join(unique_genes)))
        gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"
        gene_panel_path = os.path.join(self._SPONSORED_PROJECT,
                                       gene_panel_name)
        with open(gene_panel_path, "w+") as f:
            f.write(gene_panel_text)
        fileEnt = File(gene_panel_path, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt, executed=self._GITHUB_REPO)

    # Make sure to re download all the metadata files again
    self.reviseMetadataFiles()
    # Run the cbioportal study validator over the assembled study folder
    cmd = [
        'python',
        os.path.join(self.cbioPath,
                     "core/src/main/scripts/importer/validateData.py"),
        "-s", self._SPONSORED_PROJECT, "-n"
    ]
    subprocess.call(cmd)
def _validate(self, clinicalDF, oncotreeLink):
    """
    This function validates the merged clinical file to make sure it
    adheres to the clinical SOP.

    Checks performed (each failure appends to the returned error or
    warning string rather than raising):
    SAMPLE_ID / PATIENT_ID presence and consistency, AGE_AT_SEQ_REPORT,
    ONCOTREE_CODE (existence in the oncotree mapping and SEX
    consistency), SAMPLE_TYPE, SEQ_ASSAY_ID, SEQ_DATE, BIRTH_YEAR,
    YEAR_DEATH, YEAR_CONTACT, INT_CONTACT, INT_DOD, DEAD, race /
    SEX / ETHNICITY code mappings.

    Args:
        clinicalDF: Merged clinical file with patient and sample
                    information
        oncotreeLink: Link to oncotree

    Returns:
        Tuple of (total_error, warning) message strings; empty strings
        mean the file passed.
    """
    total_error = ""
    warning = ""
    clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
    clinicalDF = clinicalDF.fillna("")
    # oncotree_mapping = process_functions.get_oncotree_codes(oncotreeLink)
    # if oncotree_mapping.empty:
    oncotree_mapping = pd.DataFrame()
    oncotree_mapping_dict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()
    sampleType_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sampleId = 'SAMPLE_ID'
    haveSampleColumn = \
        process_functions.checkColExist(clinicalDF, sampleId)
    if not haveSampleColumn:
        total_error += \
            "Sample Clinical File: Must have SAMPLE_ID column.\n"
    else:
        if sum(clinicalDF[sampleId].duplicated()) > 0:
            total_error += (
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n")

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # #CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = \
        process_functions.checkColExist(clinicalDF, patientId)
    if not havePatientColumn:
        total_error += \
            "Patient Clinical File: Must have PATIENT_ID column.\n"

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
        clinicalDF[patientId] = clinicalDF[patientId].astype(str)
        # Every SAMPLE_ID must contain its PATIENT_ID as a substring
        if not all([
                patient in sample for sample, patient in zip(
                    clinicalDF[sampleId], clinicalDF[patientId])
        ]):
            total_error += (
                "Sample Clinical File: PATIENT_ID's much be contained in "
                "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n")
        # #CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicalDF[patientId] != ""):
            total_error += (
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicalDF[sampleId][clinicalDF[patientId] == ""]))
            )
        # CHECK: All patients should have associated sample data
        if not all(clinicalDF[sampleId] != ""):
            # ## MAKE WARNING FOR NOW###
            warning += (
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(", ".join(
                    clinicalDF[patientId][clinicalDF[sampleId] == ""])))

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicalDF, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = \
            clinicalDF[~clinicalDF[age].isin(['Unknown'])]
        age_seq_report_df[age] = \
            remove_greaterthan_lessthan_str(age_seq_report_df[age])
        if not all([
                process_functions.checkInt(i)
                for i in age_seq_report_df[age]
        ]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            # pd.np is deprecated/removed in newer pandas; use the
            # Series' own median instead.
            median_age = age_seq_report_df[age].median()
            # Ages must be reported in days; a median under 100 almost
            # certainly means the center reported years.
            if median_age < 100:
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n")
    else:
        total_error += \
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

    # CHECK: ONCOTREE_CODE
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicalDF['ONCOTREE_CODE'] = \
            clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()
        oncotree_codes = clinicalDF['ONCOTREE_CODE'][
            clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]
        if not all(oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])]
            total_error += (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(len(unmapped_oncotrees), ",".join(
                    set(unmapped_oncotrees))))
        # NOTE: the original also tested `'oncotree_mapping_dict' in
        # locals()`, which is always True here (it is assigned
        # unconditionally above), so that guard was dropped.
        if process_functions.checkColExist(clinicalDF, "SEX") and \
                havePatientColumn and \
                haveSampleColumn:
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                             clinicalDF['PATIENT_ID'],
                                             clinicalDF['SAMPLE_ID']):
                if oncotree_mapping_dict.get(code) is not None and \
                        sum(clinicalDF['PATIENT_ID'] == patient) > 0:
                    # Hoist the repeated primary-node lookup.
                    primary_node = oncotree_mapping_dict[code][
                        'ONCOTREE_PRIMARY_NODE']
                    sex = clinicalDF['SEX'][clinicalDF['PATIENT_ID'] ==
                                            patient].values[0]
                    # SEX codes: 1.0 = male, 2.0 = female -- TODO confirm
                    # against the syn7434222 mapping table.
                    sex = float('nan') if sex == '' else float(sex)
                    if primary_node in maleOncoCodes and sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if primary_node in womenOncoCodes and sex != 2.0:
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning += (
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)))
    else:
        total_error += \
            "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

    # CHECK: SAMPLE_TYPE
    # NOTE(review): `warn` is discarded here (only `error` is kept),
    # unlike the race/sex/ethnicity checks below -- confirm intentional.
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SAMPLE_TYPE",
        sampleType_mapping['CODE'].tolist(),
        "Sample Clinical File",
        required=True)
    total_error += error

    # CHECK: SEQ_ASSAY_ID
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicalDF['SEQ_ASSAY_ID']]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n")
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        seqAssayIds = \
            clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
        allSeqAssays = seqAssayIds.unique()
        not_caps = []
        for seqassay in allSeqAssays:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                not_caps.append(seqassay)
        if len(not_caps) > 0:
            total_error += ("Sample Clinical File: Please make sure your "
                            "SEQ_ASSAY_IDs start with your center "
                            "abbreviation: {}.\n".format(
                                ", ".join(not_caps)))
    else:
        total_error += \
            "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

    # CHECK: SEQ_DATE
    haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n")
    if haveColumn:
        # Normalize case so e.g. 'JAN-2017' and 'jan-2017' both parse.
        clinicalDF['SEQ_DATE'] = [
            i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
        ]
        seqDate = clinicalDF['SEQ_DATE'][
            clinicalDF['SEQ_DATE'] != 'Release']
        if sum(clinicalDF['SEQ_DATE'] == '') > 0:
            total_error += (
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n")
        try:
            if not seqDate.empty:
                # Called only for the ValueError side effect when a
                # date is not in '%b-%Y' form; result is not needed.
                seqDate.apply(
                    lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                # REMOVE JUN LATER
                if not all([
                        i.startswith(("Jul", "Jan", "Oct", "Apr"))
                        for i in seqDate
                ]):
                    total_error += seq_date_error
        except ValueError:
            total_error += seq_date_error
    else:
        total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

    # CHECK: BIRTH_YEAR
    birth_year = "BIRTH_YEAR"
    haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
    if haveColumn:
        birth_year_df = \
            clinicalDF[~clinicalDF[birth_year].isin(['Unknown'])]
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there are
        # instances that have <YYYY
        birth_year_df[birth_year] = \
            remove_greaterthan_lessthan_str(birth_year_df[birth_year])
        try:
            # Fails (and is reported below) on non-integer years or
            # years in the future.
            years = birth_year_df[birth_year].apply(
                lambda x: datetime.datetime.strptime(str(int(
                    x)), '%Y').year > datetime.datetime.utcnow().year)
            assert not years.any()
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "BIRTH_YEAR column, it must be an integer in YYYY format "
                "> {year} or 'Unknown'.\n".format(
                    year=datetime.datetime.utcnow().year))
    else:
        total_error += \
            "Patient Clinical File: Must have BIRTH_YEAR column.\n"

    # CHECK: VITAL_STATUS
    # YEAR DEATH
    haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
            ['Unknown', 'Not Collected', 'Not Applicable'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_DEATH column, it must be an integer in YYYY format, "
                "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_DEATH column.\n"

    # YEAR CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF,
                                                 "YEAR_CONTACT")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_CONTACT[
            ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_CONTACT column, it must be an integer in YYYY "
                "format, 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_CONTACT column.\n"

    # INT CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_CONTACT if i not in
                ['>32485', '<6570', 'Unknown', 'Not Collected']
        ]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_CONTACT column.\n"

    # INT DOD
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_DOD if i not in [
                    '>32485', '<6570', 'Unknown', 'Not Collected',
                    'Not Applicable'
                ]
        ]):
            total_error += (
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_DOD column.\n"

    haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all([
                str(i).upper() in ['TRUE', 'FALSE']
                for i in clinicalDF.DEAD
                if i not in ['Unknown', 'Not Collected']
        ]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown' or "
                "'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have DEAD column.\n"

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF, "PRIMARY_RACE", race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF, "SECONDARY_RACE", race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF, "TERTIARY_RACE", race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SEX",
        sex_mapping['CODE'].tolist(),
        "Patient Clinical File",
        required=True)
    warning += warn
    total_error += error

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicalDF, "ETHNICITY", ethnicity_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    return (total_error, warning)
def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                  parentId, oncotreeLink):
    """
    Process a clinical file and push its rows into the Synapse patient
    and/or sample database tables.

    Whether the file is treated as patient data, sample data, or both
    is inferred from the file name ("patient"/"sample" substring; any
    other name is treated as a combined file).

    Args:
        filePath: Path to the tab-delimited clinical file to process.
        databaseToSynIdMappingDf: Mapping dataframe with 'Database' and
            'Id' columns used to look up the patient/sample table
            synapse ids.
        newPath: Path where the processed clinical file is written.
        parentId: Synapse parent id passed to uploadMissingData.
        oncotreeLink: Link to oncotree, used to drop samples whose
            ONCOTREE_CODE does not map.

    Returns:
        newPath, the path the processed file was written to.
    """
    # Look up destination table synapse ids from the mapping dataframe.
    patientSynId = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == "patient"][0]
    sampleSynId = databaseToSynIdMappingDf.Id[
        databaseToSynIdMappingDf['Database'] == "sample"][0]

    clinicalDf = pd.read_csv(filePath, sep="\t", comment="#")

    patient = False
    sample = False
    # These synapse ids for the clinical tier release scope is
    # hardcoded because it never changes
    patientColsTable = self.syn.tableQuery(
        'select fieldName from syn8545211 where patient is '
        'True and inClinicalDb is True')
    patientCols = patientColsTable.asDataFrame()['fieldName'].tolist()
    sampleColsTable = self.syn.tableQuery(
        'select fieldName from syn8545211 where sample is True '
        'and inClinicalDb is True')
    sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()

    # Decide which template (and which uploads) apply based on the
    # file name; anything not named patient/sample is treated as both.
    if "patient" in filePath.lower():
        clinicalTemplate = pd.DataFrame(columns=patientCols)
        patient = True
    elif "sample" in filePath.lower():
        clinicalTemplate = pd.DataFrame(columns=sampleCols)
        sample = True
    else:
        clinicalTemplate = pd.DataFrame(columns=set(patientCols +
                                                    sampleCols))
        sample = True
        patient = True

    # _process normalizes the raw clinical dataframe into the template
    # column layout (defined elsewhere in this class).
    newClinicalDf = self._process(clinicalDf, clinicalTemplate)

    if patient:
        patientClinical = newClinicalDf[patientCols].drop_duplicates(
            "PATIENT_ID")
        self.uploadMissingData(patientClinical, "PATIENT_ID",
                               patientSynId, parentId)
        # retractedPatientSynId)
        process_functions.updateData(self.syn,
                                     patientSynId,
                                     patientClinical,
                                     self.center,
                                     col=patientCols,
                                     toDelete=True)
    if sample:
        # Duplicates are only logged; drop_duplicates below removes them.
        if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
            logger.error("There are duplicated samples, "
                         "and the duplicates are removed")
        sampleClinical = newClinicalDf[sampleCols].drop_duplicates(
            "SAMPLE_ID")
        # Exclude all clinical samples with wrong oncotree codes
        oncotree_mapping = pd.DataFrame()
        oncotree_mapping_dict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        # Add in unknown key for oncotree code
        oncotree_mapping_dict['UNKNOWN'] = {}
        oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()
        # Make oncotree codes uppercase (SpCC/SPCC)
        sampleClinical['ONCOTREE_CODE'] = sampleClinical[
            'ONCOTREE_CODE'].astype(str).str.upper()
        # Keep only samples whose code exists in the mapping (or is
        # UNKNOWN, added above); unmapped samples are silently dropped.
        sampleClinical = sampleClinical[
            sampleClinical['ONCOTREE_CODE'].isin(
                oncotree_mapping['ONCOTREE_CODE'])]
        self.uploadMissingData(sampleClinical, "SAMPLE_ID", sampleSynId,
                               parentId)
        # ,retractedSampleSynId)
        process_functions.updateData(self.syn,
                                     sampleSynId,
                                     sampleClinical,
                                     self.center,
                                     col=sampleCols,
                                     toDelete=True)

    newClinicalDf.to_csv(newPath, sep="\t", index=False)
    return (newPath)
def update_oncotree_code_tables(syn, database_mappingdf):
    """
    Updates database statistics of oncotree codes
    and primary oncotree codes

    Builds two codes-by-center count matrices from the clinical sample
    table (syn7517674) and pushes them into the 'oncotree' and
    'primaryCode' distribution tables via
    process_functions.updateDatabase.

    Args:
        syn: synapse object
        database_mappingdf: mapping between synapse ids and database;
            must contain rows for 'oncotree', 'oncotreeLink' and
            'primaryCode'
    """
    oncotree_distribution_synid = database_mappingdf["Id"][
        database_mappingdf["Database"] == "oncotree"].values[0]

    # syn7517674 is queried for CENTER and ONCOTREE_CODE per sample --
    # presumably the consortium clinical sample table; verify.
    clinical = syn.tableQuery("select * from syn7517674")
    clinicaldf = clinical.asDataFrame()

    # DISTRIBUTION OF ONCOTREE CODE TABLE UPDATE
    # One row per observed oncotree code, one column per center,
    # cell = number of samples at that center with that code.
    oncotree_code_distributiondf = pd.DataFrame(
        columns=set(clinicaldf["CENTER"]),
        index=set(clinicaldf["ONCOTREE_CODE"]))
    for center in oncotree_code_distributiondf.columns:
        onc_counts = clinicaldf["ONCOTREE_CODE"][clinicaldf["CENTER"] ==
                                                 center].value_counts()
        oncotree_code_distributiondf[center] = onc_counts
    # Codes never seen at a center come back NaN; report them as 0.
    oncotree_code_distributiondf = oncotree_code_distributiondf.fillna(0)
    oncotree_code_distributiondf = oncotree_code_distributiondf.applymap(int)
    oncotree_code_distributiondf["Total"] = oncotree_code_distributiondf.apply(
        sum, axis=1)
    oncotree_code_distributiondf[
        "Oncotree_Code"] = oncotree_code_distributiondf.index

    # Pull the current table state so updateDatabase can diff against it.
    oncotree_distribution_db = syn.tableQuery("SELECT %s FROM %s" % (
        "Oncotree_Code," + ",".join(clinicaldf["CENTER"].unique()) + ",Total",
        oncotree_distribution_synid,
    ))
    oncotree_distribution_dbdf = oncotree_distribution_db.asDataFrame()
    process_functions.updateDatabase(
        syn,
        oncotree_distribution_dbdf,
        oncotree_code_distributiondf,
        oncotree_distribution_synid,
        ["Oncotree_Code"],
        to_delete=True,
    )

    # DISTRIBUTION OF PRIMARY CODE TABLE UPDATE
    oncotree_link_synid = database_mappingdf["Id"][
        database_mappingdf["Database"] == "oncotreeLink"].values[0]
    primary_code_synId = database_mappingdf["Id"][
        database_mappingdf["Database"] == "primaryCode"].values[0]

    # Can also use most up to date oncotree code,
    # because these tables are updated from the database
    oncotree_link_ent = syn.get(oncotree_link_synid)
    oncotree_link = oncotree_link_ent.externalURL
    oncotree_mapping = process_functions.get_oncotree_code_mappings(
        oncotree_link)
    # Map each sample's code to its primary (top-level) node; codes no
    # longer present in oncotree are labelled DEPRECATED_CODE.
    clinicaldf["PRIMARY_CODES"] = [
        oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
        if i.upper() in oncotree_mapping.keys() else "DEPRECATED_CODE"
        for i in clinicaldf.ONCOTREE_CODE
    ]

    # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE
    # Same matrix construction as above, keyed on the primary codes.
    primary_code_distributiondf = pd.DataFrame(
        columns=set(clinicaldf["CENTER"]),
        index=set(clinicaldf["PRIMARY_CODES"]))
    for center in primary_code_distributiondf.columns:
        onc_counts = clinicaldf["PRIMARY_CODES"][clinicaldf["CENTER"] ==
                                                 center].value_counts()
        primary_code_distributiondf[center] = onc_counts
    primary_code_distributiondf = primary_code_distributiondf.fillna(0)
    primary_code_distributiondf = primary_code_distributiondf.applymap(int)
    primary_code_distributiondf["Total"] = primary_code_distributiondf.apply(
        sum, axis=1)
    primary_code_distributiondf[
        "Oncotree_Code"] = primary_code_distributiondf.index
    primary_code_dist_db = syn.tableQuery("SELECT %s FROM %s" % (
        "Oncotree_Code," + ",".join(clinicaldf["CENTER"].unique()) + ",Total",
        primary_code_synId,
    ))
    primary_code_dist_dbdf = primary_code_dist_db.asDataFrame()
    process_functions.updateDatabase(
        syn,
        primary_code_dist_dbdf,
        primary_code_distributiondf,
        primary_code_synId,
        ["Oncotree_Code"],
        to_delete=True,
    )