def _validate(self, vitalStatusDf):
    total_error = ""
    warning = ""

    # PATIENT ID
    haveColumn = process_functions.checkColExist(vitalStatusDf, "PATIENT_ID")
    if haveColumn:
        if vitalStatusDf.PATIENT_ID.isnull().any():
            total_error += ("Vital status file: Please double check your "
                            "PATIENT_ID column. No null values allowed.\n")
    else:
        total_error += "Vital status file: Must have PATIENT_ID column.\n"

    # YEAR DEATH
    haveColumn = process_functions.checkColExist(vitalStatusDf, "YEAR_DEATH")
    if haveColumn:
        notNullYears = vitalStatusDf.YEAR_DEATH[~vitalStatusDf.YEAR_DEATH.isnull()]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += ("Vital status file: Please double check your "
                            "YEAR_DEATH column, it must be an integer in "
                            "YYYY format or an empty string.\n")
    else:
        total_error += "Vital status file: Must have YEAR_DEATH column.\n"

    # YEAR CONTACT
    haveColumn = process_functions.checkColExist(vitalStatusDf, "YEAR_CONTACT")
    if haveColumn:
        notNullYears = vitalStatusDf.YEAR_CONTACT[~vitalStatusDf.YEAR_CONTACT.isnull()]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += ("Vital status file: Please double check your "
                            "YEAR_CONTACT column, it must be an integer in "
                            "YYYY format or an empty string.\n")
    else:
        total_error += "Vital status file: Must have YEAR_CONTACT column.\n"

    # INT CONTACT
    haveColumn = process_functions.checkColExist(vitalStatusDf, "INT_CONTACT")
    if haveColumn:
        # notNullContact = \
        #     vitalStatusDf.INT_CONTACT[~vitalStatusDf.INT_CONTACT.isnull()]
        if not all([process_functions.checkInt(i)
                    for i in vitalStatusDf.INT_CONTACT
                    if not pd.isnull(i) and i not in ['>32485', '<6570']]):
            total_error += ("Vital status file: Please double check your "
                            "INT_CONTACT column, it must be an integer, an "
                            "empty string, >32485, or <6570.\n")
    else:
        total_error += "Vital status file: Must have INT_CONTACT column.\n"

    # INT DOD
    haveColumn = process_functions.checkColExist(vitalStatusDf, "INT_DOD")
    if haveColumn:
        if not all([process_functions.checkInt(i)
                    for i in vitalStatusDf.INT_DOD
                    if not pd.isnull(i) and i not in ['>32485', '<6570']]):
            total_error += ("Vital status file: Please double check your "
                            "INT_DOD column, it must be an integer, an "
                            "empty string, >32485, or <6570.\n")
    else:
        total_error += "Vital status file: Must have INT_DOD column.\n"

    # DEAD
    haveColumn = process_functions.checkColExist(vitalStatusDf, "DEAD")
    if haveColumn:
        if not all([isinstance(i, bool)
                    for i in vitalStatusDf.DEAD if not pd.isnull(i)]):
            total_error += ("Vital status file: Please double check your "
                            "DEAD column, it must be a boolean value or an "
                            "empty string.\n")
    else:
        total_error += "Vital status file: Must have DEAD column.\n"

    return total_error, warning
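# --- Hypothetical, standalone sketch (not part of the validator above) ---
# Illustrates the YEAR_DEATH / YEAR_CONTACT check: every non-null value must
# parse as a four-digit YYYY year. All values below are made up.
import datetime

import pandas as pd

example_years = pd.Series([1995, 2010.0, None, "19x5"])
non_null_years = example_years[~example_years.isnull()]
try:
    non_null_years.apply(lambda x: datetime.datetime.strptime(str(int(x)), "%Y"))
    print("YEAR column parses as YYYY")
except (ValueError, TypeError):
    print("YEAR column contains a value that is not in YYYY format")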
def update_clinical(self, row):
    """Transform the values of each row of the clinical file"""
    # Must create a copy or else it will overwrite the original row
    x = row.copy()

    # PATIENT ID
    if x.get("PATIENT_ID") is not None:
        x["PATIENT_ID"] = process_functions.checkGenieId(
            x["PATIENT_ID"], self.center
        )
    # BIRTH YEAR (check if integer)
    if x.get("BIRTH_YEAR") is not None:
        if process_functions.checkInt(x["BIRTH_YEAR"]):
            x["BIRTH_YEAR"] = int(x["BIRTH_YEAR"])

    # SAMPLE ID
    if x.get("SAMPLE_ID") is not None:
        x["SAMPLE_ID"] = process_functions.checkGenieId(x["SAMPLE_ID"], self.center)

    # AGE AT SEQ REPORT
    if x.get("AGE_AT_SEQ_REPORT") is not None:
        if process_functions.checkInt(x["AGE_AT_SEQ_REPORT"]):
            x["AGE_AT_SEQ_REPORT"] = int(x["AGE_AT_SEQ_REPORT"])

    # SEQ ASSAY ID
    if x.get("SEQ_ASSAY_ID") is not None:
        x["SEQ_ASSAY_ID"] = x["SEQ_ASSAY_ID"].replace("_", "-")
        # Standardize all SEQ_ASSAY_IDs to uppercase
        x["SEQ_ASSAY_ID"] = x["SEQ_ASSAY_ID"].upper()

    # SEQ DATE
    if x.get("SEQ_DATE") is not None:
        x["SEQ_DATE"] = x["SEQ_DATE"].title()
        x["SEQ_YEAR"] = (
            int(str(x["SEQ_DATE"]).split("-")[1])
            if str(x["SEQ_DATE"]) != "Release"
            else float("nan")
        )

    # YEAR CONTACT
    if x.get("YEAR_CONTACT") is not None:
        if process_functions.checkInt(x["YEAR_CONTACT"]):
            x["YEAR_CONTACT"] = int(x["YEAR_CONTACT"])

    # YEAR DEATH
    if x.get("YEAR_DEATH") is not None:
        if process_functions.checkInt(x["YEAR_DEATH"]):
            x["YEAR_DEATH"] = int(x["YEAR_DEATH"])

    # Trim leading/trailing spaces from every string value
    for i in x.keys():
        if isinstance(x[i], str):
            x[i] = x[i].strip(" ")
    return x
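# --- Hypothetical, standalone sketch (not part of the transform above) ---
# Shows the SEQ_DATE -> SEQ_YEAR derivation used in update_clinical:
# quarter labels like "Jan-2017" yield the year, "Release" yields NaN.
for raw_seq_date in ["Jan-2017", "apr-2018", "Release"]:
    seq_date = raw_seq_date.title()
    seq_year = (
        int(str(seq_date).split("-")[1])
        if str(seq_date) != "Release"
        else float("nan")
    )
    print(seq_date, seq_year)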
def _validate(self, assay_info_df):
    '''
    Validates the values of the assay information file

    Args:
        assay_info_df: assay information dataframe

    Returns:
        tuple: error and warning
    '''
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        all_seq_assays = assay_info_df.SEQ_ASSAY_ID.unique()
        if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
            total_error += \
                "Assay_information.yaml: Please make sure all your" +\
                " SEQ_ASSAY_IDs start with your center abbreviation.\n"
    else:
        total_error += \
            "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary(
        "read_group")
    read_group_headers = read_group_dict['properties']

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'is_paired_end',
        [True, False],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'library_selection',
        read_group_headers['library_selection']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'library_strategy',
        read_group_headers['library_strategy']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'platform',
        read_group_headers['platform']['enum'],
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    instrument_model = read_group_headers['instrument_model']['enum']
    instrument_model.append(None)
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'instrument_model',
        instrument_model,
        filename="Assay_information.yaml",
        required=True)
    warning += warn
    total_error += error

    variant_classes = \
        ['Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del',
         'Frame_Shift_Ins', 'Nonstop_Mutation', 'Translation_Start_Site',
         'In_Frame_Ins', 'In_Frame_Del', 'Missense_Mutation',
         'Intron', 'Splice_Region', 'Silent', 'RNA', "5'UTR", "3'UTR",
         'IGR', "5'Flank", "3'Flank", None]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        'variant_classifications',
        variant_classes,
        filename="Assay_information.yaml",
        na_allowed=True)
    warning += warn
    total_error += error

    # if not process_functions.checkColExist(
    #         assay_info_df, "target_capture_kit"):
    #     total_error += ("Assay_information.yaml: "
    #                     "Must have target_capture_kit column.\n")

    if process_functions.checkColExist(assay_info_df, "read_length"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["read_length"]
                if i is not None and not pd.isnull(i)]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your read_length. "
                 "It must be an integer or null.\n")
    else:
        total_error += \
            ("Assay_information.yaml: "
             "Must have read_length column.\n")

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["number_of_genes"]]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your number_of_genes. "
                 "It must be an integer.\n")
    else:
        total_error += \
            ("Assay_information.yaml: "
             "Must have number_of_genes column.\n")

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["gene_padding"]
                if i is not None and not pd.isnull(i)]):
            total_error += \
                ("Assay_information.yaml: "
                 "Please double check your gene_padding. "
                 "It must be an integer or blank.\n")
    else:
        warning += \
            ("Assay_information.yaml: "
             "gene_padding is by default 10 if not specified.\n")

    return (total_error, warning)
def _validate(self, clinicaldf, oncotree_link):
    """
    This function validates the clinical file to make sure it adheres
    to the clinical SOP.

    Args:
        clinicaldf: Merged clinical file with patient and sample
            information
        oncotree_link: Link to oncotree

    Returns:
        Error message
    """
    total_error = StringIO()
    warning = StringIO()
    clinicaldf.columns = [col.upper() for col in clinicaldf.columns]

    # CHECK: for empty rows
    empty_rows = clinicaldf.isnull().values.all(axis=1)
    if empty_rows.any():
        total_error.write("Clinical file(s): No empty rows allowed.\n")
        # Remove completely empty rows to speed up processing
        clinicaldf = clinicaldf[~empty_rows]

    clinicaldf = clinicaldf.fillna("")
    oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
        oncotree_link
    )
    oncotree_mapping = pd.DataFrame(
        {"ONCOTREE_CODE": list(oncotree_mapping_dict.keys())}
    )
    sampletype_mapping = process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sample_id = "SAMPLE_ID"
    haveSampleColumn = process_functions.checkColExist(clinicaldf, sample_id)

    if not haveSampleColumn:
        total_error.write("Sample Clinical File: Must have SAMPLE_ID column.\n")
    else:
        if sum(clinicaldf[sample_id].duplicated()) > 0:
            total_error.write(
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n"
            )

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = process_functions.checkColExist(clinicaldf, patientId)

    if not havePatientColumn:
        total_error.write("Patient Clinical File: Must have PATIENT_ID column.\n")

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicaldf[sample_id] = clinicaldf[sample_id].astype(str)
        clinicaldf[patientId] = clinicaldf[patientId].astype(str)
        if not all(
            [
                patient in sample
                for sample, patient in zip(
                    clinicaldf[sample_id], clinicaldf[patientId]
                )
            ]
        ):
            total_error.write(
                "Sample Clinical File: PATIENT_IDs must be contained in "
                "the SAMPLE_IDs (ex. SAGE-1 <-> SAGE-1-2)\n"
            )

        # CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicaldf[patientId] != ""):
            total_error.write(
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicaldf[sample_id][clinicaldf[patientId] == ""].unique()
                    )
                )
            )
        # CHECK: All patients should have associated sample data
        if not all(clinicaldf[sample_id] != ""):
            # MAKE WARNING FOR NOW
            warning.write(
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(
                    ", ".join(
                        clinicaldf[patientId][clinicaldf[sample_id] == ""].unique()
                    )
                )
            )

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicaldf, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = clinicaldf[
            ~clinicaldf[age].isin(["Unknown", ">32485", "<6570"])
        ]

        # age_seq_report_df[age] = \
        #     remove_greaterthan_lessthan_str(age_seq_report_df[age])

        if not all([process_functions.checkInt(i) for i in age_seq_report_df[age]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', "
                "'>32485', '<6570'.\n"
            )
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            median_age = age_seq_report_df[age].median()
            if median_age < 100:
                total_error.write(
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n"
                )
    else:
        total_error.write(
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"
        )

    # CHECK: ONCOTREE_CODE
    haveColumn = process_functions.checkColExist(clinicaldf, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )

        oncotree_codes = clinicaldf["ONCOTREE_CODE"][
            clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
        ]

        if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
            ]
            total_error.write(
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees)),
                )
            )
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
            and "oncotree_mapping_dict" in locals()
            and havePatientColumn
            and haveSampleColumn
        ):
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(
                clinicaldf["ONCOTREE_CODE"],
                clinicaldf["PATIENT_ID"],
                clinicaldf["SAMPLE_ID"],
            ):
                if (
                    oncotree_mapping_dict.get(code) is not None
                    and sum(clinicaldf["PATIENT_ID"] == patient) > 0
                ):
                    primaryCode = oncotree_mapping_dict[code][
                        "ONCOTREE_PRIMARY_NODE"
                    ]

                    sex = clinicaldf["SEX"][
                        clinicaldf["PATIENT_ID"] == patient
                    ].values[0]
                    sex = float("nan") if sex == "" else float(sex)
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in maleOncoCodes
                        and sex != 1.0
                    ):
                        wrongCodeSamples.append(sample)
                    if (
                        oncotree_mapping_dict[code]["ONCOTREE_PRIMARY_NODE"]
                        in womenOncoCodes
                        and sex != 2.0
                    ):
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning.write(
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)
                    )
                )
    else:
        total_error.write("Sample Clinical File: Must have ONCOTREE_CODE column.\n")

    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SAMPLE_TYPE",
        sampletype_mapping["CODE"].tolist(),
        "Sample Clinical File",
        required=True,
    )
    total_error.write(error)

    # CHECK: SEQ_ASSAY_ID
    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicaldf["SEQ_ASSAY_ID"]]):
            total_error.write(
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n"
            )
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        empty_seq_idx = clinicaldf.SEQ_ASSAY_ID != ""
        seqassay_ids = clinicaldf.SEQ_ASSAY_ID[empty_seq_idx]
        uniq_seqassay_ids = seqassay_ids.unique()
        invalid_seqassay = []
        for seqassay in uniq_seqassay_ids:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                invalid_seqassay.append(seqassay)
        if invalid_seqassay:
            total_error.write(
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(invalid_seqassay))
            )
    else:
        total_error.write("Sample Clinical File: Must have SEQ_ASSAY_ID column.\n")

    haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n"
    )

    if haveColumn:
        clinicaldf["SEQ_DATE"] = [
            i.title() for i in clinicaldf["SEQ_DATE"].astype(str)
        ]

        seqdate = clinicaldf["SEQ_DATE"][clinicaldf["SEQ_DATE"] != "Release"]
        if sum(clinicaldf["SEQ_DATE"] == "") > 0:
            total_error.write(
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n"
            )
        try:
            if not seqdate.empty:
                seqdate.apply(
                    lambda date: datetime.datetime.strptime(date, "%b-%Y")
                )
                if not seqdate.str.startswith(("Jan", "Apr", "Jul", "Oct")).all():
                    total_error.write(seq_date_error)
        except ValueError:
            total_error.write(seq_date_error)
    else:
        total_error.write("Sample Clinical File: Must have SEQ_DATE column.\n")

    # CHECK: BIRTH_YEAR
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="BIRTH_YEAR",
        filename="Patient Clinical File",
        allowed_string_values=["Unknown", ">89", "<18"],
    )
    total_error.write(error)

    # CHECK: YEAR DEATH
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_DEATH",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Applicable",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: YEAR CONTACT
    error = _check_year(
        clinicaldf=clinicaldf,
        year_col="YEAR_CONTACT",
        filename="Patient Clinical File",
        allowed_string_values=[
            "Unknown",
            "Not Collected",
            "Not Released",
            ">89",
            "<18",
        ],
    )
    total_error.write(error)

    # CHECK: INT CONTACT
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_CONTACT
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_CONTACT column.\n")

    # CHECK: INT DOD
    haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD")
    if haveColumn:
        if not all(
            [
                process_functions.checkInt(i)
                for i in clinicaldf.INT_DOD
                if i
                not in [
                    ">32485",
                    "<6570",
                    "Unknown",
                    "Not Collected",
                    "Not Applicable",
                    "Not Released",
                ]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected', 'Not Released' or "
                "'Not Applicable'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have INT_DOD column.\n")

    haveColumn = process_functions.checkColExist(clinicaldf, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all(
            [
                str(i).upper() in ["TRUE", "FALSE"]
                for i in clinicaldf.DEAD
                if i not in ["Unknown", "Not Collected", "Not Released"]
            ]
        ):
            total_error.write(
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown', "
                "'Not Released' or 'Not Collected'.\n"
            )
    else:
        total_error.write("Patient Clinical File: Must have DEAD column.\n")

    # CHECK: contact vital status value consistency
    contact_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_CONTACT", "INT_CONTACT"],
        string_vals=["Not Collected", "Unknown", "Not Released"],
    )
    total_error.write(contact_error)

    # CHECK: death vital status value consistency
    death_error = _check_int_year_consistency(
        clinicaldf=clinicaldf,
        cols=["YEAR_DEATH", "INT_DOD"],
        string_vals=[
            "Not Collected",
            "Unknown",
            "Not Applicable",
            "Not Released",
        ],
    )
    total_error.write(death_error)

    death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
    total_error.write(death_error)

    # CHECK: SAMPLE_CLASS is an optional attribute
    have_column = process_functions.checkColExist(clinicaldf, "SAMPLE_CLASS")
    if have_column:
        sample_class_vals = pd.Series(clinicaldf["SAMPLE_CLASS"].unique().tolist())
        if not sample_class_vals.isin(["Tumor", "cfDNA"]).all():
            total_error.write(
                "Sample Clinical File: SAMPLE_CLASS column must "
                "be 'Tumor', or 'cfDNA'\n"
            )

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "PRIMARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SECONDARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "TERTIARY_RACE",
        race_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "SEX",
        sex_mapping["CODE"].tolist(),
        "Patient Clinical File",
        required=True,
    )
    warning.write(warn)
    total_error.write(error)

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicaldf,
        "ETHNICITY",
        ethnicity_mapping["CODE"].tolist(),
        "Patient Clinical File",
    )
    warning.write(warn)
    total_error.write(error)

    return total_error.getvalue(), warning.getvalue()
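# --- Hypothetical, standalone sketch (not part of the validator above) ---
# Illustrates the AGE_AT_SEQ_REPORT heuristic: after dropping the allowed
# string codes, a median under 100 suggests ages were reported in years
# instead of days. str.isdigit() stands in for process_functions.checkInt.
import pandas as pd

ages = pd.Series(["25550", "Unknown", ">32485", "10950"])
numeric_ages = ages[~ages.isin(["Unknown", ">32485", "<6570"])]
if not numeric_ages.str.isdigit().all():
    print("AGE_AT_SEQ_REPORT has non-integer values")
elif numeric_ages.astype(int).median() < 100:
    print("values look like years; GENIE expects days")
else:
    print("values look like days")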
def update_clinical(self, x, sex_mapping, race_mapping, ethnicity_mapping,
                    sample_type):
    # PATIENT ID
    if x.get("PATIENT_ID") is not None:
        x['PATIENT_ID'] = process_functions.checkGenieId(
            x['PATIENT_ID'], self.center)

    # RACE
    if x.get('PRIMARY_RACE') is not None:
        x['PRIMARY_RACE'] = process_functions.getCODE(
            race_mapping, x['PRIMARY_RACE'])
    else:
        x['PRIMARY_RACE'] = "Not Collected"

    if x.get('SECONDARY_RACE') is not None:
        x['SECONDARY_RACE'] = process_functions.getCODE(
            race_mapping, x['SECONDARY_RACE'])
    else:
        x['SECONDARY_RACE'] = "Not Collected"

    if x.get('TERTIARY_RACE') is not None:
        x['TERTIARY_RACE'] = process_functions.getCODE(
            race_mapping, x['TERTIARY_RACE'])
    else:
        x['TERTIARY_RACE'] = "Not Collected"

    # ETHNICITY
    if x.get('ETHNICITY') is not None:
        x['ETHNICITY'] = process_functions.getCODE(ethnicity_mapping,
                                                   x['ETHNICITY'])
    else:
        x['ETHNICITY'] = "Not Collected"

    # BIRTH YEAR (check if integer)
    if x.get("BIRTH_YEAR") is not None:
        if process_functions.checkInt(x['BIRTH_YEAR']):
            x['BIRTH_YEAR'] = int(x['BIRTH_YEAR'])

    # SEX
    if x.get("SEX") is not None:
        x['SEX'] = process_functions.getCODE(sex_mapping, x['SEX'])

    # SAMPLE ID
    if x.get('SAMPLE_ID') is not None:
        x['SAMPLE_ID'] = process_functions.checkGenieId(
            x['SAMPLE_ID'], self.center)

    # AGE AT SEQ REPORT
    if x.get('AGE_AT_SEQ_REPORT') is not None:
        if process_functions.checkInt(x['AGE_AT_SEQ_REPORT']):
            x['AGE_AT_SEQ_REPORT'] = int(x['AGE_AT_SEQ_REPORT'])

    # SEQ ASSAY ID
    if x.get('SEQ_ASSAY_ID') is not None:
        x['SEQ_ASSAY_ID'] = x['SEQ_ASSAY_ID'].replace('_', '-')
        # Standardize all SEQ_ASSAY_IDs to uppercase
        x['SEQ_ASSAY_ID'] = x['SEQ_ASSAY_ID'].upper()

    # SAMPLE_TYPE
    if x.get('SAMPLE_TYPE') is not None:
        sampleType = x['SAMPLE_TYPE']
        x['SAMPLE_TYPE'] = process_functions.getCODE(
            sample_type, sampleType)
        x['SAMPLE_TYPE_DETAILED'] = process_functions.getCODE(
            sample_type, sampleType, useDescription=True)

    # SEQ DATE
    if x.get('SEQ_DATE') is not None:
        x['SEQ_DATE'] = x['SEQ_DATE'].title()
        x['SEQ_YEAR'] = \
            int(str(x['SEQ_DATE']).split("-")[1]) \
            if str(x['SEQ_DATE']) != "Release" else pd.np.nan

    if x.get('YEAR_CONTACT') is None:
        x['YEAR_CONTACT'] = 'Not Collected'
    else:
        if process_functions.checkInt(x['YEAR_CONTACT']):
            x['YEAR_CONTACT'] = int(x['YEAR_CONTACT'])

    if x.get('YEAR_DEATH') is None:
        x['YEAR_DEATH'] = 'Not Collected'
    else:
        if process_functions.checkInt(x['YEAR_DEATH']):
            x['YEAR_DEATH'] = int(x['YEAR_DEATH'])

    if x.get('INT_CONTACT') is None:
        x['INT_CONTACT'] = 'Not Collected'

    if x.get('INT_DOD') is None:
        x['INT_DOD'] = 'Not Collected'

    if x.get('DEAD') is None:
        x['DEAD'] = 'Not Collected'

    # Trim leading/trailing spaces from every string value
    for i in x.keys():
        if isinstance(x[i], str):
            x[i] = x[i].strip(" ")
    return x
def _validate(self, clinicalDF, oncotreeLink):
    """
    This function validates the clinical file to make sure it adheres
    to the clinical SOP.

    Args:
        clinicalDF: Merged clinical file with patient and sample
            information
        oncotreeLink: Link to oncotree

    Returns:
        Error message
    """
    total_error = ""
    warning = ""

    clinicalDF.columns = [col.upper() for col in clinicalDF.columns]
    clinicalDF = clinicalDF.fillna("")

    # oncotree_mapping = process_functions.get_oncotree_codes(oncotreeLink)
    # if oncotree_mapping.empty:
    oncotree_mapping = pd.DataFrame()
    oncotree_mapping_dict = \
        process_functions.get_oncotree_code_mappings(oncotreeLink)
    oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()

    sampleType_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434273")
    ethnicity_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434242")
    race_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434236")
    sex_mapping = \
        process_functions.getGenieMapping(self.syn, "syn7434222")

    # CHECK: SAMPLE_ID
    sampleId = 'SAMPLE_ID'
    haveSampleColumn = \
        process_functions.checkColExist(clinicalDF, sampleId)

    if not haveSampleColumn:
        total_error += \
            "Sample Clinical File: Must have SAMPLE_ID column.\n"
    else:
        if sum(clinicalDF[sampleId].duplicated()) > 0:
            total_error += (
                "Sample Clinical File: No duplicated SAMPLE_ID "
                "allowed.\nIf there are no duplicated "
                "SAMPLE_IDs, and both sample and patient files are "
                "uploaded, then please check to make sure no duplicated "
                "PATIENT_IDs exist in the patient clinical file.\n")

    # CHECK: PATIENT_ID
    patientId = "PATIENT_ID"
    # CHECK: PATIENT_ID IN SAMPLE FILE
    havePatientColumn = \
        process_functions.checkColExist(clinicalDF, patientId)

    if not havePatientColumn:
        total_error += \
            "Patient Clinical File: Must have PATIENT_ID column.\n"

    # CHECK: within the sample file that the sample ids match
    # the patient ids
    if haveSampleColumn and havePatientColumn:
        # Make sure sample and patient ids are string cols
        clinicalDF[sampleId] = clinicalDF[sampleId].astype(str)
        clinicalDF[patientId] = clinicalDF[patientId].astype(str)
        if not all([
                patient in sample
                for sample, patient in zip(
                    clinicalDF[sampleId], clinicalDF[patientId])]):
            total_error += (
                "Sample Clinical File: PATIENT_IDs must be contained in "
                "the SAMPLE_IDs (ex. SAGE-1 <-> SAGE-1-2)\n")

        # CHECK: All samples must have associated patient data
        # (GENIE requires patient data)
        if not all(clinicalDF[patientId] != ""):
            total_error += (
                "Patient Clinical File: All samples must have associated "
                "patient information and no null patient ids allowed. "
                "These samples are missing patient data: {}\n".format(
                    ", ".join(
                        clinicalDF[sampleId][clinicalDF[patientId] == ""])))

        # CHECK: All patients should have associated sample data
        if not all(clinicalDF[sampleId] != ""):
            # MAKE WARNING FOR NOW
            warning += (
                "Sample Clinical File: All patients must have associated "
                "sample information. These patients are missing sample "
                "data: {}\n".format(", ".join(
                    clinicalDF[patientId][clinicalDF[sampleId] == ""])))

    # CHECK: AGE_AT_SEQ_REPORT
    age = "AGE_AT_SEQ_REPORT"
    haveColumn = process_functions.checkColExist(clinicalDF, age)
    if haveColumn:
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there
        # are instances that have <3435
        age_seq_report_df = \
            clinicalDF[~clinicalDF[age].isin(['Unknown'])]
        age_seq_report_df[age] = \
            remove_greaterthan_lessthan_str(age_seq_report_df[age])

        if not all([
                process_functions.checkInt(i)
                for i in age_seq_report_df[age]]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "AGE_AT_SEQ_REPORT. It must be an integer or 'Unknown'.\n")
        else:
            age_seq_report_df[age] = age_seq_report_df[age].astype(int)
            median_age = pd.np.median(age_seq_report_df[age])
            if median_age < 100:
                total_error += (
                    "Sample Clinical File: Please double check your "
                    "AGE_AT_SEQ_REPORT. You may be reporting this value "
                    "in YEARS, please report in DAYS.\n")
    else:
        total_error += \
            "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n"

    # CHECK: ONCOTREE_CODE
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "ONCOTREE_CODE")
    maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
    womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
    if haveColumn:
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicalDF['ONCOTREE_CODE'] = \
            clinicalDF['ONCOTREE_CODE'].astype(str).str.upper()

        oncotree_codes = clinicalDF['ONCOTREE_CODE'][
            clinicalDF['ONCOTREE_CODE'] != "UNKNOWN"]

        if not all(oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])):
            unmapped_oncotrees = oncotree_codes[
                ~oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])]
            total_error += (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotrees),
                    ",".join(set(unmapped_oncotrees))))

        if process_functions.checkColExist(clinicalDF, "SEX") and \
                'oncotree_mapping_dict' in locals() and \
                havePatientColumn and \
                haveSampleColumn:
            wrongCodeSamples = []
            # This is to check if oncotree codes match the sex,
            # returns list of samples that have conflicting codes and sex
            for code, patient, sample in zip(clinicalDF['ONCOTREE_CODE'],
                                             clinicalDF['PATIENT_ID'],
                                             clinicalDF['SAMPLE_ID']):
                if oncotree_mapping_dict.get(code) is not None and \
                        sum(clinicalDF['PATIENT_ID'] == patient) > 0:
                    primaryCode = oncotree_mapping_dict[code][
                        'ONCOTREE_PRIMARY_NODE']

                    sex = clinicalDF['SEX'][clinicalDF['PATIENT_ID'] ==
                                            patient].values[0]
                    sex = float('nan') if sex == '' else float(sex)
                    if oncotree_mapping_dict[code][
                            'ONCOTREE_PRIMARY_NODE'] in maleOncoCodes and \
                            sex != 1.0:
                        wrongCodeSamples.append(sample)
                    if oncotree_mapping_dict[code][
                            'ONCOTREE_PRIMARY_NODE'] in womenOncoCodes and \
                            sex != 2.0:
                        wrongCodeSamples.append(sample)
            if len(wrongCodeSamples) > 0:
                warning += (
                    "Sample Clinical File: Some SAMPLE_IDs have "
                    "conflicting SEX and ONCOTREE_CODES: {}\n".format(
                        ",".join(wrongCodeSamples)))
    else:
        total_error += \
            "Sample Clinical File: Must have ONCOTREE_CODE column.\n"

    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SAMPLE_TYPE",
        sampleType_mapping['CODE'].tolist(),
        "Sample Clinical File",
        required=True)
    total_error += error

    # CHECK: SEQ_ASSAY_ID
    haveColumn = \
        process_functions.checkColExist(clinicalDF, "SEQ_ASSAY_ID")
    if haveColumn:
        if not all([i != "" for i in clinicalDF['SEQ_ASSAY_ID']]):
            total_error += (
                "Sample Clinical File: Please double check your "
                "SEQ_ASSAY_ID columns, there are empty rows.\n")
        # must remove empty seq assay ids first
        # Checking if seq assay ids start with the center name
        seqAssayIds = \
            clinicalDF.SEQ_ASSAY_ID[clinicalDF.SEQ_ASSAY_ID != ""]
        allSeqAssays = seqAssayIds.unique()
        notNormalized = []
        not_caps = []
        for seqassay in allSeqAssays:
            # SEQ Ids are all capitalized now, so no need to check
            # for differences in case
            if not seqassay.upper().startswith(self.center):
                not_caps.append(seqassay)
        if len(not_caps) > 0:
            total_error += (
                "Sample Clinical File: Please make sure your "
                "SEQ_ASSAY_IDs start with your center "
                "abbreviation: {}.\n".format(", ".join(not_caps)))
    else:
        total_error += \
            "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n"

    haveColumn = process_functions.checkColExist(clinicalDF, "SEQ_DATE")
    seq_date_error = (
        "Sample Clinical File: SEQ_DATE must be one of five values- "
        "For Jan-March: use Jan-YEAR. "
        "For Apr-June: use Apr-YEAR. "
        "For July-Sep: use Jul-YEAR. "
        "For Oct-Dec: use Oct-YEAR. (ie. Apr-2017) "
        "For values that don't have SEQ_DATES that "
        "you want released use 'release'.\n")

    if haveColumn:
        clinicalDF['SEQ_DATE'] = [
            i.title() for i in clinicalDF['SEQ_DATE'].astype(str)
        ]

        seqDate = clinicalDF['SEQ_DATE'][
            clinicalDF['SEQ_DATE'] != 'Release']
        if sum(clinicalDF['SEQ_DATE'] == '') > 0:
            total_error += (
                "Sample Clinical File: Samples without SEQ_DATEs will "
                "NOT be released.\n")
        try:
            if not seqDate.empty:
                dates = seqDate.apply(
                    lambda date: datetime.datetime.strptime(date, '%b-%Y'))
                # REMOVE JUN LATER
                if not all([
                        i.startswith(("Jul", "Jan", "Oct", "Apr"))
                        for i in seqDate]):
                    total_error += seq_date_error
        except ValueError:
            total_error += seq_date_error
    else:
        total_error += "Sample Clinical File: Must have SEQ_DATE column.\n"

    # CHECK: BIRTH_YEAR
    birth_year = "BIRTH_YEAR"
    haveColumn = process_functions.checkColExist(clinicalDF, birth_year)
    if haveColumn:
        birth_year_df = \
            clinicalDF[~clinicalDF[birth_year].isin(['Unknown'])]
        # Deal with HIPAA converted rows from DFCI
        # First for loop can't int(text) because there are
        # instances that have <YYYY
        birth_year_df[birth_year] = \
            remove_greaterthan_lessthan_str(birth_year_df[birth_year])
        try:
            years = birth_year_df[birth_year].apply(
                lambda x: datetime.datetime.strptime(
                    str(int(x)), '%Y').year >
                datetime.datetime.utcnow().year)
            assert not years.any()
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "BIRTH_YEAR column, it must be an integer in YYYY format "
                "> {year} or 'Unknown'.\n".format(
                    year=datetime.datetime.utcnow().year))
    else:
        total_error += \
            "Patient Clinical File: Must have BIRTH_YEAR column.\n"

    # CHECK: VITAL_STATUS
    # YEAR DEATH
    haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_DEATH")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_DEATH[~clinicalDF.YEAR_DEATH.isin(
            ['Unknown', 'Not Collected', 'Not Applicable'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_DEATH column, it must be an integer in YYYY format, "
                "'Unknown', 'Not Applicable' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_DEATH column.\n"

    # YEAR CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF, "YEAR_CONTACT")
    if haveColumn:
        notNullYears = clinicalDF.YEAR_CONTACT[
            ~clinicalDF.YEAR_CONTACT.isin(['Unknown', 'Not Collected'])]
        try:
            notNullYears.apply(
                lambda x: datetime.datetime.strptime(str(int(x)), '%Y'))
        except Exception:
            total_error += (
                "Patient Clinical File: Please double check your "
                "YEAR_CONTACT column, it must be an integer in YYYY "
                "format, 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have YEAR_CONTACT column.\n"

    # INT CONTACT
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_CONTACT")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_CONTACT
                if i not in
                ['>32485', '<6570', 'Unknown', 'Not Collected']]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "INT_CONTACT column, it must be an integer, '>32485', "
                "'<6570', 'Unknown' or 'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_CONTACT column.\n"

    # INT DOD
    haveColumn = process_functions.checkColExist(clinicalDF, "INT_DOD")
    if haveColumn:
        if not all([
                process_functions.checkInt(i)
                for i in clinicalDF.INT_DOD
                if i not in [
                    '>32485', '<6570', 'Unknown',
                    'Not Collected', 'Not Applicable']]):
            total_error += (
                "Patient Clinical File: Please double check your INT_DOD "
                "column, it must be an integer, '>32485', '<6570', "
                "'Unknown', 'Not Collected' or 'Not Applicable'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have INT_DOD column.\n"

    haveColumn = process_functions.checkColExist(clinicalDF, "DEAD")
    if haveColumn:
        # Need to have check_bool function
        if not all([
                str(i).upper() in ['TRUE', 'FALSE']
                for i in clinicalDF.DEAD
                if i not in ['Unknown', 'Not Collected']]):
            total_error += (
                "Patient Clinical File: Please double check your "
                "DEAD column, it must be True, False, 'Unknown' or "
                "'Not Collected'.\n")
    else:
        total_error += \
            "Patient Clinical File: Must have DEAD column.\n"

    # CHECK: PRIMARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "PRIMARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SECONDARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SECONDARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: TERTIARY_RACE
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "TERTIARY_RACE",
        race_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    # CHECK: SEX
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "SEX",
        sex_mapping['CODE'].tolist(),
        "Patient Clinical File",
        required=True)
    warning += warn
    total_error += error

    # CHECK: ETHNICITY
    warn, error = process_functions.check_col_and_values(
        clinicalDF,
        "ETHNICITY",
        ethnicity_mapping['CODE'].tolist(),
        "Patient Clinical File")
    warning += warn
    total_error += error

    return (total_error, warning)
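# --- Hypothetical, standalone sketch (not part of the validator above) ---
# Illustrates the SEQ_DATE check: values must parse as "%b-%Y" and start
# with a quarter month (Jan, Apr, Jul, Oct). All values below are made up.
import datetime

import pandas as pd

seq_dates = pd.Series(["Jan-2017", "Apr-2018", "Oct-2019"])
try:
    seq_dates.apply(lambda date: datetime.datetime.strptime(date, "%b-%Y"))
    valid = seq_dates.str.startswith(("Jan", "Apr", "Jul", "Oct")).all()
except ValueError:
    valid = False
print("SEQ_DATE column valid:", valid)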
def _validate(self, assay_info_df, project_id):
    """
    Validates the values of the assay information file

    Args:
        assay_info_df: assay information dataframe

    Returns:
        tuple: error and warning
    """
    total_error = ""
    warning = ""

    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        all_seq_assays = (
            assay_info_df.SEQ_ASSAY_ID.replace({"_": "-"}, regex=True)
            .str.upper()
            .unique()
        )
        if not all([assay.startswith(self.center) for assay in all_seq_assays]):
            total_error += (
                "Assay_information.yaml: Please make sure all your "
                "SEQ_ASSAY_IDs start with your center abbreviation.\n"
            )
        db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
            self.syn, project_id
        )
        sample_synid = process_functions.getDatabaseSynId(
            self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df
        )
        uniq_seq_df = process_functions.get_syntabledf(
            self.syn,
            f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
            f"where CENTER = '{self.center}'",
        )
        # These are all the SEQ_ASSAY_IDs that are in the clinical database
        # but not in the assay_information file
        missing_seqs = uniq_seq_df["seq"][
            ~uniq_seq_df["seq"]
            .replace({"_": "-"}, regex=True)
            .str.upper()
            .isin(all_seq_assays)
        ]
        missing_seqs_str = ", ".join(missing_seqs)
        if missing_seqs.to_list():
            total_error += (
                "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                f"{missing_seqs_str}\n"
            )
    else:
        total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    read_group_dict = process_functions.get_gdc_data_dictionary("read_group")
    read_group_headers = read_group_dict["properties"]

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "is_paired_end",
        [True, False],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_selection",
        read_group_headers["library_selection"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_strategy",
        read_group_headers["library_strategy"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "platform",
        read_group_headers["platform"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    instrument_model = read_group_headers["instrument_model"]["enum"]
    instrument_model.extend(["Illumina NovaSeq 6000", None])
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "instrument_model",
        instrument_model,
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    # target_capture_kit = read_group_headers['target_capture_kit']['enum']
    # warn, error = process_functions.check_col_and_values(
    #     assay_info_df,
    #     'target_capture_kit',
    #     target_capture_kit,
    #     filename="Assay_information.yaml",
    #     required=True)
    # warning += warn
    # total_error += error

    if not process_functions.checkColExist(assay_info_df, "target_capture_kit"):
        total_error += (
            "Assay_information.yaml: Must have target_capture_kit column.\n"
        )

    variant_classes = [
        "Splice_Site",
        "Nonsense_Mutation",
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "Nonstop_Mutation",
        "Translation_Start_Site",
        "In_Frame_Ins",
        "In_Frame_Del",
        "Missense_Mutation",
        "Intron",
        "Splice_Region",
        "Silent",
        "RNA",
        "5'UTR",
        "3'UTR",
        "IGR",
        "5'Flank",
        "3'Flank",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "variant_classifications",
        variant_classes,
        filename="Assay_information.yaml",
        na_allowed=True,
        sep=";",
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "read_length"):
        if not all(
            [
                process_functions.checkInt(i)
                for i in assay_info_df["read_length"]
                if i is not None and not pd.isnull(i)
            ]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your read_length. "
                "It must be an integer or null.\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: Must have read_length column.\n"
        )

    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all(
            [
                process_functions.checkInt(i)
                for i in assay_info_df["number_of_genes"]
            ]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your number_of_genes. "
                "It must be an integer.\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: Must have number_of_genes column.\n"
        )

    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        if not all(
            [
                process_functions.checkInt(i)
                for i in assay_info_df["gene_padding"]
                if i is not None and not pd.isnull(i)
            ]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your gene_padding. "
                "It must be an integer or blank.\n"
            )
    else:
        warning += (
            "Assay_information.yaml: "
            "gene_padding is by default 10 if not specified.\n"
        )

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "calling_strategy",
        ["tumor_only", "tumor_normal", "plasma_normal"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    if process_functions.checkColExist(assay_info_df, "specimen_tumor_cellularity"):
        if not all(
            [
                i.startswith(">") and i.endswith("%")
                for i in assay_info_df["specimen_tumor_cellularity"]
            ]
        ):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your specimen_tumor_cellularity. "
                "It must be in this format >(num)%, e.g. >10%.\n"
            )
    else:
        total_error += (
            "Assay_information.yaml: "
            "Must have specimen_tumor_cellularity column.\n"
        )

    alteration_types = [
        "snv",
        "small_indels",
        "gene_level_cna",
        "intragenic_cna",
        "structural_variants",
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "alteration_types",
        alteration_types,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    preservation_technique = ["FFPE", "fresh_frozen", "NA"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "preservation_technique",
        preservation_technique,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "coverage",
        coverage,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    return total_error, warning