def extract_patient_sample(raw_sample_df: DataFrame) -> DataFrame:
    """Project the raw sample data onto the patient-sample columns.

    ``sample_id`` is exposed as ``external_patient_sample_id`` and
    ``model_id`` as ``model_name``. ``prior_treatment`` is passed through
    ``init_cap_and_trim_all`` and keeps its name. Every other selected
    column is carried over unchanged.

    :param raw_sample_df: raw sample data containing the columns listed below.
    :return: a DataFrame holding only the selected/renamed columns.
    """
    selected_columns = [
        "diagnosis",
        col("sample_id").alias("external_patient_sample_id"),
        "grade",
        "grading_system",
        "stage",
        "staging_system",
        "primary_site",
        "collection_site",
        init_cap_and_trim_all("prior_treatment").alias("prior_treatment"),
        "tumour_type",
        col("model_id").alias("model_name"),
    ]
    return raw_sample_df.select(*selected_columns)
def clean_data_before_join(patient_df: DataFrame) -> DataFrame:
    """Return ``patient_df`` with its ``ethnicity`` column normalised.

    The column is overwritten with the result of
    ``init_cap_and_trim_all("ethnicity")``; all other columns are untouched.

    NOTE(review): a second function named ``clean_data_before_join`` (taking
    ``patient_sample_df``) appears further down. If both live in the same
    module, the later definition replaces this one at import time.

    :param patient_df: patient data with an ``ethnicity`` column.
    :return: the same data with ``ethnicity`` replaced by its cleaned value.
    """
    cleaned_ethnicity = init_cap_and_trim_all("ethnicity")
    return patient_df.withColumn("ethnicity", cleaned_ethnicity)
def get_engraftment_site_from_model(raw_model_df: DataFrame) -> DataFrame:
    """Extract the engraftment sites found in the raw model data.

    :param raw_model_df: raw model data with an ``engraftment_site`` column.
    :return: a single-column DataFrame (``name``) holding the
        ``engraftment_site`` values after ``init_cap_and_trim_all``.
        Nulls and duplicates are not removed here.
    """
    site_name = init_cap_and_trim_all("engraftment_site").alias("name")
    return raw_model_df.select(site_name)
def get_tumour_type_from_sample(raw_sample_df: DataFrame) -> DataFrame:
    """Extract the tumour types found in the raw sample data.

    :param raw_sample_df: raw sample data with a ``tumour_type`` column.
    :return: a single-column DataFrame (``name``) holding the
        ``tumour_type`` values after ``init_cap_and_trim_all``.
        Nulls and duplicates are not removed here.
    """
    tumour_type_name = init_cap_and_trim_all("tumour_type").alias("name")
    return raw_sample_df.select(tumour_type_name)
def get_engraftment_material_from_model(raw_model_df: DataFrame) -> DataFrame:
    """Extract the engraftment materials found in the raw model data.

    The value is read from the ``sample_type`` column of the raw model data.

    :param raw_model_df: raw model data with a ``sample_type`` column.
    :return: a single-column DataFrame (``name``) holding the
        ``sample_type`` values after ``init_cap_and_trim_all``.
        Nulls and duplicates are not removed here.
    """
    material_name = init_cap_and_trim_all("sample_type").alias("name")
    return raw_model_df.select(material_name)
def clean_data_before_join(patient_sample_df: DataFrame) -> DataFrame:
    """Return ``patient_sample_df`` with its ``tumour_type`` column normalised.

    The column is overwritten with the result of
    ``init_cap_and_trim_all("tumour_type")``; all other columns are untouched.

    NOTE(review): an earlier function with this same name (taking
    ``patient_df``) also appears above. If both live in the same module,
    this definition replaces the earlier one at import time.

    :param patient_sample_df: patient-sample data with a ``tumour_type`` column.
    :return: the same data with ``tumour_type`` replaced by its cleaned value.
    """
    cleaned_tumour_type = init_cap_and_trim_all("tumour_type")
    return patient_sample_df.withColumn("tumour_type", cleaned_tumour_type)
def get_project_group_from_sharing(raw_sharing_df: DataFrame) -> DataFrame:
    """Extract the project group names found in the raw sharing data.

    Rows whose ``project`` is null are discarded; the remaining values are
    passed through ``init_cap_and_trim_all`` and exposed as ``name``.
    Duplicates are not removed here.

    :param raw_sharing_df: raw sharing data with a ``project`` column.
    :return: a single-column DataFrame (``name``) of project group names.
    """
    # Filter on the source column *before* projecting: after the select the
    # frame only exposes ``name``, so a predicate on ``project`` placed after
    # it would refer to a column that is no longer part of the output.
    rows_with_project = raw_sharing_df.where("project is not null")
    return rows_with_project.select(
        init_cap_and_trim_all("project").alias("name"))
def get_ethnicity_from_patient(raw_patient_df: DataFrame) -> DataFrame:
    """Extract the distinct, non-null ethnicities from the raw patient data.

    :param raw_patient_df: raw patient data with an ``ethnicity`` column.
    :return: a single-column DataFrame (``name``) with one row per distinct
        non-null ``ethnicity`` value after ``init_cap_and_trim_all``.
    """
    ethnicity_name = init_cap_and_trim_all("ethnicity").alias("name")
    return (
        raw_patient_df
        .select(ethnicity_name)
        # The null check runs on the cleaned value, not on the raw column.
        .where("name is not null")
        .drop_duplicates()
    )
def extract_model_validation(raw_model_validation_df: DataFrame) -> DataFrame:
    """Return the model validation data with ``validation_technique`` normalised.

    The column is overwritten with the result of
    ``init_cap_and_trim_all("validation_technique")``; all other columns are
    kept as they are.

    :param raw_model_validation_df: raw model validation data with a
        ``validation_technique`` column.
    :return: the same data with ``validation_technique`` replaced by its
        cleaned value.
    """
    cleaned_technique = init_cap_and_trim_all("validation_technique")
    return raw_model_validation_df.withColumn(
        "validation_technique", cleaned_technique)