Пример #1
0
def transform_quality_assurance(raw_model_validation_df: DataFrame,
                                model_df: DataFrame) -> DataFrame:
    """Assemble the quality assurance DataFrame.

    Args:
        raw_model_validation_df: Input passed to ``extract_model_validation``.
        model_df: Model data handed to ``set_fk_model``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    extracted = extract_model_validation(raw_model_validation_df)
    linked_to_model = set_fk_model(extracted, model_df)
    identified = add_id(linked_to_model, "id")
    return get_columns_expected_order(identified)
Пример #2
0
def transform_tissue(raw_sample_df: DataFrame) -> DataFrame:
    """Assemble the tissue DataFrame from the raw sample data.

    The collection-site rows and the primary-type rows taken from the same
    raw sample input are unioned, duplicates are dropped, and ``add_id`` is
    called with the column name ``"id"``.

    Args:
        raw_sample_df: Input passed to both extraction helpers.

    Returns:
        A DataFrame restricted to the ``"id"`` and ``"name"`` columns.
    """
    collection_sites = get_collection_site_from_sample(raw_sample_df)
    primary_types = get_primary_type_from_sample(raw_sample_df)
    unique_tissues = collection_sites.union(primary_types).drop_duplicates()
    return add_id(unique_tissues, "id").select("id", "name")
Пример #3
0
def transform_diagnosis(raw_patient_df: DataFrame,
                        raw_sample_df: DataFrame) -> DataFrame:
    """Assemble the diagnosis DataFrame from patient and sample data.

    Rows produced by ``get_diagnosis_from_patient`` and
    ``get_diagnosis_from_sample`` are unioned, duplicates are dropped, and
    ``add_id`` is called with the column name ``"id"``.

    Args:
        raw_patient_df: Input passed to ``get_diagnosis_from_patient``.
        raw_sample_df: Input passed to ``get_diagnosis_from_sample``.

    Returns:
        A DataFrame restricted to the ``"id"`` and ``"name"`` columns.
    """
    from_patient = get_diagnosis_from_patient(raw_patient_df)
    from_sample = get_diagnosis_from_sample(raw_sample_df)
    unique_diagnoses = from_patient.union(from_sample).drop_duplicates()
    return add_id(unique_diagnoses, "id").select("id", "name")
def transform_engraftment_sample_type(raw_model_df: DataFrame) -> DataFrame:
    """Assemble the engraftment sample type DataFrame.

    Args:
        raw_model_df: Input passed to
            ``get_engraftment_sample_type_from_model``.

    Returns:
        The de-duplicated helper output, run through ``add_id`` with the
        column name ``"id"`` and restricted to ``"id"`` and ``"name"``.
    """
    sample_types = get_engraftment_sample_type_from_model(raw_model_df)
    unique_sample_types = sample_types.drop_duplicates()
    return add_id(unique_sample_types, "id").select("id", "name")
Пример #5
0
def transform_treatment(drug_dosing_df: DataFrame, patient_treatment_df: DataFrame) -> DataFrame:
    """Assemble the treatment DataFrame from drug dosing and patient treatment data.

    Rows from ``get_treatment_from_drug_dosing`` and
    ``get_treatment_patient_treatment`` are unioned, the ``"name"`` column is
    replaced by ``lower_and_trim_all("name")``, duplicates are dropped, and
    ``add_id`` is called with the column name ``"id"``.

    Args:
        drug_dosing_df: Input passed to ``get_treatment_from_drug_dosing``.
        patient_treatment_df: Input passed to
            ``get_treatment_patient_treatment``.

    Returns:
        A DataFrame restricted to the ``"id"`` and ``"name"`` columns.
    """
    treatment_df = get_treatment_from_drug_dosing(drug_dosing_df).union(
        get_treatment_patient_treatment(patient_treatment_df)
    )
    # Normalise names before de-duplicating so that values which only differ
    # after lower_and_trim_all is applied collapse into a single row.
    treatment_df = treatment_df.withColumn("name", lower_and_trim_all("name"))
    treatment_df = treatment_df.drop_duplicates()
    treatment_df = add_id(treatment_df, "id")
    # No debug ``show()`` here: it would trigger an extra evaluation of the
    # DataFrame and print to stdout on every call of this transformation.
    return treatment_df.select("id", "name")
def transform_patient_snapshot(raw_sample_df: DataFrame,
                               patient_sample_df: DataFrame,
                               patient_df: DataFrame) -> DataFrame:
    """Assemble the patient snapshot DataFrame.

    Args:
        raw_sample_df: Input passed to ``clean_data_before_join``.
        patient_sample_df: Patient sample data handed to
            ``set_fk_patient_sample``.
        patient_df: Patient data handed to ``set_fk_patient``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    cleaned = clean_data_before_join(raw_sample_df)
    with_patient = set_fk_patient(cleaned, patient_df)
    with_patient_sample = set_fk_patient_sample(with_patient,
                                                patient_sample_df)
    identified = add_id(with_patient_sample, "id")
    return get_columns_expected_order(identified)
Пример #7
0
def transform_patient(raw_patient_df: DataFrame, diagnosis_df: DataFrame,
                      ethnicity_df: DataFrame,
                      provider_group_df: DataFrame) -> DataFrame:
    """Assemble the patient DataFrame.

    Args:
        raw_patient_df: Input passed to ``clean_data_before_join``.
        diagnosis_df: Diagnosis data handed to ``set_fk_diagnosis``.
        ethnicity_df: Ethnicity data handed to ``set_fk_ethnicity``.
        provider_group_df: Provider group data handed to
            ``set_fk_provider_group``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``set_external_id`` and ``add_id`` (column name ``"id"``).
    """
    cleaned = clean_data_before_join(raw_patient_df)

    # Attach the references in the same order as before: diagnosis,
    # ethnicity, provider group.
    with_diagnosis = set_fk_diagnosis(cleaned, diagnosis_df)
    with_ethnicity = set_fk_ethnicity(with_diagnosis, ethnicity_df)
    with_provider = set_fk_provider_group(with_ethnicity, provider_group_df)

    with_external_id = set_external_id(with_provider)
    identified = add_id(with_external_id, "id")
    return get_columns_expected_order(identified)
Пример #8
0
def transform_patient_sample(raw_sample_df: DataFrame, diagnosis_df: DataFrame,
                             tissue_df: DataFrame, tumour_type_df: DataFrame,
                             model_df: DataFrame,
                             raw_sample_platform_df: DataFrame) -> DataFrame:
    """Assemble the patient sample DataFrame.

    Args:
        raw_sample_df: Input passed to ``extract_patient_sample``.
        diagnosis_df: Diagnosis data handed to ``set_fk_diagnosis``.
        tissue_df: Tissue data handed to both ``set_fk_origin_tissue`` and
            ``set_fk_sample_site``.
        tumour_type_df: Tumour type data handed to ``set_fk_tumour_type``.
        model_df: Model data handed to ``set_fk_model``.
        raw_sample_platform_df: Sample platform data handed to
            ``set_raw_data_url``.

    Returns:
        The output of ``get_columns_expected_order`` for the enriched data.
    """
    extracted = extract_patient_sample(raw_sample_df)
    cleaned = clean_data_before_join(extracted)

    # add_id runs before the set_fk_* helpers, exactly as in the original
    # pipeline order.
    identified = add_id(cleaned, "id")

    with_diagnosis = set_fk_diagnosis(identified, diagnosis_df)
    with_origin_tissue = set_fk_origin_tissue(with_diagnosis, tissue_df)
    with_sample_site = set_fk_sample_site(with_origin_tissue, tissue_df)
    with_tumour_type = set_fk_tumour_type(with_sample_site, tumour_type_df)
    with_model = set_fk_model(with_tumour_type, model_df)

    with_raw_data_url = set_raw_data_url(with_model, raw_sample_platform_df)
    return get_columns_expected_order(with_raw_data_url)
Пример #9
0
def transform_provider_group(
        raw_sharing_df: DataFrame,
        raw_loader_df: DataFrame,
        provider_type_df: DataFrame) -> DataFrame:
    """Assemble the provider group DataFrame from sharing and loader data.

    Args:
        raw_sharing_df: Input passed to ``extract_data_sharing``.
        raw_loader_df: Input passed to ``extract_data_loader``.
        provider_type_df: Provider type data handed to
            ``set_fk_provider_type``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    sharing_part = extract_data_sharing(raw_sharing_df)
    loader_part = extract_data_loader(raw_loader_df)
    joined = join_sharing_loader(sharing_part, loader_part)
    with_provider_type = set_fk_provider_type(joined, provider_type_df)
    return get_columns_expected_order(add_id(with_provider_type, "id"))
Пример #10
0
def transform_model(raw_model_df: DataFrame, raw_sharing_df: DataFrame,
                    publication_group_df: DataFrame,
                    accessibility_group_df: DataFrame,
                    contact_people_df: DataFrame, contact_form_df: DataFrame,
                    source_database_df: DataFrame) -> DataFrame:
    """Assemble the model DataFrame.

    Args:
        raw_model_df: Input passed to ``get_data_from_model``.
        raw_sharing_df: Sharing data handed to ``join_model_with_sharing``.
        publication_group_df: Handed to ``set_fk_publication_group``.
        accessibility_group_df: Handed to ``set_fk_accessibility_group``.
        contact_people_df: Handed to ``set_fk_contact_people``.
        contact_form_df: Handed to ``set_fk_contact_form``.
        source_database_df: Handed to ``set_fk_source_database``.

    Returns:
        The output of ``get_columns_expected_order`` for the enriched data.
    """
    base = get_data_from_model(raw_model_df)
    with_sharing = join_model_with_sharing(base, raw_sharing_df)

    # add_id runs before the set_fk_* helpers, exactly as in the original
    # pipeline order.
    identified = add_id(with_sharing, "id")

    with_publication = set_fk_publication_group(identified,
                                                publication_group_df)
    with_accessibility = set_fk_accessibility_group(with_publication,
                                                    accessibility_group_df)
    with_contact_people = set_fk_contact_people(with_accessibility,
                                                contact_people_df)
    with_contact_form = set_fk_contact_form(with_contact_people,
                                            contact_form_df)
    with_source_db = set_fk_source_database(with_contact_form,
                                            source_database_df)
    return get_columns_expected_order(with_source_db)
def transform_source_database(raw_sharing_df: DataFrame) -> DataFrame:
    """Assemble the source database DataFrame.

    Args:
        raw_sharing_df: Input passed to ``extract_source_database``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    extracted = extract_source_database(raw_sharing_df)
    return get_columns_expected_order(add_id(extracted, "id"))
def transform_engraftment_site(raw_model_df: DataFrame) -> DataFrame:
    """Assemble the engraftment site DataFrame.

    Args:
        raw_model_df: Input passed to ``get_engraftment_site_from_model``.

    Returns:
        The de-duplicated helper output, run through ``add_id`` with the
        column name ``"id"`` and restricted to ``"id"`` and ``"name"``.
    """
    unique_sites = get_engraftment_site_from_model(
        raw_model_df).drop_duplicates()
    return add_id(unique_sites, "id").select("id", "name")
def transform_tumour_type(raw_sample_df: DataFrame) -> DataFrame:
    """Assemble the tumour type DataFrame.

    Args:
        raw_sample_df: Input passed to ``get_tumour_type_from_sample``.

    Returns:
        The de-duplicated helper output, run through ``add_id`` with the
        column name ``"id"`` and restricted to ``"id"`` and ``"name"``.
    """
    unique_types = get_tumour_type_from_sample(raw_sample_df).drop_duplicates()
    with_id = add_id(unique_types, "id")
    return with_id.select("id", "name")
def transform_engraftment_material(raw_model_df: DataFrame) -> DataFrame:
    """Assemble the engraftment material DataFrame.

    Args:
        raw_model_df: Input passed to
            ``get_engraftment_material_from_model``.

    Returns:
        The de-duplicated helper output, run through ``add_id`` with the
        column name ``"id"`` and restricted to ``"id"`` and ``"name"``.
    """
    materials = get_engraftment_material_from_model(raw_model_df)
    unique_materials = materials.drop_duplicates()
    return add_id(unique_materials, "id").select("id", "name")
Пример #15
0
def transform_provider_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Assemble a DataFrame from ``get_provider_type_from_sharing``.

    NOTE(review): the body works on provider *type* data although the
    function is named ``transform_provider_group``; the name may be a
    copy/paste slip for ``transform_provider_type`` - confirm against the
    callers before renaming.

    Args:
        raw_sharing_df: Input passed to ``get_provider_type_from_sharing``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    provider_types = get_provider_type_from_sharing(raw_sharing_df)
    identified = add_id(provider_types, "id")
    return get_columns_expected_order(identified)
def transform_xenograft_sample(raw_sample_platform_df: DataFrame) -> DataFrame:
    """Assemble the xenograft sample DataFrame.

    Args:
        raw_sample_platform_df: Input passed to
            ``get_xenograft_sample_from_sample_platform``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    samples = get_xenograft_sample_from_sample_platform(raw_sample_platform_df)
    return get_columns_expected_order(add_id(samples, "id"))
def transform_host_strain(raw_model_df: DataFrame) -> DataFrame:
    """Assemble the host strain DataFrame.

    Args:
        raw_model_df: Input passed to ``extract_host_strain``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    strains = extract_host_strain(raw_model_df)
    identified = add_id(strains, "id")
    return get_columns_expected_order(identified)
Пример #18
0
def transform_accessibility_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Assemble the accessibility group DataFrame.

    Args:
        raw_sharing_df: Input passed to
            ``get_accessibility_group_from_sharing``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    groups = get_accessibility_group_from_sharing(raw_sharing_df)
    return get_columns_expected_order(add_id(groups, "id"))
Пример #19
0
def transform_ethnicity(raw_patient_df: DataFrame) -> DataFrame:
    """Assemble the ethnicity DataFrame.

    Args:
        raw_patient_df: Input passed to ``get_ethnicity_from_patient``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    ethnicities = get_ethnicity_from_patient(raw_patient_df)
    identified = add_id(ethnicities, "id")
    return get_columns_expected_order(identified)
Пример #20
0
def transform_publication_group(raw_model_df: DataFrame) -> DataFrame:
    """Assemble the publication group DataFrame.

    Args:
        raw_model_df: Input passed to ``extract_publications_from_models``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    publications = extract_publications_from_models(raw_model_df)
    return get_columns_expected_order(add_id(publications, "id"))
Пример #21
0
def transform_project_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Assemble the project group DataFrame.

    Args:
        raw_sharing_df: Input passed to ``get_project_group_from_sharing``.

    Returns:
        The de-duplicated helper output, run through ``add_id`` with the
        column name ``"id"`` and restricted to ``"id"`` and ``"name"``.
    """
    unique_groups = get_project_group_from_sharing(
        raw_sharing_df).drop_duplicates()
    return add_id(unique_groups, "id").select("id", "name")
def transform_contact_form(raw_sharing_df: DataFrame) -> DataFrame:
    """Assemble the contact form DataFrame.

    Args:
        raw_sharing_df: Input passed to ``extract_contact_form``.

    Returns:
        The output of ``get_columns_expected_order``, applied after
        ``add_id`` has been called with the column name ``"id"``.
    """
    forms = extract_contact_form(raw_sharing_df)
    identified = add_id(forms, "id")
    return get_columns_expected_order(identified)