from dash_website.utils.aws_loader import load_csv, upload_file

if __name__ == "__main__":
    # Build a lookup table mapping each aging dimension to its "squeezed"
    # (dimension + subdimension) label, from the best-model performances.
    # Fix: dropped the needless f-prefix on the placeholder-free path (F541).
    squeezed_dimensions = (
        load_csv(
            "page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_instances_Age_test.csv"
        )[["organ", "view", "R-Squared_all", "R-Squared_sd_all"]]
        .rename(
            columns={
                "organ": "dimension",
                "view": "subdimension",
                "R-Squared_all": "r2",
                "R-Squared_sd_all": "r2_std",
            }
        )
        .replace({"ImmuneSystem": "BloodCells"})
        .set_index("dimension")
    )
    # For these two dimensions the relevant model is the "*" ensemble over
    # all subdimensions, so force the subdimension marker.
    squeezed_dimensions.loc["Lungs", "subdimension"] = "*"
    squeezed_dimensions.loc["Hearing", "subdimension"] = "*"
    squeezed_dimensions.reset_index(inplace=True)
    # Concatenate dimension and subdimension; a "*" subdimension contributes
    # nothing ("replace" here substitutes whole values, not substrings).
    squeezed_dimensions["squeezed_dimensions"] = squeezed_dimensions[
        "dimension"
    ] + squeezed_dimensions["subdimension"].replace("*", "")

    squeezed_dimensions.to_feather(
        "all_data/xwas/squeezed_dimensions_participant_and_time_of_examination.feather"
    )
    upload_file(
        "all_data/xwas/squeezed_dimensions_participant_and_time_of_examination.feather",
        "xwas/squeezed_dimensions_participant_and_time_of_examination.feather",
    )
예제 #2
0
    "Ethnicity.African",
    "Ethnicity.Black_Other",
    "Ethnicity.Chinese",
    "Ethnicity.Other",
    "Ethnicity.Other_ethnicity",
    "Ethnicity.Do_not_know",
    "Ethnicity.Prefer_not_to_answer",
    "Ethnicity.NA",
]

if __name__ == "__main__":
    # Collect per-sample metadata for the heart-MRI attention-map videos,
    # one pass per chamber view (3- and 4-chamber).
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    list_information = []

    for chamber_type in [3, 4]:
        # NOTE(review): `columns_to_take` is defined earlier in the file
        # (not visible here); it evidently includes "id" and "aging_rate".
        information_raw = load_csv(
            f"page12_AttentionMapsVideos/RawVideos/files/AttentionMaps-samples_Age_Heart_MRI_{chamber_type}chambersRawVideo.csv",
            usecols=columns_to_take,
        )[columns_to_take].set_index("id")
        # Keep only the samples whose aging rate is "normal".
        information_raw.drop(index=information_raw[
            information_raw["aging_rate"] != "normal"].index,
                             inplace=True)

        # Empty frame, to be filled with display metadata for each sample.
        information = pd.DataFrame(
            None,
            columns=[
                "chamber", "sex", "age_group", "sample", "chronological_age",
                "biological_age", "ethnicity"
            ],
            index=information_raw.index,
        )

        information["chamber"] = chamber_type
import pandas as pd
from tqdm import tqdm

from dash_website.utils.aws_loader import load_csv

from dash_website import DIMENSIONS, ALL_CATEGORIES

if __name__ == "__main__":
    # For every aging dimension, gather the linear-XWAS correlation results
    # of all categories into one (category, variable)-indexed dataframe.
    for dimension in tqdm(DIMENSIONS):
        list_indexes = []
        for category in ALL_CATEGORIES:
            # Strip the ".0" instance suffix from each variable name.
            for variable in load_csv(
                    f"page5_LinearXWASResults/LinearOutput/linear_correlations_{category}_{dimension}.csv",
                    usecols=["env_feature_name"],
            )["env_feature_name"].apply(
                    lambda variable: variable.replace(".0", "")):
                list_indexes.append([category, variable])

        indexes = pd.MultiIndex.from_tuples(list_indexes,
                                            names=["category", "variable"])
        correlations = pd.DataFrame(
            None,
            columns=["p_value", "correlation", "sample_size"],
            index=indexes)

        # Fill the frame category by category from the raw linear outputs.
        # NOTE(review): this statement is truncated in this excerpt.
        for category in ALL_CATEGORIES:
            correlation_category_dimension = load_csv(
                f"page5_LinearXWASResults/LinearOutput/linear_correlations_{category}_{dimension}.csv",
                usecols=[
                    "env_feature_name", "p_val", "corr_value",
                    "size_na_dropped"
예제 #4
0
    "R-Squared_sd_all": "r2_std",
}

# Human-readable names for each sample-definition ("data type") suffix.
DATA_TYPE_NAMING = {
    "instances": "all_samples_per_participant",
    "eids": "average_per_participant"
}
# Raw algorithm labels mapped to the snake_case names used downstream.
ALGORITHMS_NAMING = {
    "ElasticNet": "elastic_net",
    "LightGBM": "light_gbm",
    "NeuralNetwork": "neural_network"
}

for data_type in ["instances"]:  # "eids" is currently disabled
    raw_performances = load_csv(
        f"page2_predictions/Performances/PERFORMANCES_tuned_alphabetical_{data_type}_Age_test.csv"
    )

    # Keep only the columns of interest and normalize names and labels.
    renamed = raw_performances[COLUMNS_TO_TAKE.keys()].rename(
        columns=COLUMNS_TO_TAKE
    )
    cleaned = (
        renamed.replace(ALGORITHMS_NAMING)
        .replace({"ImmuneSystem": "BloodCells"})
        .reset_index(drop=True)
    )

    # The Musculoskeletal "MRI" sub-subdimension is relabeled "DXA".
    musculoskeletal_mri = (cleaned["dimension"] == "Musculoskeletal") & (
        cleaned["sub_subdimension"] == "MRI"
    )
    cleaned.loc[musculoskeletal_mri, "sub_subdimension"] = "DXA"

    cleaned.to_feather(
        f"all_data/feature_importances/scores_{DATA_TYPE_NAMING[data_type]}.feather"
    )
예제 #5
0
    "*": "all_samples_when_possible_otherwise_average",
}
# Raw performance columns and the website-facing names they map to.
COLUMNS_TO_TAKE = {
    "organ": "dimension",
    "view": "subdimension",
    "transformation": "sub_subdimension",
    "architecture": "algorithm",
    "R-Squared_all": "r2",
    "R-Squared_sd_all": "r2_std",
}
# Dimension relabeling applied site-wide.
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}

if __name__ == "__main__":
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    for sample_definition in ["instances", "eids"]:
        # Residual correlations and their standard deviations, as square
        # matrices with the first (unnamed) column holding row labels.
        correlations_raw_ = load_csv(
            f"page4_correlations/ResidualsCorrelations/ResidualsCorrelations_{sample_definition}_Age_test.csv"
        )
        correlations_std_raw_ = load_csv(
            f"page4_correlations/ResidualsCorrelations/ResidualsCorrelations_sd_{sample_definition}_Age_test.csv"
        )

        # Melt the matrix into long form: one row per pair of dimensions.
        correlations_raw = correlations_raw_.melt(
            id_vars=["Unnamed: 0"],
            value_vars=correlations_raw_.columns.drop("Unnamed: 0"))
        correlations_raw.rename(columns={
            "Unnamed: 0": "dimensions_1",
            "variable": "dimensions_2",
            "value": "correlation"
        },
                                inplace=True)
    "correlation": "Correlation",
}

if __name__ == "__main__":
    # Gather multivariate-XWAS feature importances for every
    # (category, dimension, algorithm) combination.
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    list_features = []
    for category in tqdm(
            pd.Index(ALL_CATEGORIES).drop(
                ["Genetics", "Phenotypic", "PhysicalActivity"])):
        for dimension in DIMENSIONS:
            for algorithm in [
                    "elastic_net", "light_gbm", "neural_network", "correlation"
            ]:
                # Medical-diagnoses categories use the plain file name; all
                # others use the "Clusters_"-prefixed one. The DICT_TO_FORMER_*
                # maps (defined earlier in the file) translate current names
                # back to the legacy names used in the stored CSVs.
                if "medical_diagnoses_" in category:
                    features = load_csv(
                        f"page18_MultivariateXWASFeatures/FeatureImp_{DICT_TO_FORMER_CATEGORIES.get(category, category)}_{DICT_TO_FORMER_DIMENSIONS.get(dimension, dimension)}_{DICT_TO_FORMER_ALGORITHM.get(algorithm, algorithm)}.csv"
                    ).rename(columns={
                        "features": "variable",
                        "weight": "feature_importance"
                    })
                else:
                    features = load_csv(
                        f"page18_MultivariateXWASFeatures/FeatureImp_Clusters_{DICT_TO_FORMER_CATEGORIES.get(category, category)}_{DICT_TO_FORMER_DIMENSIONS.get(dimension, dimension)}_{DICT_TO_FORMER_ALGORITHM.get(algorithm, algorithm)}.csv"
                    ).rename(columns={
                        "features": "variable",
                        "weight": "feature_importance"
                    })

                # Drop the ".0" instance suffix and tag each row with its
                # provenance before accumulation.
                features["variable"] = features["variable"].apply(
                    lambda variable: variable.split(".0")[0])
                features["category"] = category
                features["dimension"] = dimension
                features["algorithm"] = algorithm
예제 #7
0
    ("PhysicalActivity", "FullWeek", "Scalars"): "PhysicalActivity",
    # ("Demographics", "All", "Scalars"): "Demographics",
}


if __name__ == "__main__":
    # Build, per model, a feature-importance table with a
    # (algorithm, observation) column MultiIndex.
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    for dimension, subdimension, sub_subdimension in tqdm(DIMENSION_TO_NAME.keys()):
        list_colums = []

        # Columns: mean and std of the importance for each algorithm.
        for algorithm in ["correlation", "elastic_net", "light_gbm", "neural_network"]:
            for observation in ["mean", "std"]:
                list_colums.append([algorithm, observation])
        columns = pd.MultiIndex.from_tuples(list_colums, names=["algorithm", "observation"])

        # Use the ElasticNet file to define the canonical feature index.
        feature_for_index = load_csv(
            f"page3_featureImp/FeatureImp/FeatureImp_Age_{dimension}_{subdimension}_{sub_subdimension}_ElasticNet.csv"
        ).rename(columns={"features": "feature"})
        # Strip the ".0" instance suffix, then drop duplicate feature names.
        feature_for_index["feature"] = (
            feature_for_index["feature"].astype(str).apply(lambda feature: feature.split(".0")[0])
        )
        feature_for_index.drop(index=feature_for_index.index[feature_for_index["feature"].duplicated()], inplace=True)

        features = pd.DataFrame(None, columns=columns, index=feature_for_index["feature"])
        features.index.name = "feature"

        # Fill the mean column for each algorithm from its own CSV, applying
        # the same suffix-stripping and de-duplication as for the index.
        for algorithm in ["correlation", "elastic_net", "light_gbm", "neural_network"]:
            mean_feature = load_csv(
                f"page3_featureImp/FeatureImp/FeatureImp_Age_{dimension}_{subdimension}_{sub_subdimension}_{ALGORITHM_NAMING[algorithm]}.csv"
            ).rename(columns={"features": "feature"})
            mean_feature["feature"] = mean_feature["feature"].astype(str).apply(lambda feature: feature.split(".0")[0])
            mean_feature.drop(index=mean_feature.index[mean_feature["feature"].duplicated()], inplace=True)
예제 #8
0
from dash_website.utils.aws_loader import load_csv

# Raw GWAS column names and the website-facing names they map to.
COLUMNS_TO_TAKE = {
    "SNP": "SNP",
    "CHR": "chromosome",
    "Gene": "Gene",
    "Gene_type": "Gene_type",
    "P_BOLT_LMM_INF": "p_value",
    "BETA": "size_effect",
    "organ": "dimension",
}

if __name__ == "__main__":
    # Load the GWAS hits, keeping and renaming only the columns of interest.
    gwas_hits = load_csv(
        "page10_GWASResults/Volcano/GWAS_hits_Age_All_withGenes.csv"
    )[COLUMNS_TO_TAKE]
    size_effects = gwas_hits.rename(columns=COLUMNS_TO_TAKE)

    # Normalize the dimension labels used by the website.
    size_effects.replace(
        {"*instances1": "*instances1.5x", "ImmuneSystem": "BloodCells"},
        inplace=True,
    )

    # Drop rows whose dimension is the literal "withGenes".
    spurious_rows = size_effects[size_effects["dimension"] == "withGenes"].index
    size_effects.drop(index=spurious_rows, inplace=True)

    size_effects.reset_index(drop=True).to_feather(
        "all_data/genetics/gwas/size_effects.feather"
    )
예제 #9
0
        # Merge the newly computed ("missing") scores with the old ones,
        # save locally, and re-upload to the store for this algorithm.
        # NOTE(review): the enclosing loop header is not visible in this
        # excerpt; `missing_scores` / `old_scores` are defined above it.
        pd.concat(
            (missing_scores, old_scores), ignore_index=True
        ).drop(columns="Unnamed: 0").to_csv(
            f"all_data/page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv"
        )
        upload_file(
            f"all_data/page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
            f"page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
        )

    list_scores = []

    # Reload every algorithm's scores and normalize column/dimension names.
    for algorithm in CAMEL_TO_SNAKE.keys():
        scores = load_csv(
            f"page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
            index_col=0).drop(columns="subset")
        scores.rename(columns={
            "env_dataset": "category",
            "organ": "dimension"
        },
                      inplace=True)

        scores_cleaned_dimension = scores.set_index("dimension").rename(
            index=DICT_TO_CHANGE_DIMENSIONS).reset_index()

        # Flag the categories that need further splitting: everything that
        # is not a "medical_diagnoses" category.
        every_category = np.array(
            scores_cleaned_dimension["category"].tolist())
        category_to_split = ~scores_cleaned_dimension[
            "category"].str.startswith("medical_diagnoses")
예제 #10
0
    "Claudification": "Claudication",
}

# Algorithm labels in CamelCase mapped to their snake_case equivalents.
CAMEL_TO_SNAKE = {
    "ElasticNet": "elastic_net",
    "LightGbm": "light_gbm",
    "NeuralNetwork": "neural_network"
}

if __name__ == "__main__":
    # Collect the multivariate-XWAS correlations for every
    # (correlation type, algorithm) pair.
    # NOTE(review): the last statement is truncated in this excerpt.
    list_correlations = []

    for correlation_type in ["Pearson", "Spearman"]:
        for algorithm in ["ElasticNet", "LightGbm", "NeuralNetwork"]:
            correlations = load_csv(
                f"page8_MultivariateXWASCorrelations/CorrelationsMultivariate/CorrelationsMultivariate_{correlation_type}_{algorithm}.csv",
                index_col=0,
            )

            # Normalize the raw column names to the website vocabulary.
            correlations.rename(
                columns={
                    "env_dataset": "category",
                    "organ_1": "dimension_1",
                    "organ_2": "dimension_2",
                    "corr": "correlation",
                    "sample_size": "number_features",
                },
                inplace=True,
            )
            # Dimension relabeling (e.g. ImmuneSystem -> BloodCells),
            # defined earlier in the file.
            correlations.replace(DICT_TO_CHANGE_DIMENSIONS, inplace=True)

            correlations["category"] = list(
예제 #11
0
}
# Relabeling of dimensions and algorithm/architecture names.
DICT_TO_CHANGE_DIMENSIONS = {
    "ImmuneSystem": "BloodCells",
    "InceptionResNetV2": "inception_res_net_v2",
    "InceptionV3": "inception_v3",
    "ElasticNet": "elastic_net",
    "LightGBM": "light_gbm",
    "NeuralNetwork": "neural_network",
    "1DCNN": "1dcnn",
    "3DCNN": "3dcnn",
}

if __name__ == "__main__":
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    for sample_definition in ["instances", "eids"]:
        scores = load_csv(
            f"page2_predictions/Performances/PERFORMANCES_withEnsembles_withCI_alphabetical_{sample_definition}_Age_test.csv"
        )[COLUMNS_TO_TAKE].rename(columns=COLUMNS_TO_TAKE)

        # Each "<metric>_and_std" column encodes "value+-std": split on "+"
        # for the value and on the following "-" for the std, then drop the
        # combined column.
        for metric in ["r2", "rmse", "c_index", "c_index_difference"]:
            scores[metric] = scores[f"{metric}_and_std"].str.split(
                "+", expand=True)[0].astype(np.float32)
            scores[f"{metric}_std"] = (scores[f"{metric}_and_std"].str.split(
                "+",
                expand=True)[1].str.split("-",
                                          expand=True)[1].astype(np.float32))

            scores.drop(columns=f"{metric}_and_std", inplace=True)

        # The Musculoskeletal "MRI" sub-subdimension is relabeled "DXA".
        scores.loc[(scores["dimension"] == "Musculoskeletal") &
                   (scores["sub_subdimension"] == "MRI"),
                   "sub_subdimension"] = "DXA"
import pandas as pd
from dash_website.utils.aws_loader import load_csv

# Raw performance columns and the website-facing names they map to.
COLUMNS_TO_TAKE = {"organ": "dimension", "view": "subdimension", "R-Squared_all": "r2", "R-Squared_sd_all": "r2_std"}
# Dimension relabeling applied site-wide.
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}


if __name__ == "__main__":
    # Fix: dropped the needless f-prefixes on placeholder-free paths (F541).
    # Best-model scores, one row per dimension.
    scores_raw = (
        load_csv("page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_instances_Age_test.csv")[
            COLUMNS_TO_TAKE
        ]
        .rename(columns=COLUMNS_TO_TAKE)
        .set_index("dimension")
    )

    # Ensemble scores indexed by (dimension, subdimension); duplicate the
    # subdimension level as a regular column so it can be copied below.
    ensembles_scores_raw = (
        load_csv("page2_predictions/Performances/PERFORMANCES_withEnsembles_alphabetical_instances_Age_test.csv")[
            COLUMNS_TO_TAKE
        ]
        .rename(columns=COLUMNS_TO_TAKE)
        .set_index(["dimension", "subdimension"])
    )
    ensembles_scores_raw["subdimension"] = ensembles_scores_raw.index.get_level_values("subdimension")

    # For Hearing and Lungs, overwrite the best-model row with the values of
    # the corresponding "*" ensemble row.
    for dimension_to_correct in ["Hearing", "Lungs"]:
        scores_raw.loc[dimension_to_correct, ["subdimension", "r2", "r2_std"]] = ensembles_scores_raw.loc[
            (dimension_to_correct, "*"), ["subdimension", "r2", "r2_std"]
        ].values[0]

    scores = scores_raw.reset_index()
예제 #13
0
    "Ethnicity.Chinese",
    "Ethnicity.Other",
    "Ethnicity.Other_ethnicity",
    "Ethnicity.Do_not_know",
    "Ethnicity.Prefer_not_to_answer",
    "Ethnicity.NA",
]

if __name__ == "__main__":
    # Collect per-sample metadata for the attention maps of every
    # (dimension, subdimension, sub-subdimension) time-series model.
    # NOTE(review): the final statement is truncated in this excerpt.
    list_information = []

    for DIMENSION in list(TREE_TIME_SERIES.keys()):
        for SUBDIMENSION in list(TREE_TIME_SERIES[DIMENSION].keys()):
            for SUB_SUBDIMENSION in TREE_TIME_SERIES[DIMENSION][SUBDIMENSION]:
                # NOTE(review): `columns_to_take` is defined earlier in the
                # file (not visible here); it evidently includes "id".
                information_raw = load_csv(
                    f"page9_AttentionMaps/Attention_maps_infos/AttentionMaps-samples_Age_{DIMENSION}_{SUBDIMENSION}_{SUB_SUBDIMENSION}.csv",
                    usecols=columns_to_take,
                )[columns_to_take].set_index("id")

                # Empty frame, to be filled with display metadata per sample.
                information = pd.DataFrame(
                    None,
                    columns=[
                        "dimension",
                        "subdimension",
                        "sub_subdimension",
                        "sex",
                        "age_group",
                        "aging_rate",
                        "sample",
                        "chronological_age",
                        "biological_age",
                        "ethnicity",
예제 #14
0
}

# Human-readable names for each sample-definition suffix.
SAMPLE_DEFINITION_NAMING = {
    "instances": "all_samples_per_participant",
    "eids": "average_per_participant",
    "*": "all_samples_when_possible_otherwise_average",
}
# Dimension relabeling applied site-wide.
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}


if __name__ == "__main__":
    # NOTE(review): this excerpt is truncated; the inner loop body continues
    # beyond the last line shown here.
    for sample_definition in ["instances", "eids"]:
        # Best-model scores, one row per dimension.
        scores_raw = (
            load_csv(
                f"page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_{sample_definition}_Age_test.csv"
            )[COLUMNS_TO_TAKE]
            .rename(columns=COLUMNS_TO_TAKE)
            .set_index("dimension")
        )

        # Ensemble scores indexed by (dimension, subdimension); duplicate the
        # subdimension level as a regular column so it can be copied later.
        ensembles_scores_raw = (
            load_csv(
                f"page2_predictions/Performances/PERFORMANCES_withEnsembles_alphabetical_{sample_definition}_Age_test.csv"
            )[COLUMNS_TO_TAKE]
            .rename(columns=COLUMNS_TO_TAKE)
            .set_index(["dimension", "subdimension"])
        )
        ensembles_scores_raw["subdimension"] = ensembles_scores_raw.index.get_level_values("subdimension")

        # Only the "instances" definition corrects Hearing and Lungs with
        # their "*" ensemble rows.
        if sample_definition == "instances":
            for dimension_to_correct in ["Hearing", "Lungs"]:
예제 #15
0
if __name__ == "__main__":
    # Load the scalar biomarker dataset for every model and build the column
    # rename map (stripping the ".0" instance suffix from feature names).
    # NOTE(review): this excerpt is truncated; the loop body continues
    # beyond the last line shown here.
    for dimension, subdimension, sub_subdimension in tqdm(
            DIMENSION_TO_NAME.keys()):
        print(dimension, subdimension, sub_subdimension)

        name = DIMENSION_TO_NAME[(dimension, subdimension, sub_subdimension)]

        # The website relabels "ImmuneSystem" as "BloodCells".
        if dimension == "ImmuneSystem":
            new_dimension = "BloodCells"
        else:
            new_dimension = dimension

        # PhysicalActivity ships a "_short" file; every other dimension has
        # an "_ethnicity" file.
        if dimension != "PhysicalActivity":
            raw_scalars = load_csv(
                f"page1_biomarkers/BiomarkerDatasets/{name}_ethnicity.csv"
            ).set_index("id")
        else:
            raw_scalars = load_csv(
                f"page1_biomarkers/BiomarkerDatasets/{name}_short.csv"
            ).set_index("id")

        rename_columns = {
            "Sex": "sex",
            "Age when attended assessment centre": "chronological_age"
        }

        # BUG FIX: str.contains defaults to regex=True, so the pattern ".0"
        # matched ANY character followed by "0" (e.g. "X10"). Match the
        # literal ".0", consistent with the literal replace() below.
        for feature in raw_scalars.columns[raw_scalars.columns.str.contains(
                ".0", regex=False)]:
            rename_columns[feature] = feature.replace(".0", "")