Example #1
def LoadUKBBData(
        training_data_fraction,
        dis_index,
        filepath='path/to/clinical/data/UKBB_HPO.pth',
        exclude_path='path/to/clinical/data/ukb_withdrawn_current.txt',
        sampler_path='path/to/clinical/samplers/'):
    clinData = ClinicalDataset()
    clinData.ReadFromDisk(filepath)

    try:
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_path + 'Sampler_' +
                             dis_index.replace(':', '_'))
        sampler.ConvertToUnconditional()
    except KeyError:
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_path + 'Sampler_' +
                             dis_index.replace(':', '_'))

    excluded = np.array(
        pd.read_csv(exclude_path, header=None, index_col=0).index)
    sampler.DropSamples(excluded)
    return clinData, sampler
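
# A minimal usage sketch for the helper above; the training fraction and the
# disease index are hypothetical placeholders:
#
#   clinData, sampler = LoadUKBBData(0.75, 'OMIM:000000')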
# read the hpo terms from disk
dis_to_term = pd.read_pickle(input_hpo_file)

#load the dataset from disk, include only the HPO terms annotated to the disease
clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/record/dataset')
annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
clinData.IncludeOnly(annotated_terms)

#make sure the maximum rank of the model is less than the number of annotated HPO terms
if (len(annotated_terms) - 1) < rank:
    rank = len(annotated_terms) - 1

## load the stored dataset sampler
sampler = ClinicalDatasetSampler(clinData,
                                 training_data_fraction,
                                 conditionSamplingOnDx=[dis_index],
                                 returnArrays='Torch')
sampler.ReadFromDisk('path/to/clinical/dataset/samplers/' + 'Sampler_' +
                     dis_index.replace(':', '_'))

#set the covariates ('NULL' means no covariates, 'ALL' keeps them all, otherwise a comma-separated list)
if covariate_set == 'NULL':
    sampler.SubsetCovariates([])
elif covariate_set != 'ALL':
    sampler.SubsetCovariates(covariate_set.split(','))

#make sure the model hasn't been fit before. If not, then fit it and write to disk.
if 'trialNum_' + trial + '.pth' not in os.listdir(direcPrefix +
                                                  outputFileDirec +
                                                  '/Models/'):
    #body truncated in the source; a sketch following the FitModel/PackageModel pattern used in the other examples
    vlpi_model = vLPI(sampler, rank)
    vlpi_model.FitModel(batch_size=1000, errorTol=1e-4)
    vlpi_model.PackageModel(direcPrefix + outputFileDirec + '/Models/trialNum_' + trial + '.pth')

clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/dataset')



results_table = {
    'OMIM_ICD_ID': [],
    'Avg Component Weighted R^2': [],
    'Component Weighted R^2 Matrix': [],
    'Cluster Labels': [],
    'Num Replicates, Top Model': [],
    'Meets Criteria': []
}
try:
    os.mkdir(output_file_prefix+'_Figures')
except FileExistsError:
    pass

for dis_index in set(allowed_diseases).intersection(dis_to_term.index):
    try:
        print('Computing matrix similarities for '+dis_index)

        sampler=ClinicalDatasetSampler(clinData,training_data_fraction,conditionSamplingOnDx = [dis_index],returnArrays='Torch')
        sampler.ReadFromDisk('path/to/dataset/samplers/'+'Sampler_'+dis_index.replace(':','_'))
        sampler.ConvertToUnconditional()
        all_procrustes_scores=[]
        procrustes_score_matrix = np.ones((num_trials,num_trials))


        for trial_pair in itertools.combinations(range(1,num_trials+1), 2):
            vlpi_1=vLPI(sampler,max_rank)
            vlpi_1.LoadModel('path/to/latent/pheno/models/'+input_direc+'MendelianDiseaseIndex_'+dis_index.replace(':','_')+'/Models/trialNum_'+str(trial_pair[0])+'.pth')

            vlpi_2=vLPI(sampler,max_rank)
            vlpi_2.LoadModel('path/to/latent/pheno/models/'+input_direc+'MendelianDiseaseIndex_'+dis_index.replace(':','_')+'/Models/trialNum_'+str(trial_pair[1])+'.pth')

            risk_matrix_1=vlpi_1.ReturnComponents()
            risk_matrix_2=vlpi_2.ReturnComponents()
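
            # The scoring code is truncated in this snippet; as an illustrative
            # sketch (not necessarily the authors' metric), a Procrustes
            # disparity between the two risk matrices can serve as a score:
            from scipy.spatial import procrustes
            _, _, disparity = procrustes(risk_matrix_1, risk_matrix_2)
            all_procrustes_scores.append(1.0 - disparity)
            procrustes_score_matrix[trial_pair[0] - 1, trial_pair[1] - 1] = 1.0 - disparity
            procrustes_score_matrix[trial_pair[1] - 1, trial_pair[0] - 1] = 1.0 - disparity
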
def main():

    #fixed data loaded into memory
    dis_table = pd.read_csv(DATA_PATH + "TargetDiseaseCodes.txt",
                            sep='\t',
                            index_col="CODE")
    ukbb_model_table = pd.read_pickle(DATA_PATH + "ICD10-UKBB_ModelTable.pth")

    parser = argparse.ArgumentParser(
        description=
        'Imputes the cryptic phenotypes analyzed in Blair et al. 2020 into arbitrary clinical datasets.'
    )

    parser.add_argument(
        "encoding",
        help="ICD encoding. Must be either 'ICD10-CM' or 'ICD10-UKBB'.",
        type=str)

    parser.add_argument(
        "datafile",
        help=
        "Path to the datafile containing the clinical information. Note, the software expects a tab-delimitted text file with two columns. The first column contains a unique ID for every subject. The second column contains a comma-separated list of diagnosed ICD10 codes. DO NOT include a header.",
        type=str)

    parser.add_argument(
        "cryptic_phenotype",
        help=
        "Disease cryptic phenotype to be imputed. Must be in the following list: {0:s}. To see a key for the cryptic phenotypes, provide the argument KEY instead."
        .format(', '.join(list(dis_table.index))),
        type=str)

    parser.add_argument("output_file",
                        help="Path to the output file.",
                        type=str)

    parser.add_argument(
        "--use_best",
        help=
        "Impute using only the single top-performing model rather than averaging over the bagged model ensemble.",
        action="store_true")

    parser.add_argument(
        "--model_path",
        help=
        "By default, the program downlads and saves models to the same directory as the software package. This might not be allowed in all settings, so you can specify an alternative path to store models using this option.",
        type=str)

    args = parser.parse_args()

    if args.cryptic_phenotype == 'KEY':
        print(dis_table)
        sys.exit()

    assert args.encoding in [
        'ICD10-CM', 'ICD10-UKBB'
    ], "Encoding not recognized. Please use 'ICD10-CM' or 'ICD10-UKBB'."
    assert args.cryptic_phenotype in dis_table.index, "Cryptic phenotype not recognized. Must be in the following list: {0:s}. To see a key for the cryptic phenotypes, provide the argument KEY instead.".format(
        ', '.join(list(dis_table.index)))
    disease_code = dis_table.loc[args.cryptic_phenotype]['OMIM_HPO_ID']

    #initialize the ClinicalDataset class
    if args.encoding == 'ICD10-CM':
        currentClinicalDataset = ClinicalDataset()
    else:
        currentClinicalDataset = ClinicalDataset(ICDFilePaths=[
            ICD_PATH + 'icd10_ukbb.txt', ICD_PATH + 'ICD10_Chapters.txt'
        ])

    #read the dataset into memory
    currentClinicalDataset.ReadDatasetFromFile(args.datafile,
                                               1,
                                               indexColumn=0,
                                               hasHeader=False,
                                               chunkSize=50000)
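
    # The expected datafile layout is, e.g. (tab-delimited, no header; the
    # subject IDs and ICD10 codes below are purely illustrative):
    #
    #   SUBJ0001<TAB>E11.9,I10
    #   SUBJ0002<TAB>G47.33,N18.3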

    #set up the model directories if they do not already exist
    if args.model_path is not None:
        MODEL_PATH = args.model_path
        if MODEL_PATH[-1] != '/':
            MODEL_PATH += '/'
    else:
        MODEL_PATH = pkg_resources.resource_filename('CrypticPhenoImpute',
                                                     'Models/')

    try:
        os.mkdir(MODEL_PATH)
    except FileExistsError:
        pass

    try:
        os.mkdir(MODEL_PATH + 'ICD10UKBB_Models')
    except FileExistsError:
        pass

    try:
        os.mkdir(MODEL_PATH + 'ICD10CM_Models')
    except FileExistsError:
        pass

    #if using ICD10-CM, use the vlpi model directly. Requires translating from ICD10-CM into HPO terms

    if args.encoding == 'ICD10-CM':
        #load the HPO term table
        hpo_table = pd.read_csv(DATA_PATH + "HPOTable.txt",
                                sep='\t',
                                index_col="HPO_ICD10_ID")
        model_table = pd.read_csv(DATA_PATH + "ModelTable.txt",
                                  sep='\t',
                                  index_col="Disease ID")

        disease_hpo = model_table.loc[disease_code][
            'Annotated HPO Terms'].split(',')
        hpo_icd10_map = {
            hpo: hpo_table.loc[hpo]['ICD10'].split(';')
            for hpo in disease_hpo
        }

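        # invert the HPO -> ICD10 map into an ICD10 -> HPO multimap; a single
        # ICD10 code can annotate several HPO terms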
        icd10_HPO_map = {}
        for key, value in hpo_icd10_map.items():
            for icd in value:
                try:
                    icd10_HPO_map[icd] += [key]
                except KeyError:
                    icd10_HPO_map[icd] = [key]

        currentClinicalDataset.ConstructNewDataArray(icd10_HPO_map)

        sampler = ClinicalDatasetSampler(currentClinicalDataset, 0.5)

        vlpi_model = vLPI(sampler,
                          model_table.loc[disease_code]['Max. Model Rank'])

        try:
            vlpi_model.LoadModel(MODEL_PATH +
                                 'ICD10CM_Models/{0:s}.pth'.format(
                                     disease_code.replace(':', '_')))
        except FileNotFoundError:
            print("\nDownloading model files from GitHub.")
            wget.download(
                "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10CM_Models/{0:s}.pth"
                .format(disease_code.replace(':', '_')),
                out=MODEL_PATH + 'ICD10CM_Models/')
            vlpi_model.LoadModel(MODEL_PATH +
                                 'ICD10CM_Models/{0:s}.pth'.format(
                                     disease_code.replace(':', '_')))

        try:
            with open(
                    MODEL_PATH + 'ICD10CM_Models/{0:s}_Index.pth'.format(
                        disease_code.replace(':', '_')), 'rb') as f:
                model_hpo_index = pickle.load(f)
        except FileNotFoundError:
            print("\nDownloading index files from GitHub.")
            wget.download(
                "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10CM_Models/{0:s}_Index.pth"
                .format(disease_code.replace(':', '_')),
                out=MODEL_PATH + 'ICD10CM_Models/')
            with open(
                    MODEL_PATH + 'ICD10CM_Models/{0:s}_Index.pth'.format(
                        disease_code.replace(':', '_')), 'rb') as f:
                model_hpo_index = pickle.load(f)

        ######## This code corrects variations in the order in which symptoms are stored that occurred between an earlier and the current version of the ClinicalDataset class
        ######## Clearly, this is less than ideal, but it wasn't worth refitting all of the models for this small change in storage that could be corrected.
        symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()
        new_order = [
            currentClinicalDataset.dxCodeToDataIndexMap[x]
            for x in model_hpo_index.keys()
        ]
        symptom_array = (symptom_array.tocsr()[:, new_order]).tocoo()
        ########
        ########

        cp = vlpi_model.ComputeEmbeddings(
            dataArrays=(symptom_array,
                        []))[:, model_table.loc[disease_code]['Top Component']]
        output_table = pd.DataFrame({
            'Subject_ID': currentClinicalDataset.data.index,
            args.cryptic_phenotype: cp
        })
        output_table.set_index('Subject_ID', inplace=True, drop=True)
        output_table.to_csv(args.output_file, sep='\t')

    # use the ICD10-UKBB encoding
    else:
        try:
            os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                disease_code.replace(':', '_')))
        except FileExistsError:
            pass

        if args.use_best:
            try:
                os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                    disease_code.replace(':', '_') + '/TopModel'))
            except FileExistsError:
                pass

            try:
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/TopModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t',
                    header=None)
            except FileNotFoundError:
                print("\nDownloading feature file.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/TopModelFeatures.txt"
                    .format(disease_code.replace(':', '_')),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                        disease_code.replace(':', '_')))
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/TopModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t',
                    header=None)

            currentClinicalDataset.IncludeOnly(features[0].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            try:
                with open(
                        MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/TopModel/{1:s}'.format(
                            disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                        'rb') as f:
                    model_dict = pickle.load(f)
            except FileNotFoundError:
                print("\nDownloading top performing imputation model.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/TopModel/{1:s}"
                    .format(disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}/TopModel'.format(
                        disease_code.replace(':', '_')))
                with open(
                        MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/TopModel/{1:s}'.format(
                            disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                        'rb') as f:
                    model_dict = pickle.load(f)
            cp = model_dict['Model'].predict(symptom_array)

        else:
            try:
                os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                    disease_code.replace(':', '_') + '/BaggedModels'))
            except FileExistsError:
                pass

            try:
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t')
            except FileNotFoundError:
                print("\nDownloading feature file.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt"
                    .format(disease_code.replace(':', '_')),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                        disease_code.replace(':', '_')))
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t')

            currentClinicalDataset.IncludeOnly(features['ICD10'].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            all_models = ukbb_model_table.loc[disease_code]['Bagged_Models']
            num_models = 0
            cp = np.zeros((currentClinicalDataset.numPatients))
            for model_string in all_models:
                try:
                    with open(
                            MODEL_PATH +
                            'ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}'.format(
                                disease_code.replace(':', '_'), model_string),
                            'rb') as f:
                        model_dict = pickle.load(f)
                except FileNotFoundError:
                    print("\nDownloading bagged model: {0:s}.".format(
                        model_string))
                    wget.download(
                        "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}"
                        .format(disease_code.replace(':', '_'), model_string),
                        out=MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/BaggedModels'.format(
                            disease_code.replace(':', '_')))
                    with open(
                            MODEL_PATH +
                            'ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}'.format(
                                disease_code.replace(':', '_'), model_string),
                            'rb') as f:
                        model_dict = pickle.load(f)
                cp += model_dict['Model'].predict(symptom_array)
                num_models += 1
            cp /= num_models

        output_table = pd.DataFrame({
            'Subject_ID': currentClinicalDataset.data.index,
            args.cryptic_phenotype: cp
        })
        output_table.set_index('Subject_ID', inplace=True, drop=True)
        output_table.to_csv(args.output_file, sep='\t')
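
# Example invocations (a sketch; the file names are hypothetical, and running
# the package as a module is an assumption about how it is installed):
#
#   python -m CrypticPhenoImpute ICD10-CM cohort_dx.txt KEY key_out.txt
#   python -m CrypticPhenoImpute ICD10-UKBB cohort_dx.txt <PHENOTYPE> imputed_cp.txt --use_best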
Example #5
    clinData = ClinicalDataset()
    clinData.ReadFromDisk('path/to/clinical/dataset')

    #set the clinical dataset to only include annotated symptoms
    if model_table.loc[dis_index][[
            'Revised Converged [0.02, 2000]',
            'Revised Increase LR Converged [0.05, 4000]'
    ]].sum() > 0:
        annotated_terms = revised_dis_to_term.loc[dis_index]['HPO_ICD10_ID']
    else:
        annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
    max_rank = model_table.loc[dis_index]['Rank']
    clinData.IncludeOnly(annotated_terms)

    #load the sampler
    sampler = ClinicalDatasetSampler(clinData,
                                     training_data_fraction,
                                     conditionSamplingOnDx=[dis_index],
                                     returnArrays='Torch')
    sampler.ReadFromDisk('path/to/samplers' + 'Sampler_' +
                         dis_index.replace(':', '_'))

    #set the covariates
    if model_table.loc[dis_index]['Covariates'] == 'NULL':
        sampler.SubsetCovariates([])
    elif model_table.loc[dis_index]['Covariates'] != 'ALL':
        sampler.SubsetCovariates(
            model_table.loc[dis_index]['Covariates'].split(','))

    #load the top performing model
    sampler.ConvertToUnconditional()
    bestVLPIModel = vLPI(sampler, max_rank)
    bestVLPIModel.LoadModel('/path/to/models/' + dis_index.replace(':', '_'))


    #set the clinical dataset to only include annotated symptoms
    if model_table.loc[dis_index][[
            'Revised Converged [0.02, 2000]',
            'Revised Increase LR Converged [0.05, 4000]'
    ]].sum() > 0:
        annotated_terms = revised_dis_to_term.loc[dis_index]['HPO_ICD10_ID']
    else:
        annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']

    #set the max rank of the model
    max_rank = model_table.loc[dis_index]['Rank']

    #make a local copy of the clinical data, include only the annotated symptoms
    localClinData = copy.deepcopy(clinData)
    localClinData.IncludeOnly(annotated_terms)

    #read the sampler for the disease from disk
    sampler = ClinicalDatasetSampler(localClinData,
                                     training_data_fraction,
                                     conditionSamplingOnDx=[dis_index],
                                     returnArrays='Torch')
    sampler.ReadFromDisk('path/to/samplers/' + 'Sampler_' +
                         dis_index.replace(':', '_'))

    #set the covariates
    if model_table.loc[dis_index]['Covariates'] == 'NULL':
        sampler.SubsetCovariates([])
    elif model_table.loc[dis_index]['Covariates'] != 'ALL':
        sampler.SubsetCovariates(
            model_table.loc[dis_index]['Covariates'].split(','))

    #load the best latent phenotype model
    sampler.ConvertToUnconditional()
    bestVLPIModel = vLPI(sampler, max_rank)
    bestVLPIModel.LoadModel('path/to/best/models/' +
                            dis_index.replace(':', '_'))

ax.legend(loc='best',frameon=False, fontsize=20)
ax.set_xlabel('Cryptic\n'+r'Phenotype ($\mathbf{Z}$)',fontsize=40,fontweight='bold')
ax.set_ylabel('Density\n'+r'$\log_{10}$-Scale',fontsize=40,fontweight='bold')
plt.savefig('cryptic_pheno_sim.svg')
plt.close()

clinData = ClinicalDataset()
#build arbitrary list of disease codes
disList = list(clinData.dxCodeToDataIndexMap.keys())[0:num_symptoms+1]

# load data into clinical dataset
clinData.IncludeOnly(disList)
clinData.LoadFromArrays(torch.cat([simData['incidence_data'],simData['target_dis_dx'].reshape(-1,1)],axis=1),simData['covariate_data'],[],catCovDicts=None, arrayType = 'Torch')
clinData.ConditionOnDx([disList[-1]])
sampler = ClinicalDatasetSampler(clinData,training_data_fraction,returnArrays='Torch',conditionSamplingOnDx = [disList[-1]])
sampler.ConvertToUnconditional()

vlpiModel= vLPI(sampler,inf_rank)
try:
    vlpiModel.LoadModel('IllustrativeExample.pth')
except FileNotFoundError:
    inference_output = vlpiModel.FitModel(batch_size=200,errorTol=(1.0/num_samples))
    vlpiModel.PackageModel('IllustrativeExample.pth')

inferredCrypticPhenotypes=vlpiModel.ComputeEmbeddings((simData['incidence_data'],simData['covariate_data']))
riskFunction=vlpiModel.ReturnComponents().ravel()
latentPhenos=vlpiModel.ComputeEmbeddings(dataArrays=(clinData.ReturnSparseDataMatrix(clinData.data.index.sort_values()),[])).ravel()

fig, ax = plt.subplots(figsize=(10, 8))
fig.tight_layout(pad=2)
Example #8
    ukbbDataset, ukbb_sampler = LoadUKBBData(training_data_fraction, dis_index)

    annotated_terms_ucsf = model_table.loc[dis_index]['Annotated HPO Terms']
    annotated_terms_ukbb = model_table.loc[dis_index][
        'Annotated HPO Terms UKBB']

    max_rank_ucsf = model_table.loc[dis_index]['UCSF Max. Model Rank']
    max_rank_ukbb = model_table.loc[dis_index]['UKBB Max. Model Rank']

    ucsfDataset_HPO.IncludeOnly(annotated_terms_ucsf)
    ucsfDataset_UKBB.IncludeOnly(annotated_terms_ukbb)
    ukbbDataset.IncludeOnly(annotated_terms_ukbb)

    sampler_hpo = ClinicalDatasetSampler(ucsfDataset_HPO,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
    sampler_hpo.ReadFromDisk('path/to/samplers/UCSF/' + 'Sampler_' +
                             dis_index.replace(':', '_'))

    sampler_ucsf_ukbb = ClinicalDatasetSampler(
        ucsfDataset_UKBB,
        training_data_fraction,
        conditionSamplingOnDx=[dis_index],
        returnArrays='Torch')
    sampler_ucsf_ukbb.ReadFromDisk('path/to/samplers/UCSF/' + 'Sampler_' +
                                   dis_index.replace(':', '_'))

    if model_table.loc[dis_index]['Covariate Set'] == 'NULL':
        sampler_hpo.SubsetCovariates([])
        sampler_ucsf_ukbb.SubsetCovariates([])
Example #9
numLatentPhenotypes=2

## simulate the data
simulator = ClinicalDataSimulator(numberOfSymptoms,numLatentPhenotypes,rareDiseaseFrequency)
simulatedData=simulator.GenerateClinicalData(numberOfSamples)

## load the data into a ClinicalDataset class
clinicalData = ClinicalDataset()

#restrict the dataset to numberOfSymptoms symptoms (it defaults to the full ICD10-CM codebook), renamed to the first letters of the alphabet
allICDCodes = list(clinicalData.dxCodeToDataIndexMap.keys())
symptomConversionMap=dict(zip(allICDCodes[0:numberOfSymptoms],string.ascii_uppercase[0:numberOfSymptoms]))
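#the resulting map looks like {'A00': 'A', 'A00.0': 'B', ...} (illustrative; the exact ICD10 codes depend on codebook order)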
clinicalData.ConstructNewDataArray(symptomConversionMap)

# now load into the data structure. Note: when used this way, it is the user's responsibility to ensure that the input data columns match the data columns of the ClinicalDataset.
clinicalData.LoadFromArrays(simulatedData['incidence_data'],simulatedData['covariate_data'],[],catCovDicts=None, arrayType = 'Torch')

## Now load the ClinicalDataset into ClinicalDatasetSampler
training_data_fraction=0.75
sampler = ClinicalDatasetSampler(clinicalData,training_data_fraction,returnArrays='Torch')

## Instantiate the model
infNumberOfLatentPhenotypes=10
vlpiModel= vLPI(sampler,infNumberOfLatentPhenotypes)

inference_output = vlpiModel.FitModel(batch_size=1000,errorTol=(1.0/numberOfSamples))
vlpiModel.PackageModel('ExampleModel.pth')

inferredCrypticPhenotypes=vlpiModel.ComputeEmbeddings((simulatedData['incidence_data'],simulatedData['covariate_data']))
riskFunction=vlpiModel.ReturnComponents()
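
## A quick, model-agnostic sanity check (a sketch): symptom burden should
## correlate with the inferred cryptic phenotype (up to an arbitrary sign)
import numpy as np
from scipy.stats import spearmanr
symptom_counts = np.array(simulatedData['incidence_data'].sum(axis=1)).ravel()
rho, _ = spearmanr(symptom_counts, inferredCrypticPhenotypes[:, 0])
print('Spearman(symptom burden, component 1): {0:.2f}'.format(rho))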