Exemplo n.º 1
0
 def get_logreg_params(self, pval_thresh=5e-2, hq_mods=None):
     """Collect significant PCA components from each high-quality sample's
     logistic-regression assessment.

     Parameters
     ----------
     pval_thresh : float
         Keep components whose p-value is strictly below this threshold
         (default 5e-2).
     hq_mods : pandas.Series, optional
         High-quality models to use; when omitted, ``self.hq_models`` must
         already be populated (e.g. by ``get_hq_samples``). Passing a
         non-empty value replaces ``self.hq_models``.

     Returns
     -------
     dict or None
         Mapping sampled-map id -> {component: {"p_val", "coef",
         "pca_load"}}, also stored on ``self.signif_pca``. ``None`` when no
         high-quality models are available.
     """
     # BUGFIX: the original used a mutable default argument
     # (hq_mods=pd.DataFrame()); an empty frame now stands in for "absent".
     if hq_mods is None:
         hq_mods = pd.DataFrame()
     if self.hq_models.empty and hq_mods.empty:
         print("Give list of models or run -get_hq_samples")
         return None
     elif not hq_mods.empty:
         self.hq_models = hq_mods

     signif_pca_comps = {}
     # Phenotype id is everything after the first "_" of the series name.
     pheno_id = "_".join(self.hq_models.name.split("_")[1:])
     for sampled_map_num, ic_val in self.hq_models.items():
         landscape_sample_num = sampled_map_num.split("_")[-1]
         sample_id = "sample_"+landscape_sample_num+"_map_assess.json"
         landscape_assess_sample_file = self.assess_file_loc+sample_id
         if path.exists(landscape_assess_sample_file):
             landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
             # "nan" p-values are mapped to 1.0 (never significant) before
             # the stringified dict is parsed.
             pval_dict = ast.literal_eval(
                 landscape_assess_sample["p_values_"+pheno_id].replace("nan", "1.0"))
             coef_dict = ast.literal_eval(landscape_assess_sample["coefs_"+pheno_id])
             comp_dict = ast.literal_eval(landscape_assess_sample["PCA_comp_dict_"+pheno_id])
             # Re-key components to the regression variable names (x1, x2, ...).
             comp_dict = {"x"+str(k+1): v for k, v in comp_dict.items()}
             signif_pca_comps[sampled_map_num] = {}
             for pca_comp, p_val in pval_dict.items():
                 # The regression constant is excluded from the results.
                 if p_val < pval_thresh and pca_comp != "const":
                     signif_pca_comps[sampled_map_num][pca_comp] = {
                         "p_val": p_val,
                         "coef": coef_dict[pca_comp],
                         "pca_load": comp_dict[pca_comp],
                     }
     self.signif_pca = signif_pca_comps
     return signif_pca_comps
Exemplo n.º 2
0
def load_samples_assess_df(ENSEMBLE_MAP_ASSESS, pheno_list):
    """Load per-sample assessment JSONs into a DataFrame of AIC/BIC scores.

    Parameters
    ----------
    ENSEMBLE_MAP_ASSESS : str
        Directory containing the per-sample ``sample_*`` assessment files.
    pheno_list : list of str
        Phenotype ids; for each one, the ``AIC_<pheno>`` and ``BIC_<pheno>``
        keys are extracted when present in a sample's JSON.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sampled_map_<num>``, one column per extracted key.
    """
    ### -------------- LOAD 2 -----------------
    print("...loading SAMPLES_ASSESS_DF to identify minimum BIC or AIC MNCs")
    onlyfiles = [
        f for f in listdir(ENSEMBLE_MAP_ASSESS)
        if os.path.isfile(os.path.join(ENSEMBLE_MAP_ASSESS, f))
    ]
    # Drop macOS Finder metadata, then keep only sample assessment files.
    onlyfiles = [f for f in onlyfiles if f != ".DS_Store"]
    samplesAfter = [f for f in onlyfiles if "sample_" in f]

    wanted_keys = []
    ### Options for what we want in SAMPLES_ASSESS_DF
    for pheno_id in pheno_list:
        wanted_keys.extend(["AIC_" + pheno_id, "BIC_" + pheno_id])

    SAMPLES_ASSESS_DF = {}
    for landscape_sample_name in tqdm(samplesAfter):
        # File names look like "sample_<num>_..."; rows are keyed by <num>.
        landscape_sample_num = landscape_sample_name.split("_")[1]
        sample_id = "sampled_map_" + str(landscape_sample_num)
        # BUGFIX: join with os.path.join instead of "+" so a missing
        # trailing slash on ENSEMBLE_MAP_ASSESS cannot produce a bad path.
        landscape_assess_sample_file = os.path.join(ENSEMBLE_MAP_ASSESS,
                                                    landscape_sample_name)

        if os.path.exists(landscape_assess_sample_file):
            landscape_assess_sample = load_json_obj(
                landscape_assess_sample_file)
            # Keep only the wanted keys that this sample actually reports.
            SAMPLES_ASSESS_DF[sample_id] = {
                k: landscape_assess_sample[k]
                for k in wanted_keys if k in landscape_assess_sample
            }

    # transform to pandas dataframe
    SAMPLES_ASSESS_DF = pd.DataFrame.from_dict(SAMPLES_ASSESS_DF,
                                               orient="index")
    print("\t... SAMPLES_ASSESS_DF shape: (samples: %d, assess_cols: %d)" %
          (SAMPLES_ASSESS_DF.shape[0], SAMPLES_ASSESS_DF.shape[1]))
    return SAMPLES_ASSESS_DF
Exemplo n.º 3
0
def get_sample_constraints(variant_dec_file):
    """Load the allele-constraint map for a particular model sample.

    Returns an empty dict (after printing a warning) when the file is absent.
    """
    # Guard clause: bail out early when the constraint file is missing.
    if not path.exists(variant_dec_file):
        print("variant_dec_dict does not exist: ", variant_dec_file)
        return {}
    return load_json_obj(variant_dec_file)
Exemplo n.º 4
0
def load_landscape_sample(fva_landscape_file):
    """Load the popFVA landscape for a particular model sample.

    Flattens the nested {strain: {rxn: {"maximum", "minimum"}}} JSON into a
    DataFrame with one row per strain and "<rxn>_max"/"<rxn>_min" columns.
    """
    landscape = load_json_obj(fva_landscape_file)
    flat_rows = {}
    for strain_id, rxn_bounds in landscape.items():
        row = {}
        for rxn, bounds in rxn_bounds.items():
            # Round-trip through a fixed 10-decimal format, as upstream does.
            row[rxn + "_max"] = float(format(bounds["maximum"], '.10f'))
            row[rxn + "_min"] = float(format(bounds["minimum"], '.10f'))
        flat_rows[strain_id] = row
    return pd.DataFrame.from_dict(flat_rows, orient="index")
Exemplo n.º 5
0
    def load_ensemble_data(self, STRAIN_NUM=375, ALLELE_NUM=237, ACTION_NUM=4, ADD_NO_CHANGE=False,
                           pheno_list=None,
                           STANDARDIZE=False, FILTER_RXN_DIR=False, test_set=True):
        """ Loads in the data describing a particular ensemble.

        Parameters
        ----------
        STRAIN_NUM, ALLELE_NUM, ACTION_NUM : int
            Identify the ensemble directory "ens_strains<S>_alleles<A>_actions<N>".
        ADD_NO_CHANGE : bool
            Stored on ``self.add_no_change`` for downstream use.
        pheno_list : list of str, optional
            AMR phenotypes to load; defaults to the standard 8-drug panel.
        STANDARDIZE, FILTER_RXN_DIR : bool
            Select which assessment/ANOVA subdirectories are read.
        test_set : bool
            When True, only the first 20 samples are loaded (small test files).

        Raises
        ------
        ValueError
            If the ensemble directory or its popfva_samples subdirectory
            does not exist.

        Populates: flux_samples, base_cobra_model, x_allele_dict,
        y_pheno_dict, pheno_list, assess_df, anova_dict, constraint_df.
        """
        # BUGFIX: avoid a mutable default argument for pheno_list.
        if pheno_list is None:
            pheno_list = ["ethambutol", "isoniazid", "rifampicin", "4-aminosalicylic_acid",
                          "pyrazinamide", "ethionamide", "ofloxacin", "cycloserine"]
        self.action_num = ACTION_NUM
        self.add_no_change = ADD_NO_CHANGE
        ENSEMBLE_DIR = "ens_strains"+str(STRAIN_NUM)+"_alleles"+str(ALLELE_NUM)+"_actions"+str(ACTION_NUM)
        if not path.exists(ENSEMBLE_DIR):
            # BUGFIX: original wrote `'...' %s (ENSEMBLE_DIR)`, which parses as a
            # call to an undefined name `s` and raised NameError, never ValueError.
            raise ValueError('\t... directory "%s" does not exist' % (ENSEMBLE_DIR))
        else:
            print("dir ensemble: %s" % (ENSEMBLE_DIR))

        POPFVA_SAMPLES_DIR = ENSEMBLE_DIR+"/popfva_samples/"
        if not path.exists(POPFVA_SAMPLES_DIR):
            print('\t... directory "%s" does not exist' % (POPFVA_SAMPLES_DIR))
            # BUGFIX: fix the `%s (...)` typo and report the directory that is
            # actually missing (POPFVA_SAMPLES_DIR, not ENSEMBLE_DIR).
            raise ValueError('\t... directory "%s" does not exist' % (POPFVA_SAMPLES_DIR))
        else:
            print("dir popfva samples: %s" % (POPFVA_SAMPLES_DIR))
        self.popfva_file_loc = POPFVA_SAMPLES_DIR

        ### Create folders to save different types of sample assessments
        if STANDARDIZE:
            ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR+"/popfva_assessment_std/"
            ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR+"/popfva_anova_std/" ### Save ANOVA F-test enrichments.
        else:
            ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR+"/popfva_assessment/"
            ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR+"/popfva_anova/"

        # FILTER_RXN_DIR takes precedence over the STANDARDIZE choice above.
        if FILTER_RXN_DIR:
            ENSEMBLE_MAP_ASSESS = ENSEMBLE_DIR+"/popfva_assessment_rxnfilt_std"+str(STANDARDIZE)+"/"
            ENSEMBLE_MAP_ANOVA = ENSEMBLE_DIR+"/popfva_anova_rxnfilt_std"+str(STANDARDIZE)+"/"

        self.assess_file_loc = ENSEMBLE_MAP_ASSESS
        ENSEMBLE_MAP_COMPRESS = ENSEMBLE_DIR+"/popfva_compress/" ### Save numpy array versions of landscapes

        ### -------------- LOAD 1 -----------------
        print("(1) load COBRA_MODEL, base_flux_samples, pheno_to_data2d_dict, pheno_to_Y_dict ...")

        MODEL_SAMPLES_FILE = ENSEMBLE_DIR+"/"+"base_flux_samples.csv"
        base_flux_samples = pd.read_csv(MODEL_SAMPLES_FILE, index_col=0)
        self.flux_samples = base_flux_samples

        ENSEMBLE_BASEMODEL_FILE = ENSEMBLE_DIR+"/base_cobra_model.json"
        COBRA_MODEL = load_json_model(ENSEMBLE_BASEMODEL_FILE)
        self.base_cobra_model = COBRA_MODEL

        ### Load in the genetic variant matrix and AMR phenotypes for each case.
        pheno_to_data2d_dict = {}
        pheno_to_Y_dict = {}
        ALLELE_PHENO_FILE = ENSEMBLE_DIR+"/allele_pheno_data/"
        for pheno_id in pheno_list:
            G_VARIANT_MATRIX_FILE = ALLELE_PHENO_FILE+"/allele_df_"+pheno_id+".csv"
            PHENO_MATRIX_FILE = ALLELE_PHENO_FILE+"/pheno_df_"+pheno_id+".csv"
            pheno_to_data2d_dict.update({pheno_id: pd.read_csv(G_VARIANT_MATRIX_FILE, index_col=0)})
            # Select the single phenotype column so the value is a Series.
            pheno_to_Y_dict.update({pheno_id: pd.read_csv(PHENO_MATRIX_FILE, index_col=0)[pheno_id]})

        self.x_allele_dict = pheno_to_data2d_dict
        self.y_pheno_dict = pheno_to_Y_dict
        self.pheno_list = pheno_list

        ### -------------- LOAD 2 -----------------
        print("(2) load SAMPLES_ASSESS_DF ...")
        onlyfiles = [f for f in listdir(ENSEMBLE_MAP_ASSESS) if path.isfile(path.join(ENSEMBLE_MAP_ASSESS, f))]
        onlyfiles = [f for f in onlyfiles if f != ".DS_Store"]
        if test_set:
            samplesAfter = [f for f in onlyfiles if "sample_" in f][:20]  # only get 20 samples so files are small
        else:
            samplesAfter = [f for f in onlyfiles if "sample_" in f]

        wanted_keys = []
        ### Options for what we want in SAMPLES_ASSESS_DF are as follows... (look in 02_ass_ensemble.py for more info)
        ### "AIC_", "BIC_", "prsquared_", "loglikelihood_", "LLR_pval_", "p_values_", "coefs_", "std_err_", "PCA_comp_dict_"
        for pheno_id in pheno_list:
            wanted_keys.extend(["AIC_"+pheno_id, "BIC_"+pheno_id, "prsquared_"+pheno_id, "std_err_"+pheno_id,
                                "loglikelihood_"+pheno_id, "LLR_pval_"+pheno_id, "cv_score_mean_"+pheno_id])

        SAMPLES_ASSESS_DF = {}
        for landscape_sample_name in tqdm(samplesAfter):
            landscape_sample_num = landscape_sample_name.split("_")[1]
            sample_id = "sampled_map_"+str(landscape_sample_num)
            landscape_assess_sample_file = ENSEMBLE_MAP_ASSESS+landscape_sample_name

            if path.exists(landscape_assess_sample_file):
                landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
                SAMPLES_ASSESS_DF[sample_id] = {}
                SAMPLES_ASSESS_DF[sample_id].update(dict((k, landscape_assess_sample[k]) for k in wanted_keys if k in landscape_assess_sample))

        # transform to pandas dataframe
        SAMPLES_ASSESS_DF = pd.DataFrame.from_dict(SAMPLES_ASSESS_DF, orient="index")
        print("\t... SAMPLES_ASSESS_DF shape: (samples: %d, assess_cols: %d)" % (SAMPLES_ASSESS_DF.shape[0], SAMPLES_ASSESS_DF.shape[1]))
        self.assess_df = SAMPLES_ASSESS_DF

        ### -------------- LOAD 3 -----------------
        print("(3) load SAMPLES_ANOVA_DICT ...")
        SAMPLES_ANOVA_DF = {}
        for pheno_id in pheno_list:
            SAMPLES_ANOVA_DF[pheno_id] = {}

        for landscape_sample_name in tqdm(samplesAfter[:]):
            landscape_sample_num = landscape_sample_name.split("_")[1]
            sample_id = "sample_"+landscape_sample_num+"_map_anova.json"
            landscape_anova_sample_file = ENSEMBLE_MAP_ANOVA+sample_id

            if path.exists(landscape_anova_sample_file):
                landscape_anova_sample = load_json_obj(landscape_anova_sample_file)

                for pheno_id in pheno_list:
                    SAMPLES_ANOVA_DF[pheno_id]["sampled_map_"+landscape_sample_num] = {}
                    SAMPLES_ANOVA_DF[pheno_id]["sampled_map_"+landscape_sample_num].update(landscape_anova_sample[pheno_id]["pVal"])

        print("\t... generating SAMPLES_ANOVA_DICT")
        SAMPLES_ANOVA_DICT = {}
        for pheno_id in tqdm(pheno_list):
            SAMPLES_ANOVA_DICT.update({pheno_id: pd.DataFrame.from_dict(SAMPLES_ANOVA_DF[pheno_id], orient="index")})

        self.anova_dict = SAMPLES_ANOVA_DICT

        ### -------------- LOAD 4 -----------------
        print("(4) load SAMPLES_AC_DF ...")
        allele_col_ids = [x for x in pheno_to_data2d_dict[pheno_list[0]].columns]

        SAMPLES_AC_DF = {}
        for landscape_sample_name in tqdm(samplesAfter):
            landscape_sample_num = landscape_sample_name.split("_")[1]
            sample_id = "sampled_map_"+str(landscape_sample_num)
            landscape_assess_sample_file = ENSEMBLE_MAP_ASSESS+landscape_sample_name

            if path.exists(landscape_assess_sample_file):
                landscape_assess_sample = load_json_obj(landscape_assess_sample_file)
                SAMPLES_AC_DF[sample_id] = {}
                SAMPLES_AC_DF[sample_id].update(dict((k, landscape_assess_sample[k]) for k in allele_col_ids if k in landscape_assess_sample))

        SAMPLES_AC_DF = pd.DataFrame.from_dict(SAMPLES_AC_DF, orient="index")
        print("\t... SAMPLES_AC_DF shape: (samples: %d, assess_cols: %d)" % (SAMPLES_AC_DF.shape[0], SAMPLES_AC_DF.shape[1]))
        self.constraint_df = SAMPLES_AC_DF
    print('\t... creating ensemble directory:' + ENSEMBLE_DIR + "/")
    os.makedirs(ENSEMBLE_DIR + "/")
    print('\t... saving parameters to ensemble directory')
    with open(ENSEMBLE_DIR + '/mnc_ensemble_args.txt', 'w') as f:
        f.write('\n'.join(sys.argv[1:]))
    ### save to json
    args_dict = {
        "action_num": args.action_num,
        'nabound': args.add_na_bound,
        'popFVA_STANDARDIZE': args.popFVA_STANDARDIZE,
        'testsize': args.testsize
    }
    save_json_obj(args_dict, ENSEMBLE_DIR + "/mnc_ensemble_args.json")
else:
    exit_script = False
    args_dict = load_json_obj(ENSEMBLE_DIR + "/mnc_ensemble_args.json")

    if str(args_dict["nabound"]) != str(args.add_na_bound):
        print("--nabound argument is different!")
        exit_script = True

    if args_dict["action_num"] != args.action_num:
        print("--action_num argument is different!")
        exit_script = True

    if args_dict["popFVA_STANDARDIZE"] != args.popFVA_STANDARDIZE:
        print("--popfvascale argument is different!",
              args_dict["popFVA_STANDARDIZE"], str(args.popFVA_STANDARDIZE))
        exit_script = True

    if args_dict["testsize"] != args.testsize:
Exemplo n.º 7
0
    '--bicthresh',
    type=int,
    dest='bic_threshold',
    default=10,
    help=
    'Delta BIC determines the set of high quality models to perform analysis on. (default: 10). See Burnham and Anderson 2002 book on model selection for further information'
)

### load args
args = parser.parse_args()

# Ensemble location and evaluation settings supplied on the command line.
ENSEMBLE_DIR = args.mnc_dir
TESTSET = args.train_test
BIC_cutoff = args.bic_threshold

# Recover the parameters the ensemble was generated with, so this analysis
# run matches the original generation settings.
ensemble_args_dict = load_json_obj(ENSEMBLE_DIR + "/mnc_ensemble_args.json")
action_num = ensemble_args_dict["action_num"]  # 4
ADD_NA_BOUND = ensemble_args_dict["nabound"]  # False
STANDARDIZE_ = ensemble_args_dict["popFVA_STANDARDIZE"]  # False
print("action_num (%d), nabound (%s), standardize (%s)" %
      (action_num, str(ADD_NA_BOUND), str(STANDARDIZE_)))

# Analysis knobs used downstream (not visible here). The permissive
# pval_threshold=1.0 / load_threshold=0.0 presumably disable filtering —
# TODO confirm against the code that consumes them.
SCALE_POPFVA_ = True
pval_threshold = 1.0
load_threshold = 0.0
fdr_correction = False
save_data = True

#### IMPORTANT - top_models = pd.read_csv(ENSEMBLE_DIR+"/tables/best_mncs_"+pheno_id+".csv",index_col=0) will fail if best_mncs_ file is not generated before~!
#### write code for getting list of best MNCs for each phenotype
if TESTSET == False: