예제 #1
0
    def _validate_fs(self, feature_selection):
        """Validate the requested feature-selection option and return its handler.

        Terminates the run (via common.terminate) when the option is not one of
        self.FEATURE_SELECTION, or when a phenotype-dependent option is chosen
        without phenotype data (--pheno).

        Returns the bound method whose name is built from
        FEATURE_FUNC_NAME_FORMAT for the chosen option.
        """
        if feature_selection not in self.FEATURE_SELECTION:
            common.terminate(
                "Choose feature selection from the options: %s (selected fs: %s)."
                % (self.FEATURE_SELECTION, feature_selection))

        # Both 'phenotype' and 'controls' selections require phenotype data.
        if self.use_phenos is None:
            if feature_selection == 'phenotype':
                common.terminate(
                    "Must provide a phenotype (--pheno) when selecting feature selection of type 'phenotype'."
                )
            elif feature_selection == 'controls':
                common.terminate(
                    "Must provide one phenotype (--pheno) in a binary format when selecting feature selection of type 'controls'."
                )

        # Resolve the per-option handler method by its conventional name.
        return getattr(
            self,
            self.FEATURE_FUNC_NAME_FORMAT.format(
                feature_option_name=feature_selection))
예제 #2
0
    def validate_args(self, args):
        """Validate EWAS-specific command-line arguments.

        Ensures a phenotype source exists, that exactly one phenotype is used,
        that at most one EWAS test was selected (defaulting to linear
        regression when none was), and that the Wilcoxon test is not combined
        with covariates. Terminates the run (via common.terminate) on any
        violation.
        """
        # A phenotype is required for all EWAS tests; it can come either from
        # the --pheno flag or from a .glint meth data file. So when the data
        # file is not a .glint file, the phenotype file becomes required.
        if not args.datafile.name.endswith(methylation_data.GLINT_FILE_SUFFIX):
            self.required_args.append('phenofile')

        super(EWASParser, self).validate_args(args)

        # len == 0 means "all phenotypes in the data" (which could be one).
        if len(args.pheno
               ) > 1:
            common.terminate("Must supply only one phenotype for EWAS")

        # Make sure the user chose only one EWAS test (a mutually exclusive
        # argparse group is not supported here, so count selections manually).
        test_counter = 0
        if args.lmm:
            test_counter += 1
        if args.logreg:
            test_counter += 1
        if args.linreg:
            test_counter += 1
        if args.wilc:
            if args.covar:
                common.terminate(
                    "Wilcoxon test cannot take any covariates. Remove the --covar argument."
                )
            test_counter += 1
        if test_counter > 1:
            common.terminate("Select only one EWAS test.")

        # Delegate to the LMM sub-parser when LMM was chosen, and adopt its
        # argument lists so downstream checks see them.
        if args.lmm:
            self.lmm_parser.validate_args(args)
            self.all_args.extend(self.lmm_parser.all_args)
            self.required_args.extend(self.lmm_parser.required_args)

        # Default test is linear regression.
        if test_counter == 0:
            args.linreg = True
            logging.info(
                "No EWAS test was chosen, using the default linear regression."
            )
예제 #3
0
    def impute(self, min_score, plink_snp_file, plink_geno_file,
               plink_ind_file, min_missing_values):
        """
        Impute methylation levels from plink genotype data.

        impute with the following rules:
        replace missing values with mean (unless there are more than min_missing_values missing values)
        remove samples (don't impute them) which have more than min_missing_values missing snps (out of all their snps)
        remove snps (don't impute with them) which have more than min_missing_values missing samples (out of all their samples)
        remove sites with score lower than or equal to min_score (a strict '>' filter is used below)
        don't impute sites for which we don't have any snp (if we have at least one - impute it)

        Side effects: sets self.imputed_samples, self.imputed_sites_names and
        self.site_imputation; terminates the run (common.terminate) when
        nothing is left to impute.
        """
        # First column of the plink .ind file holds the sample identifiers.
        samples = loadtxt(plink_ind_file, dtype=str, usecols=(0, ))
        number_of_samples = samples.shape[0]
        # plink_ind_file may be an open (py2) file object or a path string.
        filename = plink_ind_file.name if type(
            plink_ind_file) == file else plink_ind_file
        logging.info("Found %s samples in the file '%s'." %
                     (number_of_samples, filename))

        if type(plink_ind_file) == file:
            plink_ind_file.close()

        plink_snps_data = loadtxt(plink_snp_file, dtype=str)
        if type(plink_snp_file) == file:
            plink_snp_file.close()
        # use only snps whose allele pairs are not CG or AT - in those cases we
        # cannot know which strand was tested
        relevant_snps_indices = self.get_relevant_plink_snp_list(
            plink_snps_data)

        logging.info("Get number of snp occurences...")
        # extract the occurrences per sample for each snp from the .geno file
        # snp_occurrences, relevant_snps_indices, missing_sampels_indices, non_missing_sampels_indices = self.get_snps_occurences(plink_geno_file, relevant_snps_indices, number_of_samples, min_missing_values)  #missing samples handling
        snp_occurrences, relevant_snps_indices = self.get_snps_occurences(
            plink_geno_file, relevant_snps_indices, number_of_samples,
            min_missing_values)
        # logging.debug("samples removed %s" % ", ".join(samples[missing_sampels_indices])) #missing samples handling

        # keep only snps we have information on (i.e. present in
        # self.snps_id_per_name); snp_occurrences is assumed to be aligned
        # index-for-index with relevant_snps_indices - TODO confirm
        # indices = [i  for i,name in enumerate(plink_snps_data[relevant_snps_indices,0]) if name in self.snps_id_per_name]
        relevant_snps_names = []
        relevant_snp_occurrences = []
        for i, name in enumerate(plink_snps_data[relevant_snps_indices, 0]):
            if name in self.snps_id_per_name:
                relevant_snps_names.append(name)
                relevant_snp_occurrences.append(snp_occurrences[i])

        # remove snps that we dont have information on
        self.imputed_samples = samples  #[non_missing_sampels_indices] #missing samples handling
        number_of_samples = self.imputed_samples.size

        if (number_of_samples == 0):
            common.terminate(
                "All samples were removed. There is nothing to impute. Quiting..."
            )

        # find sites with score strictly greater than min_score
        seterr(
            invalid='ignore'
        )  # to ignore the following line warning (These warnings are an intentional aspect of numpy)
        relevant_sites_indices = where(self.sites_scores > min_score)[0]

        # this code removes the sites in the list bad_sites_list (list of cpgs) from our data. it wasn't tested.
        # logging.info("remove bad sites..." )
        # assert type(bad_sites_list) == list or type(bad_sites_list) == ndarray
        # assert type(self.sites_name_per_id) == list or type(self.sites_name_per_id) == ndarray
        # bad_sites_indices = where(in1d(self.sites_name_per_id, bad_sites_list))[0]
        # relevant_sites_indices = delete(relevant_sites_indices, bad_sites_indices)

        # impute
        logging.info("Impute methylation levels...")
        site_imputation, imputed_sites_ids = self.impute_sites(
            number_of_samples, relevant_snps_names, relevant_snp_occurrences,
            relevant_sites_indices)
        if site_imputation == []:  # no sites imputed
            common.terminate(
                "All sites were removed. There is nothing to impute.")
        logging.info("%s sites were imputed for %s samples." %
                     (len(imputed_sites_ids), number_of_samples))
        logging.info(
            "%s sites with score > %s were not imputed due to missing SNPs." %
            (len(relevant_sites_indices) - len(imputed_sites_ids), min_score))
        self.imputed_sites_names = self.sites_name_per_id[imputed_sites_ids]
        self.site_imputation = site_imputation
예제 #4
0
 def _validate_stdth(self, stdth):
     if stdth > 1 or stdth < 0:
         common.terminate(
             "stdth cannot be greater than 1 and lower than 0. stdth = %s."
             % stdth)
     return stdth