def _validate_fs(self, feature_selection): if feature_selection not in self.FEATURE_SELECTION: common.terminate( "Choose feature selection from the options: %s (selected fs: %s)." % (self.FEATURE_SELECTION, feature_selection)) if self.use_phenos is None: if feature_selection == 'phenotype': common.terminate( "Must provide a phenotype (--pheno) when selecting feature selection of type 'phenotype'." ) elif feature_selection == 'controls': common.terminate( "Must provide one phenotype (--pheno) in a binary format when selection feature selection of type 'controls'." ) return getattr( self, self.FEATURE_FUNC_NAME_FORMAT.format( feature_option_name=feature_selection))
def validate_args(self, args):
    """Validate EWAS command-line arguments.

    Ensures exactly one phenotype is supplied, that at most one EWAS test
    was selected (defaulting to linear regression when none was), and that
    the Wilcoxon test is not combined with covariates. Terminates the
    program on any violation.
    """
    # Make sure the user chose only one EWAS test (a mutually exclusive
    # group is not supported...).
    # The pheno argument is required for all EWAS tests - it can be supplied
    # through the --pheno flag or a .glint meth data file.
    # So, if the datafile supplied is not a .glint file - pheno must be
    # supplied as a flag.
    if not args.datafile.name.endswith(methylation_data.GLINT_FILE_SUFFIX):
        self.required_args.append('phenofile')

    super(EWASParser, self).validate_args(args)

    if len(args.pheno) > 1:  # 0 is all phenotypes in the data (which could be one)
        common.terminate("Must supply only one phenotype for EWAS")

    # Count how many tests were explicitly selected.
    test_counter = 0
    if args.lmm:
        test_counter += 1
    if args.logreg:
        test_counter += 1
    if args.linreg:
        test_counter += 1
    if args.wilc:
        # Wilcoxon is a rank test on two groups; covariates are not supported.
        if args.covar:
            common.terminate(
                "Wilcoxon test cannot take any covariates. Remove the --covar argument."
            )
        test_counter += 1

    if test_counter > 1:
        common.terminate("Select only one EWAS test.")

    # Add the lmm parser if lmm was chosen.
    if args.lmm:
        self.lmm_parser.validate_args(args)
        self.all_args.extend(self.lmm_parser.all_args)
        self.required_args.extend(self.lmm_parser.required_args)

    # Default test is linear regression.
    if test_counter == 0:
        args.linreg = True
        logging.info(
            "No EWAS test was chosen, using the default linear regression."
        )
def impute(self, min_score, plink_snp_file, plink_geno_file, plink_ind_file,
           min_missing_values):
    """Impute methylation levels from plink SNP data.

    Rules:
      - replace missing values with the mean (unless there are more than
        min_missing_values missing values)
      - remove samples (don't impute them) which have more than
        min_missing_values missing snps (out of all their snps)
      - remove snps (don't impute with them) which have more than
        min_missing_values missing samples (out of all their samples)
      - remove sites with score lower than min_score
      - don't impute sites for which we don't have any snp (if we have at
        least one - impute it)

    Results are stored on self: imputed_samples, imputed_sites_names and
    site_imputation. Terminates the program when nothing is left to impute.
    """
    # First column of the .ind file holds the sample identifiers.
    samples = loadtxt(plink_ind_file, dtype=str, usecols=(0, ))
    number_of_samples = samples.shape[0]
    filename = plink_ind_file.name if type(
        plink_ind_file) == file else plink_ind_file
    logging.info("Found %s samples in the file '%s'." %
                 (number_of_samples, filename))
    if type(plink_ind_file) == file:
        plink_ind_file.close()

    plink_snps_data = loadtxt(plink_snp_file, dtype=str)
    if type(plink_snp_file) == file:
        plink_snp_file.close()

    # Use only snps whose alleles are not the pairs CG or AT - since in those
    # cases we cannot know which strand was tested.
    relevant_snps_indices = self.get_relevant_plink_snp_list(plink_snps_data)

    logging.info("Get number of snp occurences...")
    # Extract the occurrences per sample for each snp from the .geno file.
    # snp_occurrences, relevant_snps_indices, missing_sampels_indices, non_missing_sampels_indices = self.get_snps_occurences(plink_geno_file, relevant_snps_indices, number_of_samples, min_missing_values) # missing samples handling
    snp_occurrences, relevant_snps_indices = self.get_snps_occurences(
        plink_geno_file, relevant_snps_indices, number_of_samples,
        min_missing_values)
    # logging.debug("samples removed %s" % ", ".join(samples[missing_sampels_indices])) # missing samples handling

    # Keep only snps we have information on (those present in
    # self.snps_id_per_name); remove the rest.
    # indices = [i for i,name in enumerate(plink_snps_data[relevant_snps_indices,0]) if name in self.snps_id_per_name]
    relevant_snps_names = []
    relevant_snp_occurrences = []
    for i, name in enumerate(plink_snps_data[relevant_snps_indices, 0]):
        if name in self.snps_id_per_name:
            relevant_snps_names.append(name)
            relevant_snp_occurrences.append(snp_occurrences[i])

    self.imputed_samples = samples  # [non_missing_sampels_indices] # missing samples handling
    number_of_samples = self.imputed_samples.size
    if (number_of_samples == 0):
        common.terminate(
            "All samples were removed. There is nothing to impute. Quitting..."
        )

    # Find sites with score bigger than min_score.
    seterr(
        invalid='ignore'
    )  # to ignore the following line's warning (these warnings are an intentional aspect of numpy)
    relevant_sites_indices = where(self.sites_scores > min_score)[0]

    # This code removes the sites in the list bad_sites_list (list of cpgs) from our data. It wasn't tested.
    # logging.info("remove bad sites..." )
    # assert type(bad_sites_list) == list or type(bad_sites_list) == ndarray
    # assert type(self.sites_name_per_id) == list or type(self.sites_name_per_id) == ndarray
    # bad_sites_indices = where(in1d(self.sites_name_per_id, bad_sites_list))[0]
    # relevant_sites_indices = delete(relevant_sites_indices, bad_sites_indices)

    # Impute.
    logging.info("Impute methylation levels...")
    site_imputation, imputed_sites_ids = self.impute_sites(
        number_of_samples, relevant_snps_names, relevant_snp_occurrences,
        relevant_sites_indices)
    # len() is safe for both an empty list and an empty ndarray, unlike `== []`.
    if len(site_imputation) == 0:  # no sites imputed
        common.terminate("All sites were removed. There is nothing to impute.")

    logging.info("%s sites were imputed for %s samples." %
                 (len(imputed_sites_ids), number_of_samples))
    logging.info(
        "%s sites with score > %s were not imputed due to missing SNPs." %
        (len(relevant_sites_indices) - len(imputed_sites_ids), min_score))
    self.imputed_sites_names = self.sites_name_per_id[imputed_sites_ids]
    self.site_imputation = site_imputation
def _validate_stdth(self, stdth): if stdth > 1 or stdth < 0: common.terminate( "stdth cannot be greater than 1 and lower than 0. stdth = %s." % stdth) return stdth