def test_exclude_bad_probes(self):
    """Check that ``Refactor._exclude_bad_probes`` drops the listed probes.

    Working on a copy of ``self.meth_data``, three things are verified:

    * the resulting data matrix equals the one loaded from
      ``self.DEMO_DATA_NO_BAD_PROBES``;
    * the remaining site names equal the original names with every bad
      probe removed (compared as ordered lists);
    * ``sites_size`` shrank by the number of bad probes that were actually
      present among the original sites.
    """
    logging.info("Testing removing bad probes...")
    probes_meth_data = self.meth_data.copy()
    data_no_bad_probes = methylation_data.MethylationDataLoader(
        datafile=self.DEMO_DATA_NO_BAD_PROBES)

    # Use a context manager so the probes file is closed deterministically.
    with open(self.BAD_PROBES, 'r') as bad_probes_file:
        bad_probes = load(bad_probes_file)

    module = refactor.Refactor(methylation_data=probes_meth_data,
                               k=5,
                               bad_probes_list=bad_probes)
    module._exclude_bad_probes()
    assert array_equal(data_no_bad_probes.data, module.meth_data.data)

    # tests sites list has changed
    remove_count = len(bad_probes)
    orig_sites_before = list(self.meth_data.cpgnames)
    orig_sites_after = list(module.meth_data.cpgnames)
    for probe in bad_probes:
        try:
            orig_sites_before.remove(probe)
        except ValueError:
            # The probe was not among the original sites, so it must not
            # be counted as a removed site.
            remove_count -= 1
    assert orig_sites_after == orig_sites_before

    # test sites size
    expected_size = self.meth_data.sites_size - remove_count
    assert expected_size == module.meth_data.sites_size
    logging.info("PASS")
def _test_fit_model(self):
    """Compare ``LinearRegression.fit_model`` on one site with stored results.

    The model is fitted for the first row of the loaded data against the
    phenotype, with the loaded covariates.  The fitted values are compared
    with the six numbers read from ``self.LIN_REG_FIT_MODEL`` in this
    order: intercept coefficient, site coefficient, two covariate
    coefficients, t-statistic and p-value (names taken from the original
    local variables).
    """
    logging.info("Testing linear regression: fit_model")
    loaded = methylation_data.MethylationDataLoader(
        datafile=self.LIN_REG_DATA,
        covarfiles=[self.LIN_REG_COVAR],
        phenofile=[self.LIN_REG_PHENO])
    expected = loadtxt(self.LIN_REG_FIT_MODEL)

    # test 1 dim
    phenotype = loaded.phenotype
    first_site = loaded.data[0, :]
    coefs, tstats, pvals = LinearRegression.fit_model(
        phenotype, first_site, covars=loaded.covar)

    # First entry is used as the intercept, last entry as the site term,
    # entries 1 and 2 as the two covariates.
    intercept_coef = coefs[0]
    site_coef = coefs[-1]
    first_covar_coef = coefs[1]
    second_covar_coef = coefs[2]
    site_tstat = tstats[-1]
    site_pval = pvals[-1]

    assert abs(intercept_coef - expected[0]) < 1e-3
    assert abs(site_coef - expected[1]) < 1e-3
    assert abs(first_covar_coef - expected[2]) < 1e-3
    assert abs(second_covar_coef - expected[3]) < 1e-3
    assert abs(site_tstat - expected[4]) < 1e-2
    assert abs(site_pval - expected[5]) < 1e-3

    # Note - there is no option to test 2 dim
    logging.info("PASS")
def test_exclude(self):
    """Check that ``exclude`` produces the data stored in ``self.FAKE_DATA_EXC``.

    ``exclude(self.INC_EXC)`` is applied to a copy of ``self.meth_data`` and
    the resulting data matrix must equal the reference matrix exactly.
    """
    logging.info("Testing test_exclude...")
    expected = methylation_data.MethylationDataLoader(
        datafile=self.FAKE_DATA_EXC)

    working_copy = self.meth_data.copy()
    working_copy.exclude(self.INC_EXC)

    assert array_equal(expected.data, working_copy.data)
    logging.info("PASS")
def __init__(self):
    """Load the LMM test data set and immediately run ``test_pvalues``.

    The loaded data is kept on ``self.meth_data`` for the test to use.
    """
    logging.info("Testing Started on LMMTester")

    datafile = self.DATA
    covarfiles = [self.COVAR]
    phenofile = [self.PHENO]
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile, covarfiles=covarfiles, phenofile=phenofile)

    self.test_pvalues()

    logging.info("Testing Finished on LMMTester")
def test_remove_outliers(self):
    """Check ``exclude_maxpcstds`` against the expected surviving sample ids.

    Data is loaded from ``self.DATA_FILE``, ``exclude_maxpcstds`` is applied
    with two ``[a, b]`` pairs, and the remaining ``samples_ids`` must equal
    the ids read from ``self.SAMPLES_AFTER_MAXPCSTD``.
    """
    logging.info("Test remove outliers")
    data = methylation_data.MethylationDataLoader(datafile=self.DATA_FILE)
    samples_after_maxpcstd = loadtxt(self.SAMPLES_AFTER_MAXPCSTD, dtype=str)

    # NOTE(review): each pair is presumably [pc index, max std] - confirm
    # against exclude_maxpcstds.
    data.exclude_maxpcstds([[1, 3], [2, 3]])

    assert array_equal(data.samples_ids, samples_after_maxpcstd)
    logging.info("PASS")
def __init__(self):
    """Load the regression test data set and run ``_test_fit_model``.

    The loaded data is kept on ``self.meth_data``.
    """
    logging.info("Testing Started on LogisticRegressionTester")

    datafile = self.LIN_REG_DATA
    covarfiles = [self.LIN_REG_COVAR]
    phenofile = [self.LIN_REG_PHENO]
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile, covarfiles=covarfiles, phenofile=phenofile)

    self._test_fit_model()

    logging.info("Testing Finished on LogisticRegressionTester")
def test_remove_lowest_std_sites(self):
    """Check ``remove_lowest_std_sites`` against ``self.FAKE_DATA_STDTH``.

    The threshold ``self.STDTH`` is applied to a copy of ``self.meth_data``
    and the resulting matrix must equal the reference matrix exactly.  A
    second, free-standing check with the same name is then run as well.
    """
    logging.info("Testing stdth...")

    trimmed = self.meth_data.copy()
    trimmed.remove_lowest_std_sites(self.STDTH)
    expected = methylation_data.MethylationDataLoader(
        datafile=self.FAKE_DATA_STDTH)
    assert array_equal(trimmed.data, expected.data)

    # A bare name inside a method is not looked up on the class, so this
    # is not a recursive call to this method.
    test_remove_lowest_std_sites()

    logging.info("PASS")
def test_remove_lowest_std_sites():
    """Check ``remove_lowest_std_sites(0.02)`` row by row against stored results.

    Data is loaded from ``DATA_STDTH`` and filtered with a threshold of
    0.02.  Each row of the matrix read from ``STDTH_RES`` must then satisfy
    ``tools.correlation`` with the matching row of the filtered data, with
    ``1e-12`` passed as the third argument.
    """
    logging.info("Testing stdth again...")

    loaded = methylation_data.MethylationDataLoader(datafile=DATA_STDTH)
    loaded.remove_lowest_std_sites(0.02)

    expected = loadtxt(STDTH_RES)
    row_count = expected.shape[0]
    for row in range(row_count):
        assert tools.correlation(expected[row, :], loaded.data[row, :], 1e-12)

    logging.info("PASS")
def test_add_covariates(self):
    """Check that covariates added in two steps match those loaded at once.

    A loader built with ``self.FAKE_COVAR_PART1`` and then extended with
    ``self.FAKE_COVAR_PART2`` through ``add_covar_files`` must end up with
    the same ``covar`` as a copy of ``self.meth_data``.
    """
    logging.info("Testing add covar...")
    reference = self.meth_data.copy()

    incremental = methylation_data.MethylationDataLoader(
        datafile=self.FAKE_DATA,
        covarfiles=[self.FAKE_COVAR_PART1])
    incremental.add_covar_files([self.FAKE_COVAR_PART2])

    assert array_equal(incremental.covar, reference.covar)
    logging.info("PASS")
def run(self, args):
    """Load the methylation data and the site/sample id lists named by ``args``.

    When the name of ``args.datafile`` ends with
    ``methylation_data.GLINT_FILE_SUFFIX`` the file content is decoded as
    JSON into a ``MethylationData`` object, and any supplied phenotype or
    covariate files are then added to it.  Otherwise the data, phenotype
    and covariate files are handed to ``MethylationDataLoader``.

    Afterwards ``include_list``/``exclude_list`` are validated against the
    loaded ``cpgnames`` and ``keep_list``/``remove_list`` against the loaded
    ``samples_ids``; each list stays empty when its argument is ``None``.

    Args:
        args: object providing ``datafile`` (with ``name`` and ``read()``),
            ``phenofile``, ``covarfile``, ``include``, ``exclude``, ``keep``
            and ``remove``.

    Raises:
        Exception: any error raised while loading is logged and re-raised.
    """
    try:
        self.args = args
        self.module = None

        datafile = args.datafile
        is_glint_file = datafile.name.endswith(
            methylation_data.GLINT_FILE_SUFFIX)

        if not is_glint_file:
            self.module = methylation_data.MethylationDataLoader(
                datafile=args.datafile,
                phenofile=args.phenofile,
                covarfiles=args.covarfile)
        else:
            logging.info("Loading glint file %s..." % datafile.name)
            started = time()
            raw_text = datafile.read()
            decoder = JSONDecoder(
                object_hook=methylation_data.json_numpy_obj_hook)
            fields = decoder.decode(raw_text)

            # Positional constructor arguments, in the order expected by
            # MethylationData.
            field_order = ('data', 'samples_ids', 'cpgnames', 'phenotype',
                           'covar', 'covarnames', 'phenonames',
                           'title_indexes')
            self.module = methylation_data.MethylationData(
                *[fields[key] for key in field_order])

            elapsed = time() - started
            logging.debug("Loading binary data took %s seconds" % elapsed)
            logging.debug(
                "Got methylation data with %s sites and %s samples id"
                % (self.module.sites_size, self.module.samples_size))

            # if phenotype or covariates supplied with metylation data,
            # replace module covar and pheno file with new ones
            if args.phenofile is not None:
                self.module.add_pheno_files(args.phenofile)
            if args.covarfile is not None:
                self.module.add_covar_files(args.covarfile)

        # load remove/keep sites/samples files and remove/keep values
        self.include_list = []
        self.exclude_list = []
        self.remove_list = []
        self.keep_list = []

        load_ids = self._load_and_validate_ids_in_file
        if args.include is not None:
            self.include_list = load_ids(args.include, self.module.cpgnames)
        if args.exclude is not None:
            self.exclude_list = load_ids(args.exclude, self.module.cpgnames)
        if args.keep is not None:
            self.keep_list = load_ids(args.keep, self.module.samples_ids)
        if args.remove is not None:
            self.remove_list = load_ids(args.remove, self.module.samples_ids)
    except Exception:
        logging.exception("in methylation data")
        raise
def test_remove(self):
    """Check that ``remove`` yields the stored post-removal data set.

    ``remove(self.KEEP_REMOVE_INDICES)`` is applied to a copy of
    ``self.meth_data``; its data, phenotype and covariates must equal those
    loaded from the ``FAKE_*_REMOVE`` reference files.
    """
    logging.info("Testing remove...")
    expected = methylation_data.MethylationDataLoader(
        datafile=self.FAKE_DATA_REMOVE,
        covarfiles=[self.FAKE_COVAR_REMOVE],
        phenofile=[self.FAKE_PHENO_REMOVE])

    actual = self.meth_data.copy()
    actual.remove(self.KEEP_REMOVE_INDICES)

    assert array_equal(expected.data, actual.data)
    assert array_equal(expected.phenotype, actual.phenotype)
    assert array_equal(expected.covar, actual.covar)
    logging.info("PASS")
def __init__(self):
    """Load the small demo data set and run the Refactor tests in order.

    The loaded data is kept on ``self.meth_data``; the tests run are
    ``test_remove_covariates``, ``test_low_rank_approx_distances`` and
    ``test_exclude_bad_probes``.
    """
    logging.info("Testing Started on RefactorTester")
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=self.DEMO_SMALL_DATA,
        covarfiles=[self.DEMO_COVAR],
        phenofile=[self.DEMO_PHENO])
    self.test_remove_covariates()
    self.test_low_rank_approx_distances()
    self.test_exclude_bad_probes()
    logging.info("Testing Finished on RefactorTester")
def _test_fit_model(self):
    """Compare ``tools.wilcoxon_test`` on the first site with stored results.

    The z-statistic must be within ``1e-2`` of ``self.Z_STATS_RES`` and the
    p-value within ``1e-3`` of ``self.P_VAL_RES``.
    """
    # todo not working
    logging.info("Testing Wilcoxon")
    loaded = methylation_data.MethylationDataLoader(
        datafile=self.DATA,
        covarfiles=[self.COVAR],
        phenofile=[self.PHENO])

    # a binary vector (phenotype)
    phenotype = loaded.phenotype
    # site under test - with 0 just the first site
    site_values = loaded.data[0, :]

    zstats, pval = tools.wilcoxon_test(phenotype, site_values)

    assert abs(zstats - self.Z_STATS_RES) < 1e-2
    assert abs(pval - self.P_VAL_RES) < 1e-3
    logging.info("PASS")
def __init__(self):
    """Run the PCA check: compare ``pca.PCA(...).P`` with stored results.

    PCA is computed on the transposed data loaded from ``self.DATA_FILE``.
    Each of the first ten columns of ``P`` must satisfy
    ``tools.correlation`` with the matching column read from
    ``self.PCA_P_RES``.
    """
    logging.info("Testing Started on PCATester")

    expected_p = loadtxt(self.PCA_P_RES)
    loaded = methylation_data.MethylationDataLoader(datafile=self.DATA_FILE)
    pca_result = pca.PCA(loaded.data.transpose())

    for column in range(10):
        assert tools.correlation(pca_result.P[:, column],
                                 expected_p[:, column])

    logging.info("PASS")
    logging.info("Testing Finished on PCATester")
def test_upload_new_files(self):
    """Check that uploading covariate/phenotype files matches ``remove``.

    One data set is a copy of ``self.meth_data`` with
    ``remove(self.KEEP_REMOVE_INDICES)`` applied.  The other is loaded from
    ``self.FAKE_DATA_REMOVE`` and then given the ``FAKE_COVAR_REMOVE`` and
    ``FAKE_PHENO_REMOVE`` files through the upload methods.  Data,
    phenotype and covariates of the two must be equal.
    """
    logging.info("Testing upload new covaritates and phenotype files...")

    reduced = self.meth_data.copy()
    uploaded = methylation_data.MethylationDataLoader(
        datafile=self.FAKE_DATA_REMOVE)

    reduced.remove(self.KEEP_REMOVE_INDICES)
    uploaded.upload_new_covaritates_files([self.FAKE_COVAR_REMOVE])
    uploaded.upload_new_phenotype_file([self.FAKE_PHENO_REMOVE])

    assert array_equal(reduced.data, uploaded.data)
    assert array_equal(reduced.phenotype, uploaded.phenotype)
    assert array_equal(reduced.covar, uploaded.covar)
    logging.info("PASS")
def __init__(self):
    """Collect the bad-probe ids, load the data set and run the scenarios.

    Every file in ``refactor_parser.BAD_PROBES_FILES`` is read as strings
    and merged into one de-duplicated list stored on ``self.bad_probes``.
    The data set is stored on ``self.meth_data`` and the four scenario
    tests are then run in order.
    """
    logging.info("Testing Started on SenariosTester")

    # A plain loop rather than a comprehension: update() is called only
    # for its side effect on the set.
    bad_probes = set()
    for probes_file in refactor_parser.BAD_PROBES_FILES:
        bad_probes.update(loadtxt(probes_file, dtype=str))
    self.bad_probes = list(bad_probes)

    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=self.DATA,
        covarfiles=[self.COVAR],
        phenofile=[self.PHENO])
    self.test_senario1()
    self.test_senario2()
    self.test_senario3()
    self.test_senario4()
    logging.info("Testing Finished on SenariosTester")
def __init__(self):
    """Load the fake data set and run every DataTester check in fixed order.

    The loaded data is kept on ``self.meth_data``; each test is invoked
    one after the other, so the first failing test stops the run.
    """
    logging.info("Testing Started on DataTester")

    datafile = self.FAKE_DATA
    covarfiles = [self.FAKE_COVAR]
    phenofile = [self.FAKE_PHENO]
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile, covarfiles=covarfiles, phenofile=phenofile)

    # site statistics
    self.test_remove_lowest_std_sites()
    self.test_get_mean_per_site()

    # site / sample selection
    self.test_include()
    self.test_exclude()
    self.test_keep()
    self.test_remove()
    self.test_exclude_sites_with_low_mean()
    self.test_exclude_sites_with_high_mean()

    # covariate and phenotype handling
    self.test_upload_new_files()
    self.test_load_and_validate_covar()
    self.test_load_and_validate_phenotype()
    self.test_add_covariates()

    # failure cases and outlier removal
    self.test_fail_exclude()
    self.test_fail_remove()
    self.test_remove_outliers()

    logging.info("Testing Finished on DataTester")
def __init__(self):
    """Load the data from ``self.DATA`` and run ``test_components``.

    The loaded data is kept on ``self.meth_data``.
    """
    logging.info("Testing Started on HousemanTester")

    datafile = self.DATA
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile)

    self.test_components()

    logging.info("Testing Finished on HousemanTester")
def __init__(self):
    """Load the data from ``self.DATA_FILE`` and run the tools tests.

    The loaded data is kept on ``self.meth_data``; ``test_low_rank_approx``
    runs first, then ``test_euclidean_distance``.
    """
    logging.info("Testing Started ToolsTester")

    datafile = self.DATA_FILE
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile)

    self.test_low_rank_approx()
    self.test_euclidean_distance()

    logging.info("Testing Finished ToolsTester")
def __init__(self):
    """Load the Wilcoxon test data set and run ``_test_fit_model``.

    The loaded data is kept on ``self.meth_data``.
    """
    logging.info("Testing Started on WilcoxonTester")

    datafile = self.DATA
    covarfiles = [self.COVAR]
    phenofile = [self.PHENO]
    self.meth_data = methylation_data.MethylationDataLoader(
        datafile=datafile, covarfiles=covarfiles, phenofile=phenofile)

    self._test_fit_model()

    logging.info("Testing Finished on WilcoxonTester")