def test_one_sample(self, a):
    """Measuring coverage for one sample produces a one-column raw matrix."""
    matrix_file = get_this_file_or_timestamped(
        os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
    # Drop the pre-existing matrix so the call has to regenerate it
    os.remove(matrix_file)
    a.measure_coverage(samples=a.samples[:1])
    matrix_file = get_this_file_or_timestamped(matrix_file)
    assert file_exists_and_not_empty(matrix_file)
    matrix = pd.read_csv(matrix_file, index_col=0)
    # One sample in -> one column out
    assert matrix.shape[1] == 1
def test_missing_input_with_permissive(self, a):
    """With permissive=True a sample whose BAM is missing is skipped,
    leaving a matrix with only the remaining sample's column."""
    matrix_file = get_this_file_or_timestamped(
        os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
    os.remove(matrix_file)
    # Remove the first sample's input so it cannot be measured
    os.remove(a.samples[0].aligned_filtered_bam)
    a.measure_coverage(samples=a.samples[:2], permissive=True)
    matrix_file = get_this_file_or_timestamped(matrix_file)
    assert file_exists_and_not_empty(matrix_file)
    matrix = pd.read_csv(matrix_file, index_col=0)
    # Two requested, one missing -> a single column remains
    assert matrix.shape[1] == 1
def test_distributed(self, a):
    """Distributed coverage measurement writes per-sample job files."""
    matrix_file = get_this_file_or_timestamped(
        os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
    os.remove(matrix_file)
    a.measure_coverage(distributed=True, computing_configuration="localhost")
    # Check job files for each sample exist
    job_files = [
        os.path.join(
            s.sample_root, "coverage",
            s.name + ".peak_set_coverage." + end)
        for s in a.samples
        for end in ("sh", "bed")
    ]
    assert all(file_exists_and_not_empty(p) for p in job_files)
    # # has to be done separately for log files because they'll empty
    # # just check for existence
    log_files = [
        os.path.join(
            s.sample_root, "coverage",
            s.name + ".peak_set_coverage." + "log")
        for s in a.samples
    ]
    assert all(os.path.exists(p) for p in log_files)
def test_remove_factor(atac_analysis_many_factors):
    """Removing the most PC1-associated factor from the matrix should make
    its association with early principal components non-significant, both
    with and without accounting for the remaining covariates."""
    import pandas as pd

    a = atac_analysis_many_factors
    a.matrix_norm = a.matrix_norm.dropna()
    prefix = os.path.join(
        a.results_dir,
        "unsupervised_analysis_{}".format(a.data_type),
        a.name)

    # inspect
    a.unsupervised_analysis(output_prefix="before", steps=["pca_association"])
    assoc_file = prefix + ".before.pca.variable_principle_components_association.csv"
    assoc = pd.read_csv(get_this_file_or_timestamped(assoc_file))

    # extract the name of the factor with highest contribution
    pc1 = assoc.query("pc == 1")
    factor = assoc.iloc[pc1["p_value"].idxmin()]["attribute"]
    # check if it's significant
    query = "attribute == '{}' and pc < 15".format(factor)
    assert assoc.query(query)["p_value"].min() < 0.05

    # remove factor without regard for the other factors
    reduced = a.remove_factor_from_matrix(factor=factor, assign=False, save=False)
    a.unsupervised_analysis(
        matrix=reduced, output_prefix="after_simple", steps=["pca_association"])
    assoc_file = prefix + ".after_simple.pca.variable_principle_components_association.csv"
    assoc_simple = pd.read_csv(get_this_file_or_timestamped(assoc_file))
    assert assoc_simple.query(query)["p_value"].min() > 0.05

    # remove factor accounting for the other factors
    covariates = [x for x in a.group_attributes if x != factor]
    reduced = a.remove_factor_from_matrix(
        factor=factor,
        covariates=covariates,
        assign=False,
        save=False,
    )
    a.unsupervised_analysis(
        matrix=reduced, output_prefix="after_covariates", steps=["pca_association"])
    assoc_file = prefix + ".after_covariates.pca.variable_principle_components_association.csv"
    assoc_cov = pd.read_csv(get_this_file_or_timestamped(assoc_file))
    assert assoc_cov.query(query)["p_value"].min() > 0.05
def test_missing_input_no_permissive(self, a):
    """Without permissive mode, a missing input BAM raises IOError."""
    matrix_file = get_this_file_or_timestamped(
        os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
    os.remove(matrix_file)
    # Delete the only requested sample's input file
    os.remove(a.samples[0].aligned_filtered_bam)
    with pytest.raises(IOError):
        a.measure_coverage(samples=a.samples[:1])
def test_no_arguments(self, a):
    """Calling measure_coverage with defaults regenerates the raw matrix."""
    matrix_file = get_this_file_or_timestamped(
        os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
    os.remove(matrix_file)
    a.measure_coverage()
    assert file_exists_and_not_empty(matrix_file)
def test_analysis_serialization(self, tmp_path):
    """to_pickle creates the pickle, grows it as state is added, and
    writes a second, timestamped pickle when timestamp=True."""
    tmp_path = str(tmp_path)
    pickle_file = os.path.join(tmp_path, "analysis.pickle")

    def pickled_size():
        # Size on disk of the (possibly timestamped) pickle file.
        return os.stat(get_this_file_or_timestamped(pickle_file)).st_size

    a = Analysis(pickle_file=pickle_file)
    assert not file_exists(pickle_file)

    a.to_pickle()
    assert file_exists(pickle_file)
    assert file_not_empty(pickle_file)

    # Adding a large attribute must grow the serialized file
    previous_size = pickled_size()
    a.random = np.random.random((100, 100))
    a.to_pickle()
    assert pickled_size() > previous_size

    # A timestamped pickle is written alongside the plain one
    previous_size = pickled_size()
    a.random = np.random.random((100, 100))
    a.to_pickle(timestamp=True)
    assert len(glob.glob(os.path.join(tmp_path, "*.pickle"))) == 2
def test_get_right_timestamped_file(tmpdir):
    """get_this_file_or_timestamped returns the path unchanged when no
    timestamped variants exist, and the first (earliest) exact-prefix
    match when several timestamped files are present."""
    from ngs_toolkit.utils import get_this_file_or_timestamped

    target = os.path.join(tmpdir, "human.grch38.genomic_context.bed")
    # No candidates on disk: the input path comes straight back
    assert get_this_file_or_timestamped(target) == target

    names = [
        "human.grch38.genomic_context.2019-09-03-11:46:42.bed",
        "human.grch38.genomic_context.exon.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.genebody.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.intergenic.2019-09-03-11:46:41.bed",
        "human.grch38.genomic_context.intron.2019-09-03-11:46:38.bed",
        "human.grch38.genomic_context.promoter.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.utr3.2019-09-03-11:46:40.bed",
        "human.grch38.genomic_context.utr5.2019-09-03-11:46:39.bed"]
    candidates = [os.path.join(tmpdir, name) for name in names]

    # Now with several existing files that also match the regex
    for path in candidates:
        with open(path, "w") as handle:
            handle.write(path)
    # Only the first candidate matches the target's exact prefix
    assert get_this_file_or_timestamped(target) == candidates[0]
def file_exists_and_not_empty(file):
    """Return True when *file* (or its timestamped variant) exists and
    has a size greater than zero bytes."""
    from ngs_toolkit.utils import get_this_file_or_timestamped

    path = get_this_file_or_timestamped(file)
    if not os.path.exists(path):
        return False
    return os.stat(path).st_size > 0
def file_not_empty(file):
    """Return True when *file* (or its timestamped variant) has nonzero size."""
    from ngs_toolkit.utils import get_this_file_or_timestamped

    path = get_this_file_or_timestamped(file)
    return os.stat(path).st_size > 0
def file_exists(file):
    """Return True when *file* (or its timestamped variant) exists on disk."""
    from ngs_toolkit.utils import get_this_file_or_timestamped

    path = get_this_file_or_timestamped(file)
    return os.path.exists(path)
def wrapper(*args, **kwargs):
    """Resolve every string positional argument to its timestamped
    variant (via ``get_this_file_or_timestamped``) before delegating to
    the wrapped function ``f``.

    The previous implementation reassigned ``args`` inside a
    ``for i, _ in enumerate(args)`` loop; that only worked because each
    replacement preserved the tuple's length. Building a fresh tuple in
    a single pass is equivalent and not fragile.

    NOTE(review): keyword arguments are intentionally passed through
    unresolved, matching the original behavior.
    """
    args = tuple(
        get_this_file_or_timestamped(arg) if isinstance(arg, str) else arg
        for arg in args
    )
    return f(*args, **kwargs)
def wrapper(file):
    """Call the wrapped function ``f`` on the timestamped variant of *file*."""
    resolved = get_this_file_or_timestamped(file)
    return f(resolved)