def test_bedslice():
    """bedslice on a chrom-grouped bin table returns the bins overlapping a region."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bin_table = util.binnify(sizes, 10)
    by_chrom = bin_table.groupby('chrom')
    # chr1:0-12 overlaps the first two 10 bp bins of chr1.
    sliced = util.bedslice(by_chrom, sizes, 'chr1:0-12')
    assert list(sliced['chrom']) == ['chr1', 'chr1']
    assert list(sliced['start']) == [0, 10]
def test_parse_region():
    """parse_region accepts tuples and UCSC-style strings; rejects malformed regions."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))

    # Well-formed inputs, with and without chromsizes.
    assert util.parse_region(('chr1', 0, 10)) == ('chr1', 0, 10)
    assert util.parse_region('chr1:0-10') == ('chr1', 0, 10)
    assert util.parse_region('chr1:0-', sizes) == ('chr1', 0, sizes['chr1'])

    # Don't accept undefined end unless chromsizes exists
    # NOTE: parse_region_string works here
    with pytest.raises(ValueError):
        util.parse_region('chr1:0-')

    # catch end < start in non-string case
    with pytest.raises(ValueError):
        util.parse_region(('chr1', 10, 0))

    # catch errors when chromsizes is given
    bad_inputs = (
        ('chr1', 0, 1000),
        ('chr1', -5, 10),
        ('DoesNotExist', 0, 10),
        'DoesNotExist',
    )
    for bad in bad_inputs:
        with pytest.raises(ValueError):
            util.parse_region(bad, sizes)
def prepare_snakemake(allc_table_path,
                      output_dir,
                      chrom_sizes_path,
                      template_path,
                      chroms=None,
                      smoothing=True,
                      chunk_size=50000000):
    """Prepare a Snakemake workflow that runs DSS DML calling per genomic chunk.

    Splits the genome into ``chunk_size`` windows, writes one papermill config
    (YAML) per window, and emits a Snakefile that executes ``template_path``
    for each window.

    Parameters
    ----------
    allc_table_path : str or Path
        Tab-separated table with three columns: allc_path, sample, group.
        Exactly two distinct groups are required.
    output_dir : str or Path
        Directory to write configs and the Snakefile into (created if missing).
    chrom_sizes_path : str or Path
        Chromosome sizes file readable by ``read_chromsizes``.
    template_path : str or Path
        Papermill notebook template executed per region.
    chroms : list of str, optional
        Chromosomes to process; defaults to all chromosomes in the sizes file.
    smoothing : bool
        Passed through to the notebook config.
    chunk_size : int
        Genomic window size (bp) per Snakemake job.

    Returns
    -------
    str
        Path to the generated Snakefile.

    Raises
    ------
    ValueError
        If the allc table does not contain exactly two groups.
    """
    output_dir = pathlib.Path(output_dir).absolute()
    # parents=True: without it, mkdir raises FileNotFoundError when the
    # parent of output_dir does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    allc_table = pd.read_csv(allc_table_path, sep='\t')
    allc_table.columns = ['allc_path', 'sample', 'group']

    if allc_table['group'].unique().size != 2:
        raise ValueError(
            f"There must be two and only two different groups, got {allc_table['group'].unique().size}."
        )
    group1, group2 = allc_table['group'].unique()
    group1_allc = allc_table.loc[allc_table['group'] == group1,
                                 'allc_path'].tolist()
    group2_allc = allc_table.loc[allc_table['group'] == group2,
                                 'allc_path'].tolist()
    group1_id = allc_table.loc[allc_table['group'] == group1,
                               'sample'].tolist()
    group2_id = allc_table.loc[allc_table['group'] == group2,
                               'sample'].tolist()

    # Determine chromosomes first, then subset; reindexing with chroms=None
    # was a no-op in the original ordering.
    chrom_sizes = read_chromsizes(chrom_sizes_path)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    else:
        chrom_sizes = chrom_sizes.reindex(chroms)

    # One region string per genomic chunk, e.g. "chr1:0-50000000".
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = [
        f'{chrom}:{start}-{end}'
        for _, (chrom, start, end) in bins.iterrows()
    ]

    # One papermill parameter file per region.
    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = dict(region=region,
                          allc_paths=group1_allc + group2_allc,
                          group1=group1_id,
                          group2=group2_id,
                          smoothing=smoothing)
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}

rule main:
    input:
        expand('{{region}}.DSS.DML.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DSS.DML.hdf'
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return snakefile_path
def test_genome_segmentation():
    """GenomeSegmentation fetches bins by chromosome or sub-region."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bin_table = util.binnify(sizes, 10)
    seg = util.GenomeSegmentation(sizes, bin_table)

    whole_chrom = seg.fetch('chr1')
    assert len(whole_chrom) == 4

    sub_region = seg.fetch('chr1:2-30')
    assert len(sub_region) == 3

    # Smoke test: partitioning should not raise.
    util.balanced_partition(seg, 2, ['chr1'])
def test_get_binsize():
    """get_binsize detects a uniform bin size and returns None otherwise."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    uniform_bins = util.binnify(sizes, 10)
    assert util.get_binsize(uniform_bins) == 10

    # variable-sized bins
    variable_bins = pd.read_csv(op.join(datadir, 'toy.bins.var.bed'),
                                names=['chrom', 'start', 'end'],
                                sep='\t')
    assert util.get_binsize(variable_bins) is None

    # ambiguous case: one bin per chromosome with different lengths
    one_bin_each = pd.DataFrame({
        'chrom': ['chr1', 'chr2', 'chr3'],
        'start': [0, 0, 0],
        'end': [100, 200, 300],
    })
    assert util.get_binsize(one_bin_each) is None
def prepare_snakemake(allc_table_path,
                      output_dir,
                      chrom_sizes_path,
                      template_path,
                      chroms=None,
                      test_covariate='group',
                      match_covariate=None,
                      adjust_covariate=None,
                      cutoff=0.1,
                      min_num_region=3,
                      smooth=True,
                      bp_span=1000,
                      min_in_span=30,
                      max_gap_smooth=2500,
                      max_gap=1000,
                      verbose=True,
                      max_perms=10,
                      stat="stat",
                      block=False,
                      block_size=5000,
                      chrs_per_chunk=1,
                      cpu=40,
                      chunk_size=5000000000):
    """Prepare a Snakemake workflow that runs DMR calling per genomic chunk.

    Splits the genome into ``chunk_size`` windows, writes one papermill config
    (YAML) per window, and emits a Snakefile that executes ``template_path``
    for each window.

    Parameters
    ----------
    allc_table_path : str or Path
        Sample table path; forwarded verbatim into each notebook config.
    output_dir : str or Path
        Directory to write configs and the Snakefile into (created if missing).
    chrom_sizes_path : str or Path
        Chromosome sizes file readable by ``read_chromsizes``.
    template_path : str or Path
        Papermill notebook template executed per region.
    chroms : list of str, optional
        Chromosomes to process; defaults to all chromosomes in the sizes file.
    test_covariate, match_covariate, adjust_covariate, cutoff, min_num_region,
    smooth, bp_span, min_in_span, max_gap_smooth, max_gap, verbose, max_perms,
    stat, block, block_size, chrs_per_chunk, cpu :
        DMR-calling parameters forwarded verbatim into each notebook config.
    chunk_size : int
        Genomic window size (bp) per Snakemake job.

    Returns
    -------
    list of str
        The region strings ("chrom:start-end"), one per Snakemake job.
    """
    output_dir = pathlib.Path(output_dir).absolute()
    # parents=True: without it, mkdir raises FileNotFoundError when the
    # parent of output_dir does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Determine chromosomes first, then subset; reindexing with chroms=None
    # was a no-op in the original ordering.
    chrom_sizes = read_chromsizes(chrom_sizes_path)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    else:
        chrom_sizes = chrom_sizes.reindex(chroms)

    # One region string per genomic chunk, e.g. "chr1:0-5000000000".
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = [
        f'{chrom}:{start}-{end}'
        for _, (chrom, start, end) in bins.iterrows()
    ]

    # One papermill parameter file per region.
    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = {
            'region': region,
            'allc_table_path': allc_table_path,
            'test_covariate': test_covariate,
            'match_covariate': match_covariate,
            'adjust_covariate': adjust_covariate,
            'cutoff': cutoff,
            'min_num_region': min_num_region,
            'smooth': smooth,
            'bp_span': bp_span,
            'min_in_span': min_in_span,
            'max_gap_smooth': max_gap_smooth,
            'max_gap': max_gap,
            'verbose': verbose,
            'max_perms': max_perms,
            'stat': stat,
            'block': block,
            'block_size': block_size,
            'chrs_per_chunk': chrs_per_chunk,
            'cpu': cpu
        }
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}

rule main:
    input:
        expand('{{region}}.DMR.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DMR.hdf'
    threads:
        1 #{cpu}
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return regions
def test_check_bins():
    """check_bins restores a categorical dtype on the chrom column."""
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    bins['chrom'] = bins['chrom'].astype(str)
    bins = util.check_bins(bins, chromsizes)
    # pd.api.types.is_categorical was deprecated in pandas 1.0 and later
    # removed; inspect the dtype directly instead.
    assert isinstance(bins["chrom"].dtype, pd.CategoricalDtype)
def test_get_chromsizes():
    """Chromosome sizes recovered from a bin table match the originals."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bin_table = util.binnify(sizes, 10)
    recovered = util.get_chromsizes(bin_table)
    assert np.allclose(recovered, sizes)
def test_binnify():
    """Binning the toy genome at 10 bp yields 8 bins in total."""
    sizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bin_table = util.binnify(sizes, 10)
    assert len(bin_table) == 8
def test_read_chromsizes():
    """Smoke test: reading the toy chrom.sizes file does not raise."""
    path = op.join(datadir, 'toy.chrom.sizes')
    util.read_chromsizes(path)