def test_mappable_bin_from_region(chr1_bin, chr1_params):
    """Check the initial state of a bin built by ``MappableBin.from_start``.

    A bin started at position 0 must report chromosome ``chr1``, have
    coinciding start/end positions, hold no mappable positions yet and
    take its target size from the supplied parameters.

    The ``chr1_bin`` fixture is requested but not referenced in the body.
    """
    fresh_bin = MappableBin.from_start(chr1_params, 0)

    assert fresh_bin.chrom == 'chr1'
    # Nothing has been added yet, so the bin is a zero-length interval.
    assert fresh_bin.start_pos == 0
    assert fresh_bin.end_pos == 0
    assert fresh_bin.current_size == 0
    assert fresh_bin.bin_size == chr1_params.bin_size
def test_check_extend(chr1_bin, chr1_params):
    """Check that ``check_extend`` absorbs a region that fits in the bin.

    After accepting the region 10..20 the bin must still start at 0, end
    at the region's end and count the region's 10 positions.

    The ``chr1_bin`` fixture is requested but not referenced in the body.
    """
    target_bin = MappableBin.from_start(chr1_params, 0)
    fitting_region = dict(start_pos=10, end_pos=20)

    assert target_bin.check_extend(fitting_region)

    assert target_bin.start_pos == 0
    assert target_bin.end_pos == 20
    assert target_bin.current_size == 10
def test_check_extend_overflow(chr1_params):
    """Check that ``check_extend`` rejects a region that does not fit.

    With a bin size of 10000, the region 10..10011 (10001 positions) must
    be refused and the bin must be left exactly as it was created.
    """
    chr1_params.bin_size = 10000
    target_bin = MappableBin.from_start(chr1_params, 0)
    oversized_region = dict(start_pos=10, end_pos=10011)

    assert not target_bin.check_extend(oversized_region)

    # A rejected region must not modify the bin in any way.
    assert target_bin.start_pos == 0
    assert target_bin.end_pos == 0
    assert target_bin.current_size == 0
    assert target_bin.bin_size == 10000
def test_split_extend_overfill(chr1_params):
    """Check ``split_extend`` with a region spanning more than two bins.

    With a bin size of 10000 and the region 10..20011, the original bin
    must end up full but not overfilled, while the returned remainder bin
    must be reported as overfilled and not as full.
    """
    chr1_params.bin_size = 10000
    first_bin = MappableBin.from_start(chr1_params, 0)
    huge_region = dict(start_pos=10, end_pos=20011)

    remainder_bin = first_bin.split_extend(huge_region)

    assert first_bin.is_full()
    assert not first_bin.is_overfill()

    assert not remainder_bin.is_full()
    assert remainder_bin.is_overfill()
def bins_boundaries_generator(self, chroms, mappable_regions_df):
    """Yield ``MappableBin`` objects for every chromosome in ``chroms``.

    For each chromosome the mappable regions are taken from
    ``mappable_regions_df`` (rows whose ``chrom`` column equals the
    chromosome), sorted by ``chrom``, ``start_pos`` and ``end_pos`` and
    then packed into bins one region at a time.

    Parameters:
        chroms: iterable of chromosome names; each name is used as a key
            into the results of ``self.hg.chrom_sizes()`` and
            ``self.hg.calc_chrom_bins()``.
        mappable_regions_df: data frame with at least the columns
            ``chrom``, ``start_pos`` and ``end_pos``.  It is used as
            given; nothing is loaded when it is missing.

    Yields:
        Each bin that a region failed to fit into (after it has been
        topped up by ``split_extend``), followed by all but the last of
        the bins produced by ``overfill_split`` when the remainder is
        itself overfilled.  A bin that is still being filled when the
        regions of a chromosome run out is not yielded.
    """
    sizes_by_chrom = self.hg.chrom_sizes()
    bins_by_chrom = self.hg.calc_chrom_bins()

    for chrom in chroms:
        chrom_mask = mappable_regions_df.chrom == chrom
        regions = mappable_regions_df[chrom_mask].sort_values(
            by=['chrom', 'start_pos', 'end_pos'])

        params = BinParams.build(
            chrom_size=sizes_by_chrom[chrom],
            chrom_bin=bins_by_chrom[chrom])

        # Per-chromosome state: the bin being filled, the carried-over
        # excess and the number of bins still expected.
        active_bin = None
        excess = 0
        remaining_bins = params.bins_count

        for _, region in regions.iterrows():
            if active_bin is None:
                # The first region of the chromosome opens the first bin.
                active_bin = MappableBin.from_start(params, start_pos=0)
                excess = active_bin.adapt_excess(excess)

            if active_bin.check_extend(region):
                # The region fitted into the current bin; nothing to emit.
                continue

            overflow_bin = active_bin.split_extend(region)

            remaining_bins -= 1
            if remaining_bins == 0:
                # Last expected bin: stretch it to the chromosome end.
                active_bin.end_pos = sizes_by_chrom[chrom].size
            yield active_bin

            if not overflow_bin.is_overfill():
                active_bin = overflow_bin
                excess = active_bin.adapt_excess(excess)
                continue

            # The remainder covers more than one bin: emit every piece
            # except the last, which becomes the bin being filled.
            excess, pieces = overflow_bin.overfill_split(excess)
            assert len(pieces) > 1
            for piece in pieces[:-1]:
                remaining_bins -= 1
                yield piece
            active_bin = pieces[-1]
def test_split_extend(chr1_params):
    """Check ``split_extend`` with a region one position too large.

    With a bin size of 10000 and the region 10..10011, the original bin
    must be filled up to position 10010 (10000 positions) and the returned
    bin must cover the single leftover position 10010..10011.
    """
    chr1_params.bin_size = 10000
    first_bin = MappableBin.from_start(chr1_params, 0)
    region = dict(start_pos=10, end_pos=10011)

    leftover_bin = first_bin.split_extend(region)

    # The original bin takes exactly as much of the region as it can hold.
    assert first_bin.bin_size == 10000
    assert first_bin.start_pos == 0
    assert first_bin.end_pos == 10010
    assert first_bin.current_size == 10000

    # The returned bin starts where the first one stopped.
    assert leftover_bin.bin_size == 10000
    assert leftover_bin.start_pos == 10010
    assert leftover_bin.end_pos == 10011
    assert leftover_bin.current_size == 1
def test_overfill_split_adapt_excess_overfill(chr1_params):
    """Check ``overfill_split`` when the carried excess is 0.9.

    Setup: bin size 10000, ``bin_size_excess`` 0.2, and a bin manually set
    to hold 10001 positions.  The split must return two bins: a full one
    holding all 10001 positions and an empty trailing bin starting at
    10001, with the returned excess approximately 0.3.
    """
    chr1_params.bin_size = 10000
    chr1_params.bin_size_excess = 0.2

    overfilled = MappableBin.from_start(chr1_params, 0)
    overfilled.end_pos = 10001
    overfilled.current_size = 10001

    current_excess, pieces = overfilled.overfill_split(0.9)

    assert len(pieces) == 2
    assert pytest.approx(current_excess) == 0.3

    leading = pieces[:1]
    assert all(piece.is_full() for piece in leading)
    assert all(piece.current_size == 10001 for piece in leading)

    trailing = pieces[-1]
    assert trailing.current_size == 0
    assert trailing.bin_size == 10000
    assert trailing.start_pos == 10001
    assert trailing.end_pos == 10001
def test_overfill_split(chr1_params):
    """Check ``overfill_split`` on a bin holding two bins' worth plus one.

    Setup: bin size 10000, ``bin_size_excess`` 0.1, and a bin manually set
    to hold 20001 positions, split with a carried excess of 0.1.  The
    split must return three bins: two full ones of 10000 positions each
    and a trailing bin covering the single leftover position
    20000..20001, with the returned excess equal to 0.4.
    """
    chr1_params.bin_size = 10000
    chr1_params.bin_size_excess = 0.1

    overfilled = MappableBin.from_start(chr1_params, 0)
    overfilled.end_pos = 20001
    overfilled.current_size = 20001

    current_excess, pieces = overfilled.overfill_split(0.1)

    assert len(pieces) == 3
    # The excess is a float built up by repeated addition, so compare it
    # with a tolerance instead of exact equality.
    assert current_excess == pytest.approx(0.4)

    leading = pieces[:2]
    assert all(piece.is_full() for piece in leading)
    assert all(piece.current_size == 10000 for piece in leading)

    trailing = pieces[-1]
    assert trailing.current_size == 1
    assert trailing.bin_size == 10000
    assert trailing.start_pos == 20000
    assert trailing.end_pos == 20001