def generate_dense_cpt_objects(self):
    """Populate ``input_master.dense_cpt`` and return it rendered as a string.

    Five probability tables are built from ``self.num_segs`` and
    ``self.num_subsegs`` and stored under fixed names. When
    ``self.len_seg_strength`` is positive, a ``DirichletTable`` row is
    attached to the entry keyed by ``NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION``.

    Returns:
        ``str(input_master.dense_cpt)`` after all entries have been set.
    """
    seg_count = self.num_segs
    subseg_count = self.num_subsegs

    # (name, probability table) pairs, built in the order they are stored
    named_tables = [
        ("start_seg", fill_array(1.0 / seg_count, seg_count)),
        ("seg_subseg",
         fill_array(1.0 / subseg_count, (seg_count, subseg_count))),
        ("seg_seg", make_zero_diagonal_table(seg_count)),
        ("seg_subseg_subseg",
         vstack_tile(make_zero_diagonal_table(subseg_count), seg_count, 1)),
        ("segCountDown_seg_segTransition",
         self.make_dense_cpt_segCountDown_seg_segTransition()),
    ]

    # create DenseCPTs and add to input_master.dense_cpt: InlineSection
    # NOTE(review): squeezing axis 0 presumes DenseCPT(table) has a leading
    # axis of length 1 -- confirm against the DenseCPT definition.
    for cpt_name, table in named_tables:
        input_master.dense_cpt[cpt_name] = np.squeeze(DenseCPT(table), axis=0)

    # adding dirichlet row if necessary
    if self.len_seg_strength > 0:
        dirichlet_name = self.make_dirichlet_name(
            NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)
        countdown_cpt = input_master.dense_cpt[
            NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION]
        countdown_cpt.extra_rows = ["DirichletTable %s" % dirichlet_name]

    return str(input_master.dense_cpt)
def make_seg_subseg(self):
    """Create the ``seg_subseg`` DenseCPT and pass it to ``input_master.update``.

    Every entry of the ``(num_segs, num_subsegs)`` probability table is
    ``1.0 / num_subsegs``. The parent cardinality is ``self.num_segs`` and
    the cardinality is ``self.num_subsegs``.
    """
    name = "seg_subseg"
    parent_card = self.num_segs  # TODO check
    # Fixed: this previously read ``self.num_subseg``, which does not match
    # the ``num_subsegs`` attribute used for the table shape below.
    card = self.num_subsegs  # TODO check
    prob = fill_array(1.0 / self.num_subsegs,
                      (self.num_segs, self.num_subsegs))
    seg_subseg = DenseCPT(name=name, parent_card=parent_card,
                          cardinality=card, prob=prob)
    input_master.update(seg_subseg)
def make_zero_diagonal_table(length):
    """Return a ``length`` x ``length`` table with a zero diagonal.

    Every off-diagonal entry is ``1.0 / (length - 1)``. For ``length == 1``
    the result is the one-element array ``[1.0]`` instead, since the only
    possible transition is back to self.
    """
    if length == 1:
        return array([1.0])  # always return to self

    diagonal_prob = 0.0
    off_diagonal_prob = (1.0 - diagonal_prob) / (length - 1)

    # start with the off-diagonal value everywhere, then overwrite the
    # diagonal entries
    table = fill_array(off_diagonal_prob, (length, length))
    diagonal_indexes = range(length)
    table[diagonal_indexes, diagonal_indexes] = diagonal_prob

    return table
def write_metadata(chromosome, verbose=False):
    """Compute summary statistics and chunk boundaries for a chromosome.

    For every continuous track, the finite values of each supercontig are
    folded into running minimum, maximum, sum, sum of squares and datapoint
    count. For every supercontig, ``chunk_starts`` and ``chunk_ends`` are
    written to its ``attrs``. The per-track statistics are written to
    ``chromosome.attrs``, and ``chromosome.attrs.dirty`` is set to False.

    Args:
        chromosome: object exposing ``num_tracks_continuous``,
            ``tracknames_continuous``, ``supercontigs``, ``start``, ``end``
            and ``attrs``.
        verbose: when true, progress messages are printed to stderr.

    Returns:
        ``None`` when the chromosome has no continuous tracks; otherwise
        the tuple ``(mins, maxs)`` of per-track extrema.

    Raises:
        NoSuchNodeError: when a supercontig has no ``continuous`` node.
    """
    if verbose:
        print("writing metadata for %s" % chromosome, file=sys.stderr)

    if chromosome.num_tracks_continuous == 0:
        chromosome.attrs.dirty = False
        return

    tracknames = chromosome.tracknames_continuous

    num_obs = len(tracknames)
    row_shape = (num_obs, )
    mins = fill_array(PINF, row_shape)
    maxs = fill_array(NINF, row_shape)
    sums = fill_array(0.0, row_shape)
    sums_squares = fill_array(0.0, row_shape)
    num_datapoints = fill_array(0, row_shape)

    supercontigs = chromosome.supercontigs[chromosome.start:chromosome.end]
    prev_chrom_end = 0  # keeps track of where data was last present

    # Pair each supercontig with its neighbours (None at either end)
    prev_supercontigs = [None] + supercontigs[:-1]
    next_supercontigs = supercontigs[1:] + [None]
    zipper = zip(prev_supercontigs, supercontigs, next_supercontigs)

    for prev_supercontig, supercontig, next_supercontig in zipper:
        if verbose:
            print(" scanning %s" % supercontig, file=sys.stderr)

        try:
            continuous = supercontig.continuous
        except NoSuchNodeError:
            raise NoSuchNodeError("Supercontig found missing continuous")

        # only runs when assertions checked
        if __debug__:
            init_num_obs(num_obs, continuous)  # for the assertion

        # An array containing True when any data is present across all tracks
        mask_rows_any_present = fill_array(False, continuous.shape[0])

        # doing this column by column greatly reduces the memory
        # footprint when you have large numbers of tracks. It also
        # simplifies the logic for the summary stats, since you don't
        # have to change the mask value for every operation, like in
        # revisions <= r243
        for col_index, trackname in enumerate(tracknames):
            if verbose:
                print(" %s" % trackname, file=sys.stderr)

            ## read data
            col = continuous[:, col_index]

            mask_present = isfinite(col)
            mask_rows_any_present[mask_present] = True
            col_finite = col[mask_present]
            del col  # col not needed anymore (optimization)

            num_datapoints_col = len(col_finite)
            if num_datapoints_col:
                update_extrema(get_nonzero_min, mins, col_finite, col_index)
                update_extrema(amax, maxs, col_finite, col_index)

                sums[col_index] += col_finite.sum(0)
                sums_squares[col_index] += square(col_finite).sum(0)
                num_datapoints[col_index] += num_datapoints_col

        supercontig_attrs = supercontig.attrs

        next_chrom_start = find_next_present_coord(chromosome,
                                                   next_supercontig)

        # If any data is present in this supercontig
        if any(mask_rows_any_present):
            # Find all gaps between present data
            starts, ends = find_chunk_gaps_in_supercontig(
                mask_rows_any_present)

            # Trim the starting chunk index for this supercontig
            starts[0] = trim_chunks_start(prev_supercontig, prev_chrom_end,
                                          supercontig, mask_rows_any_present,
                                          starts[0])

            # Trim the end chunk index for this supercontig
            ends[-1] = trim_chunks_end(next_supercontig, next_chrom_start,
                                       supercontig, mask_rows_any_present,
                                       ends[-1])

            # Update our new previously present coordinate for next iteration
            prev_chrom_end = supercontig.start + ends[-1]
        # Otherwise there isn't any data present for this supercontig.
        # Include the entire region only when there is a supercontig to look
        # ahead at and the distance between the previously last present value
        # and the next possible present value is less than MIN_GAP_LEN.
        elif (next_supercontig
              and next_chrom_start
              and next_chrom_start - prev_chrom_end < MIN_GAP_LEN):
            starts = array([0])
            ends = array([mask_rows_any_present.shape[0]])
        # Last supercontig, or the gap is too long: exclude the entire region
        else:
            starts = array([])
            ends = array([])

        supercontig_attrs.chunk_starts = starts
        supercontig_attrs.chunk_ends = ends

    chromosome_attrs = chromosome.attrs
    chromosome_attrs.mins = mins
    chromosome_attrs.maxs = maxs
    chromosome_attrs.sums = sums
    chromosome_attrs.sums_squares = sums_squares
    chromosome_attrs.num_datapoints = num_datapoints
    chromosome_attrs.dirty = False

    return (mins, maxs)
def make_dense_cpt_seg_subseg_spec(self):
    """Return the table spec for ``seg_subseg``.

    The table has shape ``(num_segs, num_subsegs)`` with every entry equal
    to ``1.0 / num_subsegs``. It is handed to ``self.make_table_spec``
    under the name ``"seg_subseg"``.
    """
    subseg_count = self.num_subsegs
    uniform_prob = 1.0 / subseg_count
    table_shape = (self.num_segs, subseg_count)

    table = fill_array(uniform_prob, table_shape)
    return self.make_table_spec("seg_subseg", table)
def make_start_seg(self):
    """Create the ``start_seg`` DenseCPT and pass it to ``input_master.update``.

    The probability table has ``num_segs`` entries, each equal to
    ``1.0 / num_segs``. The cardinality is ``self.num_segs``.
    """
    seg_count = self.num_segs
    uniform_table = fill_array(1.0 / self.num_segs, self.num_segs)

    input_master.update(
        DenseCPT(name="start_seg", cardinality=seg_count, prob=uniform_table)
    )