Exemplo n.º 1
0
    def generate_dense_cpt_objects(self):
        """Populate ``input_master.dense_cpt`` and return it as a string.

        Five probability tables are built from ``self.num_segs`` and
        ``self.num_subsegs``, each is wrapped in ``DenseCPT`` and stored
        under its name.  When ``self.len_seg_strength`` is positive, a
        ``DirichletTable`` row is attached to the entry keyed by
        ``NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION``.

        Returns:
            ``str(input_master.dense_cpt)``.
        """
        seg_count = self.num_segs
        subseg_count = self.num_subsegs

        # (name, probability table) pairs; the tables are evaluated in
        # the order they are listed here.
        named_tables = [
            # uniform distribution over the starting segment
            ("start_seg", fill_array(1.0 / seg_count, seg_count)),
            # uniform distribution over sub-segments for every segment
            ("seg_subseg",
             fill_array(1.0 / subseg_count, (seg_count, subseg_count))),
            # segment -> segment transitions with a zero diagonal
            ("seg_seg", make_zero_diagonal_table(seg_count)),
            # one zero-diagonal sub-segment table tiled per segment
            ("seg_subseg_subseg",
             vstack_tile(make_zero_diagonal_table(subseg_count),
                         seg_count, 1)),
            ("segCountDown_seg_segTransition",
             self.make_dense_cpt_segCountDown_seg_segTransition()),
        ]

        # create DenseCPTs and add to input_master.dense_cpt: InlineSection
        for cpt_name, table in named_tables:
            # NOTE(review): np.squeeze(..., axis=0) only succeeds when the
            # leading axis has length 1 -- confirm DenseCPT yields that.
            input_master.dense_cpt[cpt_name] = np.squeeze(DenseCPT(table),
                                                          axis=0)

        # adding dirichlet row if necessary
        if self.len_seg_strength > 0:
            dirichlet_name = self.make_dirichlet_name(
                NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)
            extra_rows = ["DirichletTable %s" % dirichlet_name]
            countdown_cpt = input_master.dense_cpt[
                NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION]
            countdown_cpt.extra_rows = extra_rows

        return str(input_master.dense_cpt)
Exemplo n.º 2
0
 def make_seg_subseg(self):
     """Build the ``seg_subseg`` DenseCPT and register it in input_master.

     The table has shape ``(self.num_segs, self.num_subsegs)`` and every
     entry is ``1.0 / self.num_subsegs``, i.e. each row is a uniform
     distribution over sub-segments.  The resulting ``DenseCPT`` is
     handed to ``input_master.update``; nothing is returned.
     """
     name = "seg_subseg"
     parent_card = self.num_segs  # TODO check
     # The cardinality must come from the same attribute that sizes the
     # table below; this previously read ``self.num_subseg`` (no trailing
     # "s"), which does not match the attribute used everywhere else.
     card = self.num_subsegs  # TODO check
     prob = fill_array(1.0 / self.num_subsegs, (self.num_segs,
                                                self.num_subsegs))
     seg_subseg = DenseCPT(name=name, parent_card=parent_card,
                           cardinality=card, prob=prob)
     input_master.update(seg_subseg)
Exemplo n.º 3
0
def make_zero_diagonal_table(length):
    """Return a transition table that never stays in the same state.

    For ``length > 1`` the result is a ``(length, length)`` table whose
    diagonal is ``0.0`` and whose remaining entries are
    ``1.0 / (length - 1)``, so each row sums to one.

    For ``length == 1`` there is no other state to move to, so the
    one-dimensional array ``[1.0]`` is returned instead.
    """
    if length != 1:
        stay_prob = 0.0
        # probability mass left after staying, split over the other states
        leave_prob = (1.0 - stay_prob) / (length - 1)

        # start with the off-diagonal value in every cell ...
        table = fill_array(leave_prob, (length, length))

        # ... then overwrite the diagonal cells (i, i)
        diagonal = range(length)
        table[diagonal, diagonal] = stay_prob

        return table

    # single state: always return to self
    return array([1.0])
Exemplo n.º 4
0
def write_metadata(chromosome, verbose=False):
    """Compute per-track summary statistics and chunk bounds for a chromosome.

    Every supercontig in
    ``chromosome.supercontigs[chromosome.start:chromosome.end]`` is scanned
    one track (column) at a time.  Only finite values contribute.  Results
    are stored as attributes:

    * ``chromosome.attrs``: ``mins``, ``maxs``, ``sums``, ``sums_squares``
      and ``num_datapoints`` (one entry per continuous track), plus
      ``dirty = False``.
    * each ``supercontig.attrs``: ``chunk_starts`` and ``chunk_ends``.

    Args:
        chromosome: object exposing ``num_tracks_continuous``,
            ``tracknames_continuous``, ``supercontigs``, ``start``, ``end``
            and ``attrs``.
        verbose: when true, progress messages are printed to ``sys.stderr``.

    Returns:
        The tuple ``(mins, maxs)``, or ``None`` when the chromosome has no
        continuous tracks (in that case only ``attrs.dirty`` is cleared).

    Raises:
        NoSuchNodeError: if a supercontig has no ``continuous`` node.
    """
    if verbose:
        print("writing metadata for %s" % chromosome, file=sys.stderr)

    if chromosome.num_tracks_continuous == 0:
        chromosome.attrs.dirty = False
        return None

    tracknames = chromosome.tracknames_continuous

    # One accumulator slot per continuous track.  Extrema start at the
    # opposite infinity so the first update always replaces them.
    num_obs = len(tracknames)
    row_shape = (num_obs, )
    mins = fill_array(PINF, row_shape)
    maxs = fill_array(NINF, row_shape)
    sums = fill_array(0.0, row_shape)
    sums_squares = fill_array(0.0, row_shape)
    num_datapoints = fill_array(0, row_shape)

    supercontigs = chromosome.supercontigs[chromosome.start:chromosome.end]
    prev_chrom_end = 0  # keeps track of where data was last present

    # Pair every supercontig with its neighbours (None at either end)
    prev_supercontigs = [None] + supercontigs[:-1]
    next_supercontigs = supercontigs[1:] + [None]

    zipper = zip(prev_supercontigs, supercontigs, next_supercontigs)

    for prev_supercontig, supercontig, next_supercontig in zipper:

        if verbose:
            print(" scanning %s" % supercontig, file=sys.stderr)

        try:
            continuous = supercontig.continuous
        except NoSuchNodeError:
            raise NoSuchNodeError("Supercontig found missing continuous")

        # only runs when assertions checked
        if __debug__:
            init_num_obs(num_obs, continuous)  # for the assertion

        # An array containing True when any data is present across all tracks
        mask_rows_any_present = fill_array(False, continuous.shape[0])

        # doing this column by column greatly reduces the memory
        # footprint when you have large numbers of tracks. It also
        # simplifies the logic for the summary stats, since you don't
        # have to change the mask value for every operation, like in
        # revisions <= r243
        for col_index, trackname in enumerate(tracknames):
            if verbose:
                print("  %s" % trackname, file=sys.stderr)

            ## read data
            col = continuous[:, col_index]

            mask_present = isfinite(col)
            mask_rows_any_present[mask_present] = True
            col_finite = col[mask_present]
            del col  # col not needed anymore (optimization)

            num_datapoints_col = len(col_finite)
            if num_datapoints_col:
                # NOTE(review): update_extrema is defined elsewhere;
                # presumably it folds the column extremum into
                # mins/maxs[col_index] in place -- confirm.
                update_extrema(get_nonzero_min, mins, col_finite, col_index)
                update_extrema(amax, maxs, col_finite, col_index)

                sums[col_index] += col_finite.sum(0)
                sums_squares[col_index] += square(col_finite).sum(0)
                num_datapoints[col_index] += num_datapoints_col

        supercontig_attrs = supercontig.attrs

        next_chrom_start = find_next_present_coord(chromosome,
                                                   next_supercontig)

        # If any data is present in this supercontig
        if any(mask_rows_any_present):
            # Find all gaps between present data
            starts, ends = find_chunk_gaps_in_supercontig(
                mask_rows_any_present)

            # Trim the starting chunk index for this supercontig
            starts[0] = trim_chunks_start(prev_supercontig, prev_chrom_end,
                                          supercontig, mask_rows_any_present,
                                          starts[0])

            # Trim the end chunk index for this supercontig
            ends[-1] = trim_chunks_end(next_supercontig, next_chrom_start,
                                       supercontig, mask_rows_any_present,
                                       ends[-1])

            # Update our new previously present coordinate for next iteration
            prev_chrom_end = supercontig.start + ends[-1]
        # No data here, but there is a following supercontig and the
        # distance between the previously last present value and the next
        # possible present value is less than MIN_GAP_LEN
        elif (next_supercontig
              and next_chrom_start
              and next_chrom_start - prev_chrom_end < MIN_GAP_LEN):
            # Include the entire region
            starts = array([0])
            ends = array([mask_rows_any_present.shape[0]])
        # Otherwise (last supercontig, or the gap is too long)
        else:
            # Exclude the entire region
            starts = array([])
            ends = array([])

        supercontig_attrs.chunk_starts = starts
        supercontig_attrs.chunk_ends = ends

    chromosome_attrs = chromosome.attrs
    chromosome_attrs.mins = mins
    chromosome_attrs.maxs = maxs
    chromosome_attrs.sums = sums
    chromosome_attrs.sums_squares = sums_squares
    chromosome_attrs.num_datapoints = num_datapoints
    chromosome_attrs.dirty = False

    return (mins, maxs)
Exemplo n.º 5
0
    def make_dense_cpt_seg_subseg_spec(self):
        """Return the table spec for the uniform ``seg_subseg`` dense CPT.

        The table has shape ``(self.num_segs, self.num_subsegs)`` with
        every entry equal to ``1.0 / self.num_subsegs``; it is passed to
        ``self.make_table_spec`` under the name ``"seg_subseg"``.
        """
        subseg_count = self.num_subsegs
        uniform_prob = 1.0 / subseg_count
        table_shape = (self.num_segs, subseg_count)
        uniform_table = fill_array(uniform_prob, table_shape)

        return self.make_table_spec("seg_subseg", uniform_table)
Exemplo n.º 6
0
 def make_start_seg(self):
     """Build the ``start_seg`` DenseCPT and register it in input_master.

     The probability vector has ``self.num_segs`` entries, each equal to
     ``1.0 / self.num_segs``.  The resulting ``DenseCPT`` is handed to
     ``input_master.update``; nothing is returned.
     """
     seg_count = self.num_segs
     uniform_start = fill_array(1.0 / seg_count, seg_count)
     cpt = DenseCPT(name="start_seg", cardinality=seg_count,
                    prob=uniform_start)
     input_master.update(cpt)