Exemplo n.º 1
0
    def testOverextendingIsochoresFail(self):
        '''Run the check with per-chromosome annotations over one workspace.

        annotations: one entry per "chr<N>" key, each holding a single
            segment; together the entries tile the full coordinate range
        workspace: a single segment spanning the full coordinate range
        segments: regularly spaced segments of size 1 at density 0.001
        '''
        chunk = 1000000
        nchunks = 20
        total = nchunks * chunk

        # one annotation track per chunk, each covering exactly that chunk
        annotations = {
            "chr%i" % idx: gat.SegmentList(
                iter=[(idx * chunk, (idx + 1) * chunk)], normalize=True)
            for idx in range(nchunks)
        }

        workspacelist = gat.SegmentList(iter=[(0, total)], normalize=True)
        segmentlist = getRegularSegments(total, 1, 0.001)

        ss, aa, ww = self.addSingleIsochore(
            segmentlist, annotations, workspacelist)

        # NOTE(review): the label below names a different test
        # ("testChromosomalBiasFail"); kept unchanged -- confirm it is
        # intended.
        self.check(ss, aa, ww, "testChromosomalBiasFail",
                   fold_is_different=False)
Exemplo n.º 2
0
    def createSet(self):
        '''Build a deterministic set of segments for the workspace.

        Placement starts at (or just before) the first workspace
        interval; consecutive segments are separated by a one-residue
        gap so that they do not merge on normalization.

        Segments are NOT placed at random.

        A segment may overhang the workspace interval it was placed in;
        when that happens placement continues in the next workspace
        interval that ends at or beyond the segment, so gaps in the
        workspace are stepped over.

        Fewer than ``self.nsegments`` segments are produced if the
        workspace runs out.

        Returns a tuple of normalized segment lists
        ``(segments, workspace)``.
        '''
        regions = self.createWorkspace()
        assert len(regions) > 0

        seg_len = self.segment_length
        idx = 0
        # allow the first segment to start before the workspace, as long
        # as it still overlaps it by one residue (never below zero)
        cursor = max(0, regions[idx][0] - (seg_len - 1))
        placed = []

        for _ in range(self.nsegments):
            stop = cursor + seg_len
            placed.append((cursor, stop))

            # leave a one-residue gap before the next segment
            stop += 1

            if stop <= regions[idx][1]:
                # still inside the current workspace interval
                cursor = stop
                continue

            # ran past the current interval: skip ahead to the first
            # interval whose end is not before the gap-extended segment
            idx += 1
            while idx < len(regions) and regions[idx][1] < stop:
                idx += 1

            if idx == len(regions):
                # workspace exhausted
                break

            cursor = max(regions[idx - 1][1] + 1,
                         regions[idx][0] - (seg_len - 1))

        return (gat.SegmentList(iter=placed, normalize=True),
                gat.SegmentList(iter=regions, normalize=True))
Exemplo n.º 3
0
    def computeExpectedCoverage(self, samples):
        '''Compute the coverage statistics expected for *samples*.

        Returns a tuple
        ``(nsegments, segment_size, segment_density, expected_overlap,
        expected_segment_length, expected_coverage)`` where

        * nsegments: number of segments (as float)
        * segment_size: summed size of the complete segments
        * segment_density: overlap nucleotides / workspace nucleotides
        * expected_overlap: segment nucleotides inside the workspace
        * expected_segment_length: segment_size / nsegments
        * expected_coverage:
          len(samples) * expected_overlap / workspace nucleotides
        '''
        nsegments = float(len(self.segments))

        # nucleotides of the segments that fall inside the workspace;
        # work on a clone so that the workspace itself is not modified
        clipped = gat.SegmentList(clone=self.workspace)
        clipped.intersect(self.segments)
        expected_overlap = clipped.sum()

        # complete (untruncated) segments are used for the size
        segment_size = self.segments.sum()

        workspace_total = self.workspace.sum()

        segment_density = float(expected_overlap) / workspace_total
        expected_segment_length = segment_size / nsegments
        expected_coverage = len(samples) * \
            expected_overlap / float(workspace_total)

        return (nsegments,
                segment_size,
                segment_density,
                expected_overlap,
                expected_segment_length,
                expected_coverage)
Exemplo n.º 4
0
    def testLengthBiasFail(self):
        '''Build the inputs for the length-bias case.

        Creates the regular annotations, a single workspace segment
        padded by 100 bases on either side of the annotated range, and
        regularly spaced segments sized to that workspace using
        ``self.segment_size`` and ``self.segment_density``.

        NOTE(review): as visible here the method only builds its inputs
        and never runs a check -- the body may be truncated; confirm.
        '''

        annotations, start, end = getRegularAnnotations()

        # workspace: annotated range extended by 100 bases on each side
        workspacelist = gat.SegmentList(iter=[(start - 100, end + 100)],
                                        normalize=True)

        # segments spread regularly over the size of the workspace
        segmentlist = getRegularSegments(workspacelist.sum(),
                                         self.segment_size,
                                         self.segment_density)
Exemplo n.º 5
0
    def testIntervalsPartialOverlap(self):
        '''Test intervals with an increasing amount of overlap.

        This test is currently disabled: the bare ``return`` below exits
        before any of the set-up code runs, so the method always
        returns ``None``.

        NOTE(review): the unreachable code references ``y``, which is
        not defined anywhere in this method -- it would need a loop or
        a value for ``y`` before the test can be re-enabled.
        '''
        # disabled: everything below this line is unreachable
        return
        workspaces, segments, annotations = \
            gat.IntervalCollection("workspace"), \
            gat.IntervalCollection("segment"), \
            gat.IntervalCollection("annotation")

        workspace_size = 1000

        size = 100

        # workspace: a single segment (0, workspace_size) on chr1
        workspaces.add(
            "default", "chr1",
            gat.SegmentList(iter=[
                (0, workspace_size),
            ], normalize=True))
        workspace = workspaces["default"]

        # segments: a single segment (0, size) on chr1
        segments.add("default", "chr1",
                     gat.SegmentList(iter=[
                         (0, size),
                     ], normalize=True))

        # annotations: a single segment of length ``size`` offset by y,
        # so that the overlap with the segment changes with y
        annotations.add(
            "full", "chr1",
            gat.SegmentList(iter=[
                (y, size + y),
            ], normalize=True))

        self.check(workspace, annotations, segments)
Exemplo n.º 6
0
    def testDefault(self):
        '''Baseline case: regular segments over the regular annotations.

        The workspace is the annotated range padded by 100 bases on
        either side; segments are spread regularly over a range of the
        same size. The check is run with ``fold_is_different=False``.
        '''
        annotations, first, last = getRegularAnnotations()

        padded_range = [(first - 100, last + 100)]
        workspacelist = gat.SegmentList(iter=padded_range, normalize=True)

        segmentlist = getRegularSegments(
            workspacelist.sum(), self.segment_size, self.segment_density)

        ss, aa, ww = self.addSingleIsochore(
            segmentlist, annotations, workspacelist)

        self.check(ss, aa, ww, "testDefaultPass", fold_is_different=False)
Exemplo n.º 7
0
def getRegularSegments(workspace_size, size, density):
    '''Return regularly spaced segments of length *size* in a workspace.

    The number of segments is ``int(workspace_size * density / size)``.
    The part of the workspace not covered by segments is divided into
    ``nsegments + 2`` equal gaps (rounded down); the first segment
    starts one gap into the workspace and every following segment
    starts one gap after the end of the previous one.

    Arguments
    ---------
    workspace_size : size of the workspace the segments are laid out in
    size : length of each segment
    density : fraction of the workspace to be covered by segments

    Returns a normalized ``gat.SegmentList``.

    Raises ``AssertionError`` if the computed gap is not positive.
    '''
    nsegments = int(workspace_size * density / size)
    rest = workspace_size * (1.0 - density)

    # ``rest`` is a float, so ``//`` returns a (whole-valued) float.
    # Validate first, then convert once so that all segment coordinates
    # are built from integers rather than drifting to floats.
    gap = rest // (nsegments + 2)
    assert gap > 0
    distance = int(gap)

    step = distance + size
    starts = (distance + k * step for k in range(nsegments))
    return gat.SegmentList(iter=[(s, s + size) for s in starts],
                           normalize=True)
Exemplo n.º 8
0
    def testCompositionBiasFail(self):
        '''Build isochores and incrementally spaced segments.

        The coordinate range is divided into ``self.nisochores`` blocks
        of ``self.nisochores * self.isochore_size`` bases. Each block is
        tiled with ``self.nisochores`` isochores of
        ``self.isochore_size`` bases and receives one set of segments
        whose spacing increases along the block.

        NOTE(review): as visible here the method only builds its inputs
        and never runs a check -- the body may be truncated; confirm.
        '''

        isochores = gat.SegmentList()
        # accumulator for the segments of all blocks; it has to exist
        # before the loop below extends it
        segmentlist = gat.SegmentList()

        for block in range(self.nisochores):

            start = block * self.isochore_size * self.nisochores

            # tile this block with consecutive isochores
            i = start
            for _ in range(self.nisochores):
                isochores.add(i, i + self.isochore_size)
                i += self.isochore_size

            # segments have increasing spacing within isochores
            s = getIncrementallySpacedSegments(
                self.nisochores * self.isochore_size, self.segment_size,
                self.segment_density)
            # move the block-relative segments to the block's position
            s.shift(start)
            segmentlist.extend(s)
Exemplo n.º 9
0
    def testAscertainmentBiasFail2(self):
        '''Segments are placed inside the annotations only.

        The workspace is the annotated range padded by 100 bases on
        either side, while the segments are generated by
        ``getRegularSegmentsInAnnotations``. The check is run with
        ``fold_is_different=True``.
        '''
        annotations, first, last = getRegularAnnotations()

        padded_range = [(first - 100, last + 100)]
        workspacelist = gat.SegmentList(iter=padded_range, normalize=True)

        segmentlist = getRegularSegmentsInAnnotations(
            annotations, self.segment_size, self.segment_density)

        ss, aa, ww = self.addSingleIsochore(
            segmentlist, annotations, workspacelist)

        self.check(ss, aa, ww, "testAscertainmentBiasFail2",
                   fold_is_different=True)
Exemplo n.º 10
0
    def testChromosomalBiasPass(self):
        '''Two isochores with identical layout, segments in the first only.

        Both isochores get their own copy of the regular annotations and
        share the same workspace; the second isochore is given ``None``
        instead of segments. The check is run with
        ``fold_is_different=False``.
        '''
        first_annotations, lo, hi = getRegularAnnotations()
        second_annotations, lo, hi = getRegularAnnotations()

        workspacelist = gat.SegmentList(iter=[(lo - 100, hi + 100)],
                                        normalize=True)

        # segments are only provided for the first isochore
        padded_size = hi - lo + 200
        segmentlist = getRegularSegments(
            padded_size, self.segment_size, self.segment_density)

        ss, aa, ww = self.addIsochores(
            (segmentlist, None),
            (first_annotations, second_annotations),
            (workspacelist, workspacelist))

        self.check(ss, aa, ww, "testChromosomalBiasPass",
                   fold_is_different=False)
Exemplo n.º 11
0
    def testAscertainmentBiasFail1(self):
        '''Workspace extends far beyond the annotated range.

        The workspace runs from 100 bases before the annotations to
        ``2 * end + 100``, i.e. roughly twice the annotated range, and
        the segments are spread regularly over a range of that size.
        Segment size and density are taken from ``self.segment_size``
        and ``self.segment_density``. The check is run with
        ``fold_is_different=True``.
        '''

        annotations, start, end = getRegularAnnotations()

        # workspace twice as large as needed
        workspacelist = gat.SegmentList(iter=[(start - 100, 2 * end + 100)],
                                        normalize=True)

        segmentlist = getRegularSegments(workspacelist.sum(),
                                         self.segment_size,
                                         self.segment_density)

        ss, aa, ww = self.addSingleIsochore(segmentlist, annotations,
                                            workspacelist)

        self.check(ss,
                   aa,
                   ww,
                   "testAscertainmentBiasFail1",
                   fold_is_different=True)
Exemplo n.º 12
0
    def computeExpectedCoverageOld(self, samples):
        '''Older variant of the expected-coverage computation.

        Returns a tuple
        ``(nsegments, segment_overlap, segment_density, segment_length,
        expected)`` where

        * nsegments: number of segments (as float)
        * segment_overlap: segment nucleotides inside the workspace
        * segment_density: segment_overlap / workspace nucleotides
        * segment_length: summed segment size / nsegments
        * expected: len(samples) * segment_overlap / workspace nucleotides
        '''

        # number of segments
        nsegments = float(len(self.segments))

        # number of nucleotides in segments overlapping the workspace;
        # work on a clone so that the workspace itself is not modified
        tmp = gat.SegmentList(clone=self.workspace)
        tmp.intersect(self.segments)
        segment_overlap = tmp.sum()

        # total size of the complete segments, computed the same way as
        # in computeExpectedCoverage; needed for the average length below
        segment_size = self.segments.sum()

        workspace_size = self.workspace.sum()

        # density of segment nucleotides in workspace
        segment_density = float(segment_overlap) / workspace_size

        # average segment length
        segment_length = segment_size / nsegments

        # expected coverage of segments
        expected = len(samples) * segment_overlap / float(workspace_size)

        return nsegments, segment_overlap, segment_density, segment_length, expected
Exemplo n.º 13
0
def getIncrementallySpacedSegments(workspace_size, size, density):
    '''Return segments of length *size* with steadily growing gaps.

    The number of segments is ``int(workspace_size * density / size)``.
    The first segment starts ``increment`` bases into the workspace;
    the gap before each following segment is ``increment`` larger than
    the previous one. ``increment`` is the uncovered part of the
    workspace divided (rounded down) by ``sum(range(nsegments + 2))``.

    Arguments
    ---------
    workspace_size : size of the workspace the segments are laid out in
    size : length of each segment
    density : fraction of the workspace to be covered by segments

    Returns a normalized ``gat.SegmentList``.

    Raises ``AssertionError`` if the computed increment is not positive.
    '''

    nsegments = int(workspace_size * density / size)
    rest = workspace_size * (1.0 - density)
    ngaps = nsegments + 2

    # ``rest`` is a float, so ``//`` returns a (whole-valued) float.
    # Validate first, then convert once so that all segment coordinates
    # are built from integers rather than drifting to floats.
    raw_increment = rest // sum(range(ngaps))
    assert raw_increment > 0
    increment = int(raw_increment)

    distance = increment
    start = distance
    s = []
    for _ in range(nsegments):
        s.append((start, start + size))
        # every gap is one increment wider than the one before it
        distance += increment
        start += distance + size

    return gat.SegmentList(iter=s, normalize=True)
Exemplo n.º 14
0
    def testChromosomalBiasFail(self):
        '''Segments confined to the first half of a doubled workspace.

        Two copies of the regular annotations are laid out one after
        the other inside a single workspace that runs from
        ``start - 100`` to ``2 * end + 100``. Segments are only spread
        over a range the size of the first copy. The check is run with
        ``fold_is_different=True``.
        '''

        annotations1, start, end = getRegularAnnotations()
        annotations2, start, end = getRegularAnnotations()

        # size of the workspace needed for ONE copy of the annotations:
        # the annotated range padded by 100 bases on either side
        single_size = end - start + 200

        # concatenate annotations: getRegularAnnotations returns a
        # dictionary of segment lists, so the second copy is shifted and
        # appended list by list.
        # NOTE(review): the shift offset is reconstructed as the size of
        # a single-copy workspace, which places the second copy inside
        # the doubled workspace below -- confirm against the intended
        # layout.
        for key, second_copy in annotations2.items():
            second_copy.shift(single_size)
            combined = annotations1[key]
            combined.extend(second_copy)
            combined.normalize()

        workspacelist = gat.SegmentList(iter=[(start - 100, 2 * end + 100)],
                                        normalize=True)

        # segments only in first part of workspace
        segmentlist = getRegularSegments(single_size, self.segment_size,
                                         self.segment_density)

        ss, aa, ww = self.addSingleIsochore(segmentlist, annotations1,
                                            workspacelist)

        self.check(ss,
                   aa,
                   ww,
                   "testChromosomalBiasFail",
                   fold_is_different=True)
Exemplo n.º 15
0
def getRegularAnnotations(
        sizes=(100, 200, 400, 800, 1600, 3200, 6400, 128000, 256000, 512000),
        nsegments=20,
        distance=100):
    '''Build one annotation track per entry of *sizes*.

    Intervals are laid out on a single coordinate axis in *nsegments*
    rounds. In every round one interval of each size is placed, in the
    order given by *sizes*, with *distance* bases between consecutive
    intervals. The first interval starts at *distance*.

    Returns a tuple ``(annotations, start, end)``: a dictionary mapping
    ``"size%06i" % size`` to a normalized segment list with the
    intervals of that size, the start of the first interval and the
    end of the last interval.
    '''
    tracks = [[] for _ in range(len(sizes))]

    cursor = distance
    for _ in range(nsegments):
        for track, width in zip(tracks, sizes):
            track.append((cursor, cursor + width))
            cursor += width + distance

    annotations = {
        "size%06i" % width: gat.SegmentList(iter=track, normalize=True)
        for width, track in zip(sizes, tracks)
    }

    # cursor sits one gap past the end of the last interval
    return annotations, distance, cursor - distance
Exemplo n.º 16
0
    def validate(self, samples):
        '''Compare sampled segment lists against the expected coverage.

        Four checks are performed; each failing check emits a warning
        through ``E.warn``:

        * nucleotide_ok: every sample overlaps the workspace by exactly
          ``expected_overlap`` nucleotides
        * overlap_ok: the relative deviation of the mean overlap from
          ``expected_overlap`` is below ``self.stringency_level``
        * average_ok: the relative deviation of the mean per-position
          coverage from ``expected_coverage`` is below
          ``self.stringency_level``
        * uniform_ok: the standard deviation of the coverage divided by
          ``expected_coverage`` is below ``self.stringency_level``

        Note that every sample is normalized in place.

        Returns a tab-separated string with the fields nucleotide_ok,
        average_ok, uniform_ok, overlap_ok (as 0/1), average_d,
        uniform_d, segment_density, average_overlap, expected_overlap,
        average_coverage and expected_coverage.
        '''

        # filename = getPlotFilename()

        # compute expected coverage
        (nsegments, segment_size, segment_density,
         expected_overlap,
         expected_segment_length,
         expected_coverage) = self.computeExpectedCoverage(samples)

        # compute actual coverage counts
        # (only counts_within_workspace is used below)
        counts_within_workspace, segment_sizes, starts, ends = computeSegmentDensityProfile(self.workspace,
                                                                                            samples)

        # plotCounts( None,
        #             counts_within_workspace,
        #             segment_sizes,
        #             starts, ends,
        #             self.workspace,
        #             expected_coverage = expected_coverage,
        #             density = segment_density )

        ##################################
        # check if each sample has the correct number of nucleotides
        nucleotide_ok = True

        # NOTE(review): ``sums`` is not used anywhere below
        sums = [x.sum() for x in samples]
        overlaps = []
        for x, s in enumerate(samples):
            # intersect a copy of the workspace with the sample; the
            # sample itself is normalized in place first
            tmp = gat.SegmentList(clone=self.workspace, normalize=True)
            s.normalize()
            tmp.intersect(s)
            ovl = tmp.sum()
            overlaps.append(ovl)
            if ovl != expected_overlap:
                nucleotide_ok = False
                E.warn("incorrect number of nucleotides in sample %i, got %i, expected %i, %s" %
                       (x, ovl, expected_overlap, samples[x]))

        ##################################
        # check if average overlap is ok
        # (relative deviation of the mean overlap from the expectation)
        overlap_ok = True
        average_overlap = numpy.mean(overlaps)
        average_overlap_d = abs(
            average_overlap - expected_overlap) / float(expected_overlap)
        if average_overlap_d >= self.stringency_level:
            overlap_ok = False
            E.warn("expected_overlap counts (%f) != sampled counts (%f)" % (expected_overlap,
                                                                            average_overlap))

        ##################################
        # check average coverage
        # (relative deviation of the mean coverage from the expectation)
        average_ok = True
        average_coverage = counts_within_workspace.mean()
        average_d = abs(average_coverage - expected_coverage) / \
            float(expected_coverage)

        if average_d >= self.stringency_level:
            average_ok = False
            E.warn("expected_coverage counts (%f) != sampled counts (%f)" % (expected_coverage,
                                                                             counts_within_workspace.mean()))

        # check for uniform coverage
        # (standard deviation of the coverage relative to the expectation)
        uniform_ok = True
        stddev = numpy.std(counts_within_workspace)
        uniform_d = stddev / float(expected_coverage)

        if uniform_d >= self.stringency_level:
            uniform_ok = False
            E.warn("coverage variation too large : stddev (%f) / %f = %f > %f" %
                   (stddev,
                    expected_coverage,
                    uniform_d,
                    self.stringency_level))

        # flags are written as 0/1, metrics with four decimals
        return "\t".join(("%i" % nucleotide_ok,
                          "%i" % average_ok,
                          "%i" % uniform_ok,
                          "%i" % overlap_ok,
                          "%6.4f" % average_d,
                          "%6.4f" % uniform_d,
                          "%6.4f" % segment_density,
                          "%6.4f" % average_overlap,
                          "%6.4f" % expected_overlap,
                          "%6.4f" % average_coverage,
                          "%6.4f" % expected_coverage))
Exemplo n.º 17
0
def getRegularSegmentsInAnnotations(annotations, segment_size,
                                    segment_density):
    '''Get a collection of segments overlapping with annotations.

    All annotation tracks are merged into one normalized list of
    intervals. Regular segments are generated for a "virtual" workspace
    whose size is the summed size of the merged intervals, and are then
    laid down onto the merged intervals one after the other.

    Segments straddling annotations are truncated, i.e. split at the
    end of an interval and continued in the next one.

    Arguments
    ---------
    annotations : dictionary of segment lists
    segment_size : length of each segment
    segment_density : fraction of the annotated bases to cover

    Returns a normalized ``gat.SegmentList``.
    '''

    # collapse all annotation tracks into a single normalized list
    merged = gat.SegmentList()
    for x, i in annotations.items():
        merged.extend(i)
    merged.normalize()

    # same spacing computation as in getRegularSegments
    workspace_size = merged.sum()
    nsegments = int(workspace_size * segment_density / segment_size)
    rest = workspace_size * (1.0 - segment_density)
    distance = rest // (nsegments + 2)

    # get regularly placed segments in a virtual workspace
    segments = getRegularSegments(workspace_size, segment_size,
                                  segment_density)

    # size of annotations
    # NOTE(review): ``n`` is not used anywhere below
    n = 0
    # index of the merged interval currently being filled
    m = 0
    # output intervals
    s = []
    # gap that still has to be skipped before the next segment is placed
    # NOTE(review): overhang is only initialised here and never reset
    # inside the loop, so the gap is skipped once, before the first
    # segment -- confirm this is intended.
    overhang = distance
    annotation_start, annotation_end = merged[m]
    lannotation = annotation_end - annotation_start

    # NOTE(review): none of the ``merged[m]`` lookups below is
    # bounds-checked; presumably this raises if the merged intervals
    # are exhausted -- confirm.
    for start, end in segments:

        lsegment = end - start

        # place intergap segment, split if necessary
        while overhang > 0:

            if overhang < lannotation:
                # gap fits into the current interval: advance within it
                annotation_start += overhang
                overhang = 0
                break

            # gap consumes the rest of the interval: move to the next
            overhang -= lannotation
            m += 1
            annotation_start, annotation_end = merged[m]
            lannotation = annotation_end - annotation_start

        # remaining room in the current interval
        lannotation = annotation_end - annotation_start

        # place segment - split if necessary
        while lsegment > 0:

            if lsegment < lannotation:
                # segment fits into the current interval
                s.append((annotation_start, annotation_start + lsegment))
                annotation_start += lsegment
                break

            # segment fills the rest of the interval: emit that part and
            # continue with the remainder in the next interval
            s.append((annotation_start, annotation_end))

            lsegment -= lannotation
            m += 1
            annotation_start, annotation_end = merged[m]
            lannotation = annotation_end - annotation_start

        # remaining room in the current interval
        lannotation = annotation_end - annotation_start

    segments = gat.SegmentList(iter=s, normalize=True)

    # NOTE(review): the result of this call is not used
    noverlap = segments.overlapWithSegments(merged)
    return segments