Exemplo n.º 1
    def test_limits(self):
        """Check line graphs.

        Plots three periodic series (sin, cos and 2*sin(2x)) as line graphs
        in one graph set, then writes a linear and a circular rendering of
        the diagram to PDF files in the Graphics directory.
        """
        # TODO - Fix GD so that the same min/max is used for all three lines?
        points = 1000
        scale = math.pi * 2.0 / points
        data1 = [math.sin(x*scale) for x in range(points)]
        data2 = [math.cos(x*scale) for x in range(points)]
        data3 = [2*math.sin(2*x*scale) for x in range(points)]

        gdd = Diagram('Test Diagram', circular=False,
                      y=0.01, yt=0.01, yb=0.01,
                      x=0.01, xl=0.01, xr=0.01)
        gdt_data = gdd.new_track(1, greytrack=False)
        gds_data = gdt_data.new_set("graph")
        for data_values, name, color in zip([data1, data2, data3],
                                            ["sin", "cos", "2sin2"],
                                            ["red", "green", "blue"]):
            data = list(zip(range(points), data_values))
            gds_data.new_graph(data, "", style="line",
                               color=color, altcolor=color,
                               center=0)

        # Linear diagram
        # NOTE(review): the opening line of both draw() calls was missing;
        # the format values are assumed from the output file names and the
        # "Circular diagram" comment -- confirm against the GenomeDiagram API.
        gdd.draw(format="linear",
                 pagesize=(15*cm, 15*cm),
                 start=0, end=points)
        gdd.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")
        # Circular diagram
        gdd.draw(format="circular",
                 pagesize=(15*cm, 15*cm),
                 circular=True,  # Data designed to be periodic
                 start=0, end=points, circle_core=0.5)
        gdd.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
 def test_get_db_items(self):
     """Check list, keys, length etc.

     Compares the length reported by every view of ``self.db``, checks that
     keys and record ids line up with ``db.items()``, then deletes every
     entry and verifies that deleting an unknown key raises KeyError.
     """
     db = self.db
     items = list(db.values())
     keys = list(db)
     length = len(items)
     self.assertEqual(length, len(db))
     self.assertEqual(length, len(list(db)))
     self.assertEqual(length, len(list(db.items())))
     self.assertEqual(length, len(list(db.keys())))
     self.assertEqual(length, len(list(db.values())))
     if sys.version_info[0] == 2:
         # Check legacy methods for Python 2 as well:
         self.assertEqual(length, len(list(db.iteritems())))
         self.assertEqual(length, len(list(db.iterkeys())))
         self.assertEqual(length, len(list(db.itervalues())))
     for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
         self.assertEqual(k1, k2)
         self.assertEqual(r1.id, r2.id)
     for k in keys:
         del db[k]
     self.assertEqual(0, len(db))
     # Deleting a missing entry must fail loudly rather than pass silently.
     with self.assertRaises(KeyError):
         del db["non-existant-name"]
    def common_ancestor(self, targets, *more_targets):
        """Most recent common ancestor (clade) of all the given targets.

        Edge cases:
        - If no target is given, returns self.root
        - If 1 target is given, returns the target
        - If any target is not found in this tree, raises a ValueError
        """
        # Materialise the combined targets once so each path can be paired
        # with the target it belongs to in the validation below.
        all_targets = list(_combine_args(targets, *more_targets))
        paths = [self.get_path(t) for t in all_targets]
        # Validation -- otherwise zip throws a spooky error below
        for p, t in zip(paths, all_targets):
            if p is None:
                raise ValueError("target %s is not in this tree" % repr(t))
        mrca = self.root
        # Walk all paths in lockstep, one tree level at a time.
        for level in zip(*paths):
            ref = level[0]
            for other in level[1:]:
                if ref is not other:
                    break
            else:
                # Every path agrees at this level, so it is still shared.
                mrca = ref
            if ref is not mrca:
                # The paths have diverged; the previous level was the MRCA.
                break
        return mrca
def get_fasta_stats(probes, fa_fname):
    """Calculate GC and RepeatMasker content of each bin in the FASTA genome.

    Extracts the subsequence for each (chromosome, start, end) of ``probes``
    from ``fa_fname`` and passes it to ``calculate_gc_lo``.

    Returns a pair of float arrays: (GC values, RepeatMasker values).
    """
    fa_coords = zip(probes.chromosome, probes.start, probes.end)
    logging.info("Calculating GC and RepeatMasker content in %s ...", fa_fname)
    gc_rm_vals = [calculate_gc_lo(subseq)
                  for subseq in ngfrills.fasta_extract_regions(fa_fname,
                                                               fa_coords)]
    if not gc_rm_vals:
        # zip(*[]) cannot be unpacked into two names; return empty arrays.
        return np.asarray([], dtype=np.float64), np.asarray([], dtype=np.float64)
    gc_vals, rm_vals = zip(*gc_rm_vals)
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is
    # the documented replacement.
    return (np.asarray(gc_vals, dtype=np.float64),
            np.asarray(rm_vals, dtype=np.float64))
def _get_inter_coords(coords, strand=1):
    """From the given pairs of coordinates, returns a list of pairs
    covering the intervening ranges."""
    # adapted from Python's itertools guide
    # if strand is -1, adjust coords to the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))
Arquivo: fix.py Projeto: roryk/cnvkit
def match_ref_to_probes(ref_pset, probes):
    """Filter the reference probes to match the target or antitarget probe set.

    Builds a label -> row lookup from ``ref_pset`` and selects the reference
    row for each label of ``probes``, in the order of ``probes``. The
    selected rows are handed to ``ref_pset.to_rows`` and its result returned.

    Raises KeyError if a label of ``probes`` is absent from ``ref_pset``.
    """
    ref_lookup = dict(zip(ref_pset.labels(), ref_pset))
    ref_matched_rows = [tuple(ref_lookup[label]) for label in probes.labels()]
    ref_matched = ref_pset.to_rows(ref_matched_rows)
    return ref_matched
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    starts -- List of integers, start coordinates.
    start -- Integer, 'Q start' or 'T start' column
    blksizes -- List of integers, block sizes.
    seqlen -- Integer of total sequence length.
    strand -- Integer denoting sequence strand.

    assert len(starts) == len(blksizes), \
            "Unequal start coordinates and block sizes list (%r vs %r)" \
            % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [
            seqlen - start - blksize
            for start, blksize in zip(starts, blksizes)
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    :param starts: start coordinates
    :type starts: list [int]
    :param blksizes: block sizes
    :type blksizes: list [int]
    :param seqlen: sequence length
    :type seqlen: int
    :param strand: sequence strand
    :type strand: int, choice of -1, 0, or 1

    assert len(starts) == len(blksizes), \
            "Unequal start coordinates and block sizes list (%r vs %r)" \
            % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize for
                start, blksize in zip(starts, blksizes)]
 def loop(self, filename, format):
     """Round-trip the records of a file through the database and GenBank output.

     Parses ``filename`` in ``format``, loads the records into a new
     database namespace, looks them up again and compares them, then writes
     them in GenBank format to an in-memory handle and compares the
     re-parsed records with the originals.
     """
     original_records = list(SeqIO.parse(filename, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                           user=DBUSER, passwd=DBPASSWD,
                                           host=DBHOST, db=TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     # Now read them back...
     biosql_records = [db.lookup(name=rec.name) for rec in original_records]
     # And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     # Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     # Now read them back...
     # Rewind first: after writing, the handle is positioned at its end and
     # parsing from there would yield no records.
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     # And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
         # TODO - remove this hack because we don't yet write these (yet):
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     # Done
     handle.close()
    def format_phylip(self, handle):
        """Write data in Phylip format to a given file-like object or handle.

        The output stream is the input distance matrix format used with Phylip
        programs (e.g. 'neighbor').

        :Parameters:
            handle : file or file-like object
                A writeable file handle or other object supporting the 'write'
                method, such as StringIO or sys.stdout. On Python 3, should be
                open in text mode.

        """
        handle.write("    {0}\n".format(len(self.names)))
        # Phylip needs space-separated, vertically aligned columns
        name_width = max(12, max(map(len, self.names)) + 1)
        value_fmts = ("{" + str(x) + ":.4f}"
                      for x in range(1, len(self.matrix) + 1))
        row_fmt = "{0:" + str(name_width) + "s}" + "  ".join(value_fmts) + "\n"
        for i, (name, values) in enumerate(zip(self.names, self.matrix)):
            # Mirror the matrix values across the diagonal
            mirror_values = (self.matrix[j][i]
                             for j in range(i + 1, len(self.matrix)))
            fields = itertools.chain([name], values, mirror_values)
            # Emit the completed row: name followed by the full-width values.
            handle.write(row_fmt.format(*fields))
 def loop(self, filename, format):
     """Round-trip the records of a file through the database and GenBank output.

     Parses ``filename`` in ``format``, loads the records into a new
     database namespace, looks them up again and compares them, then writes
     them in GenBank format to an in-memory handle and compares the
     re-parsed records with the originals.
     """
     original_records = list(SeqIO.parse(filename, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                           user=DBUSER, passwd=DBPASSWD,
                                           host=DBHOST, db=TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     # Now read them back...
     biosql_records = [db.lookup(name=rec.name) for rec in original_records]
     # And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     # Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     # Now read them back...
     # Rewind first: after writing, the handle is positioned at its end and
     # parsing from there would yield no records.
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     # And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
         # TODO - remove this hack because we don't yet write these (yet):
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     # Done
     handle.close()
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Returns the length of introns between fragments."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
            intron_len = 0


    return inter_lens
    @classmethod
    def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
        """Create a randomized bifurcating tree given a list of taxa.

        :param taxa: Either an integer specifying the number of taxa to create
            (automatically named taxon#), or an iterable of taxon names, as
            strings.
        :param branch_length: length given to each new branch.
        :param branch_stdev: if set, new branch lengths are drawn from a
            Gaussian around ``branch_length`` with this standard deviation
            and clipped at 0.

        :returns: a tree of the same type as this class.
        :raises TypeError: if ``taxa`` is neither an integer nor iterable.
        """
        if isinstance(taxa, int):
            taxa = ['taxon%s' % (i+1) for i in range(taxa)]
        elif hasattr(taxa, '__iter__'):
            taxa = list(taxa)
        else:
            raise TypeError("taxa argument must be integer (# taxa) or "
                            "iterable of taxon names.")
        rtree = cls()
        terminals = [rtree.root]
        while len(terminals) < len(taxa):
            newsplit = random.choice(terminals)
            # NOTE(review): assumes the clade API offers
            # split(branch_length=...) creating two children -- confirm.
            newsplit.split(branch_length=branch_length)
            newterms = newsplit.clades
            if branch_stdev:
                # Add some noise to the branch lengths
                for nt in newterms:
                    nt.branch_length = max(0,
                            random.gauss(branch_length, branch_stdev))
            # The split node is now internal; its children replace it.
            terminals.remove(newsplit)
            terminals.extend(newterms)
        # Distribute taxon labels randomly
        random.shuffle(taxa)
        for node, name in zip(terminals, taxa):
            node.name = name
        return rtree
    def format_phylip(self, handle):
        """Write data in Phylip format to a given file-like object or handle.

        The output stream is the input distance matrix format used with Phylip
        programs (e.g. 'neighbor').

        :Parameters:
            handle : file or file-like object
                A writeable file handle or other object supporting the 'write'
                method, such as StringIO or sys.stdout. On Python 3, should be
                open in text mode.

        """
        handle.write("    {0}\n".format(len(self.names)))
        # Phylip needs space-separated, vertically aligned columns
        name_width = max(12, max(map(len, self.names)) + 1)
        value_fmts = ("{" + str(x) + ":.4f}"
                      for x in range(1, len(self.matrix) + 1))
        row_fmt = "{0:" + str(name_width) + "s}" + "  ".join(value_fmts) + "\n"
        for i, (name, values) in enumerate(zip(self.names, self.matrix)):
            # Mirror the matrix values across the diagonal
            mirror_values = (self.matrix[j][i]
                             for j in range(i + 1, len(self.matrix)))
            fields = itertools.chain([name], values, mirror_values)
            # Emit the completed row: name followed by the full-width values.
            handle.write(row_fmt.format(*fields))
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates (PRIVATE).

    :param starts: start coordinates
    :type starts: list [int]
    :param blksizes: block sizes
    :type blksizes: list [int]
    :param seqlen: sequence length
    :type seqlen: int
    :param strand: sequence strand
    :type strand: int, choice of -1, 0, or 1

    if len(starts) != len(blksizes):
        raise RuntimeError("Unequal start coordinates and block sizes list"
                           " (%r vs %r)" % (len(starts), len(blksizes)))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [
            seqlen - start - blksize
            for start, blksize in zip(starts, blksizes)
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Return the length of introns between fragments (PRIVATE)."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
            intron_len = 0


    return inter_lens
    @classmethod
    def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
        """Create a randomized bifurcating tree given a list of taxa.

        :param taxa: Either an integer specifying the number of taxa to create
            (automatically named taxon#), or an iterable of taxon names, as
            strings.
        :param branch_length: length given to each new branch.
        :param branch_stdev: if set, new branch lengths are drawn from a
            Gaussian around ``branch_length`` with this standard deviation
            and clipped at 0.

        :returns: a tree of the same type as this class.
        :raises TypeError: if ``taxa`` is neither an integer nor iterable.
        """
        if isinstance(taxa, int):
            taxa = ['taxon%s' % (i + 1) for i in range(taxa)]
        elif hasattr(taxa, '__iter__'):
            taxa = list(taxa)
        else:
            raise TypeError("taxa argument must be integer (# taxa) or "
                            "iterable of taxon names.")
        rtree = cls()
        terminals = [rtree.root]
        while len(terminals) < len(taxa):
            newsplit = random.choice(terminals)
            # NOTE(review): assumes the clade API offers
            # split(branch_length=...) creating two children -- confirm.
            newsplit.split(branch_length=branch_length)
            newterms = newsplit.clades
            if branch_stdev:
                # Add some noise to the branch lengths
                for nt in newterms:
                    nt.branch_length = max(
                        0, random.gauss(branch_length, branch_stdev))
            # The split node is now internal; its children replace it.
            terminals.remove(newsplit)
            terminals.extend(newterms)
        # Distribute taxon labels randomly
        random.shuffle(taxa)
        for node, name in zip(terminals, taxa):
            node.name = name
        return rtree
Arquivo: fix.py Projeto: roryk/cnvkit
    def get_edge(chrom, tgt_start, tgt_end, insert_size):
        """Quantify the "edge effect" of the target tile and its neighbors.

        The result is proportional to the change in the target's coverage due to
        these edge effects, i.e. the expected loss of coverage near the target
        edges and, if there are close neighboring tiles, gain of coverage due
        to "spill over" reads from the neighbor tiles.

        (This is not the actual change in coverage. This is just a tribute.)
        """
        margin_start = tgt_start - insert_size
        margin_end = tgt_end + insert_size
        tile_starts = chrom_tile_starts[chrom]
        tile_ends = chrom_tile_ends[chrom]
        target_size = (tgt_end - tgt_start)

        # Calculate coverage loss at (both) tile edges
        loss = edge_loss(target_size, insert_size)

        # For each neighbor tile, calculate coverage gain to the target
        gaps_left = []
        gaps_right = []
        # Find the leftmost tile in the margin
        left_idx = max(0, bisect.bisect_left(tile_ends, margin_start) - 1)
        for (tile_start, tile_end) in zip(tile_starts[left_idx:],
                                          tile_ends[left_idx:]):
            if tile_end <= margin_start:
                # No overlap on the 5' end -- keep moving forward
                continue
            if tile_start >= margin_end:
                # No overlap on the 3' end -- we're done
                break
            if tile_start == tgt_start and tile_end == tgt_end:
                # The target itself
                continue
            # Tile is within margins
            if margin_start <= tile_end <= tgt_start:
                # Left neighbor
                gaps_left.append(tgt_start - tile_end)
            elif tgt_end <= tile_start <= margin_end:
                # Right neighbor
                gaps_right.append(tile_start - tgt_end)
            elif tile_start < tgt_start and tile_end >= tgt_start:
                # Overlap on left side -- treat as adjacent
                gaps_left.append(0)
            elif tile_start <= tgt_end and tile_end > tgt_end:
                # Overlap on right side -- treat as adjacent
                gaps_right.append(0)
            else:
                # DBG: This should probably never happen
                echo("Oddly positioned tile (%s:%d-%d) vs. target (%d-%d)" %
                     (chrom, tile_start, tile_end, tgt_start, tgt_end))
        gain = 0
        # Only the nearest neighbor on each side contributes spill-over.
        if gaps_left:
            gain += edge_gain(target_size, insert_size, min(gaps_left))
        if gaps_right:
            gain += edge_gain(target_size, insert_size, min(gaps_right))
        return gain - loss
    def get_edge(chrom, tgt_start, tgt_end, insert_size):
        """Quantify the "edge effect" of the target tile and its neighbors.

        The result is proportional to the change in the target's coverage due to
        these edge effects, i.e. the expected loss of coverage near the target
        edges and, if there are close neighboring tiles, gain of coverage due
        to "spill over" reads from the neighbor tiles.

        (This is not the actual change in coverage. This is just a tribute.)
        """
        margin_start = tgt_start - insert_size
        margin_end = tgt_end + insert_size
        tile_starts = chrom_tile_starts[chrom]
        tile_ends = chrom_tile_ends[chrom]
        target_size = (tgt_end - tgt_start)

        # Calculate coverage loss at (both) tile edges
        loss = edge_loss(target_size, insert_size)

        # For each neighbor tile, calculate coverage gain to the target
        gaps_left = []
        gaps_right = []
        # Find the leftmost tile in the margin
        left_idx = max(0, bisect.bisect_left(tile_ends, margin_start) - 1)
        for (tile_start, tile_end) in zip(tile_starts[left_idx:],
                                          tile_ends[left_idx:]):
            if tile_end <= margin_start:
                # No overlap on the 5' end -- keep moving forward
                continue
            if tile_start >= margin_end:
                # No overlap on the 3' end -- we're done
                break
            if tile_start == tgt_start and tile_end == tgt_end:
                # The target itself
                continue
            # Tile is within margins
            if margin_start <= tile_end <= tgt_start:
                # Left neighbor
                gaps_left.append(tgt_start - tile_end)
            elif tgt_end <= tile_start <= margin_end:
                # Right neighbor
                gaps_right.append(tile_start - tgt_end)
            elif tile_start < tgt_start and tile_end >= tgt_start:
                # Overlap on left side -- treat as adjacent
                gaps_left.append(0)
            elif tile_start <= tgt_end and tile_end > tgt_end:
                # Overlap on right side -- treat as adjacent
                gaps_right.append(0)
            else:
                # DBG: This should probably never happen
                logging.info("Oddly positioned tile (%s:%d-%d) vs. target (%d-%d)",
                             chrom, tile_start, tile_end, tgt_start, tgt_end)
        gain = 0
        # Only the nearest neighbor on each side contributes spill-over.
        if gaps_left:
            gain += edge_gain(target_size, insert_size, min(gaps_left))
        if gaps_right:
            gain += edge_gain(target_size, insert_size, min(gaps_right))
        return gain - loss
    def test_limits(self):
        """Check line graphs.

        Plots three periodic series (sin, cos and 2*sin(2x)) as line graphs
        in one graph set, then writes a linear and a circular rendering of
        the diagram to PDF files in the Graphics directory.
        """
        # TODO - Fix GD so that the same min/max is used for all three lines?
        points = 1000
        scale = math.pi * 2.0 / points
        data1 = [math.sin(x * scale) for x in range(points)]
        data2 = [math.cos(x * scale) for x in range(points)]
        data3 = [2 * math.sin(2 * x * scale) for x in range(points)]

        gdd = Diagram('Test Diagram', circular=False,
                      y=0.01, yt=0.01, yb=0.01,
                      x=0.01, xl=0.01, xr=0.01)
        gdt_data = gdd.new_track(1, greytrack=False)
        gds_data = gdt_data.new_set("graph")
        for data_values, name, color in zip([data1, data2, data3],
                                            ["sin", "cos", "2sin2"],
                                            ["red", "green", "blue"]):
            data = list(zip(range(points), data_values))
            gds_data.new_graph(data, "", style="line",
                               color=color, altcolor=color,
                               center=0)

        # Linear diagram
        # NOTE(review): the opening line of both draw() calls was missing;
        # the format values are assumed from the output file names and the
        # "Circular diagram" comment -- confirm against the GenomeDiagram API.
        gdd.draw(format="linear",
                 pagesize=(15 * cm, 15 * cm),
                 start=0, end=points)
        gdd.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")
        # Circular diagram
        gdd.draw(
            format="circular",
            pagesize=(15 * cm, 15 * cm),
            circular=True,  # Data designed to be periodic
            start=0, end=points, circle_core=0.5)
        gdd.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
def interval_coverages_count(bed_fname, bam_fname):
    """Calculate log2 coverages in the BAM file at each interval.

    Yields one ``[count, (chrom, start, end, name, log2_depth)]`` item per
    region parsed from ``bed_fname``; regions with zero depth get
    NULL_LOG2_COVERAGE instead of a logarithm.
    """
    bamfile = pysam.Samfile(bam_fname, 'rb')
    try:
        # Parse the BED lines and group them by chromosome
        # (efficient if records are already sorted by chromosome)
        for chrom, rows_iter in groupby(ngfrills.parse_regions(bed_fname),
                                        key=lambda r: r[0]):
            # Thunk and reshape this chromosome's intervals
            echo("Processing chromosome", chrom, "of",
                 os.path.basename(bam_fname))
            _chroms, starts, ends, names = zip(*rows_iter)
            counts_depths = [region_depth_count(bamfile, chrom, s, e)
                             for s, e in zip(starts, ends)]
            for start, end, name, (count, depth) in zip(starts, ends, names,
                                                        counts_depths):
                yield [count,
                       (chrom, start, end, name,
                        math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
    finally:
        # Release the BAM handle once the generator finishes or is closed.
        bamfile.close()
 def __init__(self, line=None):
     self.chromat_file = ''
     self.phd_file = ''
     self.time = ''
     self.chem = ''
     self.dye = ''
     self.template = ''
     self.direction = ''
     if line:
         tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION']
         poss = [line.find(x) for x in tags]
         tagpos = dict(zip(poss, tags))
         if -1 in tagpos:
             del tagpos[-1]
         ps = sorted(tagpos)  # the keys
         for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
             setattr(self, tagpos[p1].lower(), line[p1 + len(tagpos[p1]) + 1:p2].strip())
 def __init__(self, line=None):
     self.chromat_file = ''
     self.phd_file = ''
     self.time = ''
     self.chem = ''
     self.dye = ''
     self.template = ''
     self.direction = ''
     if line:
         tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION']
         poss = [line.find(x) for x in tags]
         tagpos = dict(zip(poss, tags))
         if -1 in tagpos:
             del tagpos[-1]
         ps = sorted(tagpos) # the keys
         for (p1, p2) in zip(ps, ps[1:]+[len(line)+1]):
             setattr(self, tagpos[p1].lower(), line[p1+len(tagpos[p1])+1:p2].strip())
def cnv_on_genome(axis, probes, segments, pad, do_trend=False, y_min=None,
                  y_max=None):
    """Plot coverages and CBS calls for all chromosomes on one plot.

    axis -- plotting axis that receives the scatter points and lines.
    probes -- probe-level data; may be empty/None to plot segments only.
    segments -- segment calls; may be empty/None to plot probes only.
    pad -- passed through to ``plot_x_dividers``.
    do_trend -- if true, overlay a smoothed trend line of the probes.
    y_min, y_max -- y-axis limits; a falsy value is replaced by a limit
        derived from the segments, or by -2.5 / 2.5 without segments.
    """
    # Group probes by chromosome (to calculate plotting coordinates)
    if probes:
        chrom_probe_centers = {chrom: 0.5 * (rows['start'] + rows['end'])
                               for chrom, rows in probes.by_chromosome()}
        chrom_sizes = chromosome_sizes(probes)
    else:
        chrom_sizes = chromosome_sizes(segments)

    # Same for segment calls
    chrom_seg_coords = {chrom: zip(rows['log2'], rows['start'], rows['end'])
                        for chrom, rows in segments.by_chromosome()
                       } if segments else {}

    x_starts = plot_x_dividers(axis, chrom_sizes, pad)
    x = []
    seg_lines = []  # y-val, x-start, x-end
    for chrom, curr_offset in x_starts.items():
        if probes:
            x.extend(chrom_probe_centers[chrom] + curr_offset)
        if chrom in chrom_seg_coords:
            seg_lines.extend((c[0], c[1] + curr_offset, c[2] + curr_offset)
                             for c in chrom_seg_coords[chrom])

    # Configure axes etc.
    axis.set_ylabel("Copy ratio (log2)")
    if not (y_min and y_max):
        if segments:
            # Auto-scale y-axis according to segment mean-coverage values
            seg_auto_vals = segments[(segments.chromosome != 'chr6') &
                                     (segments.chromosome != 'chrY')]['log2']
            if not y_min:
                y_min = min(seg_auto_vals.min() - .2, -1.5)
            if not y_max:
                y_max = max(seg_auto_vals.max() + .2, 1.5)
        else:
            # No segments to scale from; fall back to fixed limits.
            if not y_min:
                y_min = -2.5
            if not y_max:
                y_max = 2.5
    axis.set_ylim(y_min, y_max)

    # Plot points
    if probes:
        axis.scatter(x, probes['log2'], color=POINT_COLOR, edgecolor='none',
                     alpha=0.2, marker='.')
        # Add a local trend line
        if do_trend:
            axis.plot(x, _smooth_genome_log2(probes, smoothing.smoothed, 150),
                      color=POINT_COLOR, linewidth=2, zorder=-1)
    # Plot segments
    for seg_line in seg_lines:
        y1, x1, x2 = seg_line
        axis.plot((x1, x2), (y1, y1),
                  color=SEG_COLOR, linewidth=3, solid_capstyle='round')
def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE)."""
    reg = ''
    for i in zip(*codons):
        if len(set(i)) == 1:
            reg += ''.join(set(i))
            reg += '[' + ''.join(set(i)) + ']'
    return reg
 def test_get_db_items(self):
     """Check list, keys, length etc.

     Verifies that every view of ``self.db`` reports the same length and
     that keys and record ids line up with ``db.items()``, then empties
     the database and checks that deleting an unknown key raises KeyError.
     """
     database = self.db
     records = list(database.values())
     names = list(database)
     expected = len(records)
     self.assertEqual(expected, len(database))
     self.assertEqual(expected, len(list(database.items())))
     self.assertEqual(expected, len(list(database)))
     self.assertEqual(expected, len(list(database.values())))
     # Keys and values collected separately must pair up like items().
     collected = zip(names, records)
     for (name, record), (item_name, item_record) in zip(collected,
                                                         database.items()):
         self.assertEqual(name, item_name)
         self.assertEqual(record.id, item_record.id)
     for name in names:
         del database[name]
     self.assertEqual(0, len(database))
     with self.assertRaises(KeyError):
         del database["non-existant-name"]
def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE)."""
    reg = ''
    for i in zip(*codons):
        if len(set(i)) == 1:
            reg += ''.join(set(i))
            reg += '[' + ''.join(set(i)) + ']'
    return reg
 def test_get_db_items(self):
     """Check list, keys, length etc.

     Verifies that every view of ``self.db`` reports the same length and
     that keys and record ids line up with ``db.items()``, then empties
     the database and checks that deleting an unknown key raises KeyError.
     """
     database = self.db
     records = list(database.values())
     names = list(database)
     expected = len(records)
     self.assertEqual(expected, len(database))
     self.assertEqual(expected, len(list(database.items())))
     self.assertEqual(expected, len(list(database)))
     self.assertEqual(expected, len(list(database.values())))
     # Keys and values collected separately must pair up like items().
     collected = zip(names, records)
     for (name, record), (item_name, item_record) in zip(collected,
                                                         database.items()):
         self.assertEqual(name, item_name)
         self.assertEqual(record.id, item_record.id)
     for name in names:
         del database[name]
     self.assertEqual(0, len(database))
     with self.assertRaises(KeyError):
         del database["non-existant-name"]
def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE)."""
    reg = ""
    for i in zip(*codons):
        if len(set(i)) == 1:
            reg += "".join(set(i))
            reg += "[" + "".join(set(i)) + "]"
    return reg
 def by_chromosome(self):
     """Iterate over probes grouped by chromosome name.

     Yields ``(chromosome, subarray)`` pairs in order of each chromosome's
     first occurrence; each subarray is the slice of ``self.data`` from
     that first occurrence up to the next chromosome's first occurrence,
     wrapped by ``self.to_array``.
     """
     names, first_rows = numpy.unique(self.chromosome, return_index=True)
     # unique() sorts by name; restore the order of first appearance.
     order = first_rows.argsort()
     first_rows = first_rows[order]
     names = names[order]
     # Each group stops where the next one starts; the last runs to the end.
     stops = numpy.concatenate((first_rows[1:], [len(self.data)]))
     for name, lo, hi in zip(names, first_rows, stops):
         yield name, self.to_array(self.data[lo:hi])
 def by_chromosome(self):
     """Iterate over probes grouped by chromosome name.

     Yields ``(chromosome, subarray)`` pairs in order of each chromosome's
     first occurrence; each subarray is the slice of ``self.data`` from
     that first occurrence up to the next chromosome's first occurrence,
     wrapped by ``self.to_array``.
     """
     names, first_rows = numpy.unique(self.chromosome, return_index=True)
     # unique() sorts by name; restore the order of first appearance.
     order = first_rows.argsort()
     first_rows = first_rows[order]
     names = names[order]
     # Each group stops where the next one starts; the last runs to the end.
     stops = numpy.concatenate((first_rows[1:], [len(self.data)]))
     for name, lo, hi in zip(names, first_rows, stops):
         yield name, self.to_array(self.data[lo:hi])
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
    """Convert copy number segments to VCF records.

    This is a generator.  For every segment whose rounded absolute copy
    number differs from the expected copy number it yields a 10-tuple::

        (chromosome, start, '.', 'N', '<SVTYPE>', '.', '.',
         info, format, genotype)

    Segments at the expected copy number are skipped, as are segments
    whose ``probes`` value is not made up of digits.

    ``segments`` must provide ``.data`` (a data frame with the columns
    ``chromosome``, ``end``, ``log2`` and ``probes``), ``.start`` and
    ``.end``.  ``ploidy``, ``is_reference_male`` and ``is_sample_female``
    are passed through to ``call.absolute_dataframe``.
    """
    out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
    abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0,
                                         is_reference_male, is_sample_female)
    out_dframe["ncopies"] = np.rint(abs_dframe["absolute"])
    idx_losses = (out_dframe["ncopies"] < abs_dframe["expect"])

    # A start of 0 is bumped to 1 (presumably because VCF positions are
    # 1-based -- confirm).
    starts = segments.start.copy()
    starts[starts == 0] = 1
    out_dframe["start"] = starts

    # Losses are reported with a negative length.
    svlen = segments.end - segments.start
    svlen[idx_losses] *= -1
    out_dframe["svlen"] = svlen

    out_dframe["svtype"] = "DUP"
    out_dframe.loc[idx_losses, "svtype"] = "DEL"

    out_dframe["format"] = "GT:GQ:CN:CNQ"
    out_dframe.loc[idx_losses, "format"] = "GT:GQ"  # :CN:CNQ ?

    # Reformat this data to create INFO and genotype
    # TODO be more clever about this
    for out_row, abs_row in zip(out_dframe.itertuples(index=False),
                                abs_dframe.itertuples(index=False)):
        if (out_row.ncopies == abs_row.expect or
                # Survive files from buggy v0.7.1 (#53)
                not str(out_row.probes).isdigit()):
            # Skip regions of neutral copy number
            continue  # or "CNV" for subclonal?

        if out_row.ncopies > abs_row.expect:
            genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes)
        elif out_row.ncopies < abs_row.expect:
            # TODO XXX handle non-diploid ploidies, haploid chroms
            if out_row.ncopies == 0:
                # Complete deletion, 0 copies
                gt = "1/1"
            else:
                # Single copy deletion
                gt = "0/1"
            genotype = "%s:%d" % (gt, out_row.probes)

        info = ";".join([
            "SVTYPE=%s" % out_row.svtype,
            "END=%d" % out_row.end,
            "SVLEN=%d" % out_row.svlen,
            # CIPOS=-56,20;CIEND=-10,62
        ])

        yield (out_row.chromosome, out_row.start, '.', 'N',
               "<%s>" % out_row.svtype, '.', '.', info, out_row.format,
               genotype)
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
    """Convert copy number segments to VCF records.

    This is a generator.  For every segment whose rounded absolute copy
    number differs from the expected copy number it yields a 10-tuple::

        (chromosome, start, '.', 'N', '<SVTYPE>', '.', '.',
         info, format, genotype)

    where ``info`` starts with the ``IMPRECISE`` flag.  Segments at the
    expected copy number are skipped, as are segments whose ``probes``
    value is not made up of digits.

    ``segments`` must provide ``.data`` (a data frame with the columns
    ``chromosome``, ``end``, ``log2`` and ``probes``), ``.start`` and
    ``.end``.  ``ploidy``, ``is_reference_male`` and ``is_sample_female``
    are passed through to ``call.absolute_dataframe``.
    """
    out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
    abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0,
                                         is_reference_male, is_sample_female)
    out_dframe["ncopies"] = np.rint(abs_dframe["absolute"])
    idx_losses = (out_dframe["ncopies"] < abs_dframe["expect"])

    # A start of 0 is bumped to 1 (presumably because VCF positions are
    # 1-based -- confirm).
    starts = segments.start.copy()
    starts[starts == 0] = 1
    out_dframe["start"] = starts

    # Losses are reported with a negative length.
    svlen = segments.end - segments.start
    svlen[idx_losses] *= -1
    out_dframe["svlen"] = svlen

    out_dframe["svtype"] = "DUP"
    out_dframe.loc[idx_losses, "svtype"] = "DEL"

    out_dframe["format"] = "GT:GQ:CN:CNQ"
    out_dframe.loc[idx_losses, "format"] = "GT:GQ" # :CN:CNQ ?

    # Reformat this data to create INFO and genotype
    # TODO be more clever about this
    for out_row, abs_row in zip(out_dframe.itertuples(index=False),
                                abs_dframe.itertuples(index=False)):
        if (out_row.ncopies == abs_row.expect or
            # Survive files from buggy v0.7.1 (#53)
            not str(out_row.probes).isdigit()):
            # Skip regions of neutral copy number
            continue  # or "CNV" for subclonal?

        if out_row.ncopies > abs_row.expect:
            genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes)
        elif out_row.ncopies < abs_row.expect:
            # TODO XXX handle non-diploid ploidies, haploid chroms
            if out_row.ncopies == 0:
                # Complete deletion, 0 copies
                gt = "1/1"
            else:
                # Single copy deletion
                gt = "0/1"
            genotype = "%s:%d" % (gt, out_row.probes)

        info = ";".join(["IMPRECISE",
                         "SVTYPE=%s" % out_row.svtype,
                         "END=%d" % out_row.end,
                         "SVLEN=%d" % out_row.svlen,
                         # CIPOS=-56,20;CIEND=-10,62
                         ])

        yield (out_row.chromosome, out_row.start, '.', 'N',
               "<%s>" % out_row.svtype, '.', '.',
               info, out_row.format, genotype)
 def test_get_db_items(self):
     """Check list, keys, length etc"""
     db = self.db
     items = list(db.values())
     keys = list(db)
     l = len(items)
     self.assertEqual(l, len(db))
     self.assertEqual(l, len(list(db.items())))
     self.assertEqual(l, len(list(db)))
     self.assertEqual(l, len(list(db.values())))
     for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
         self.assertEqual(k1, k2)
         self.assertEqual(r1.id, r2.id)
     for k in keys:
         del db[k]
     self.assertEqual(0, len(db))
         del db["non-existant-name"]
         assert False, "Should have raised KeyError"
     except KeyError:
 def test_get_db_items(self):
     """Check list, keys, length etc"""
     db = self.db
     items = list(db.values())
     keys = list(db.keys())
     l = len(items)
     self.assertEqual(l, len(db))
     self.assertEqual(l, len(list(db.items())))
     self.assertEqual(l, len(list(db.keys())))
     self.assertEqual(l, len(list(db.values())))
     for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
         self.assertEqual(k1, k2)
         self.assertEqual(r1.id, r2.id)
     for k in keys:
         del db[k]
     self.assertEqual(0, len(db))
         del db["non-existant-name"]
         assert False, "Should have raised KeyError"
     except KeyError:
def interval_coverages_count(bed_fname, bam_fname):
    """Calculate log2 coverages in the BAM file at each interval.

    The regions parsed from ``bed_fname`` are grouped by their first
    field (the chromosome) and ``region_depth_count`` is evaluated for
    each region against the opened BAM file.

    This is a generator; for every region it yields a two-item list
    ``[count, (chrom, start, end, name, log2_depth)]`` where
    ``log2_depth`` is ``math.log(depth, 2)``, or ``NULL_LOG2_COVERAGE``
    when the depth is zero/falsy.
    """
    bamfile = pysam.Samfile(bam_fname, 'rb')
    # Parse the BED lines and group them by chromosome
    # (efficient if records are already sorted by chromosome)
    for chrom, rows_iter in groupby(ngfrills.parse_regions(bed_fname),
                                    key=lambda r: r[0]):
        # Thunk and reshape this chromosome's intervals
        echo("Processing chromosome", chrom, "of", os.path.basename(bam_fname))
        _chroms, starts, ends, names = zip(*rows_iter)
        counts_depths = [
            region_depth_count(bamfile, chrom, s, e)
            for s, e in zip(starts, ends)
        ]
        for start, end, name, (count, depth) in zip(starts, ends, names,
                                                    counts_depths):
            # NOTE(review): the closing brackets and the leading ``count``
            # element were restored by hand (``count`` was unpacked but
            # otherwise unused) -- confirm against the callers' expectations.
            yield [
                count,
                (chrom, start, end, name,
                 math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)
            ]
def _gaf10iterator(handle):
    """Read GAF 1.0 format files (PRIVATE).

    Iterates over the lines of ``handle``.  Lines starting with ``!``
    and lines containing no tab are skipped.  Every other line is split
    on tabs, the multi-valued columns are further split on ``|`` into
    lists, and a dict keyed by ``GAF10FIELDS`` is yielded.
    """
    for inline in handle:
        # startswith() also copes with an empty string, which the
        # single-column check below then discards.
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')  # Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
 def __init__(self, line=None):
     """Initialize the class.

     All attributes start out as empty strings.  When ``line`` is given,
     each known tag found in it sets the attribute of the same name in
     lower case to the text between that tag and the next tag (or the
     end of the line), with surrounding whitespace stripped.
     """
     self.chromat_file = ""
     self.phd_file = ""
     self.time = ""
     self.chem = ""
     self.dye = ""
     self.template = ""
     self.direction = ""
     if line:
         tags = [
             "CHROMAT_FILE", "PHD_FILE", "TIME", "CHEM", "DYE", "TEMPLATE",
             "DIRECTION",
         ]
         poss = [line.find(x) for x in tags]
         # Map position -> tag; every tag that was not found collapses
         # onto the key -1, which is dropped.
         tagpos = dict(zip(poss, tags))
         if -1 in tagpos:
             del tagpos[-1]
         ps = sorted(tagpos)  # the keys
         # Each value runs from just past "TAG:" up to the next tag.
         for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
             setattr(self, tagpos[p1].lower(),
                     line[p1 + len(tagpos[p1]) + 1:p2].strip())
def _gaf20iterator(handle):
    """Read GAF 2.0 format files (PRIVATE).

    Iterates over the lines of ``handle``.  Lines starting with ``!``
    and lines containing no tab are skipped.  Every other line is split
    on tabs, the multi-valued columns are further split on ``|`` into
    lists, and a dict keyed by ``GAF20FIELDS`` is yielded.
    """
    for inline in handle:
        # startswith() also copes with an empty string, which the
        # single-column check below then discards.
        if inline.startswith("!"):
            continue
        inrec = inline.rstrip("\n").split("\t")
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split("|")  # Qualifier
        inrec[5] = inrec[5].split("|")  # DB:reference(s)
        inrec[7] = inrec[7].split("|")  # With || From
        inrec[10] = inrec[10].split("|")  # Synonym
        inrec[12] = inrec[12].split("|")  # Taxon
        yield dict(zip(GAF20FIELDS, inrec))
def _gaf20iterator(handle):
    """Read GAF 2.0 format files (PRIVATE).

    Yields one dict per data line of ``handle``, keyed by
    ``GAF20FIELDS``.  Comment lines (leading ``!``) and lines without
    any tab are ignored; the columns that can hold several values are
    turned into lists by splitting on ``|``.
    """
    for inline in handle:
        if inline.startswith("!"):
            # Header / comment line.
            continue
        inrec = inline.rstrip("\n").split("\t")
        if len(inrec) == 1:
            # No tab at all (e.g. a blank line): nothing to parse.
            continue
        inrec[3] = inrec[3].split("|")  # Qualifier
        inrec[5] = inrec[5].split("|")  # DB:reference(s)
        inrec[7] = inrec[7].split("|")  # With || From
        inrec[10] = inrec[10].split("|")  # Synonym
        inrec[12] = inrec[12].split("|")  # Taxon
        yield dict(zip(GAF20FIELDS, inrec))
def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another (PRIVATE).

    The two strings are walked in parallel.  Wherever ``codon_seq`` has
    a character other than a space, the characters at that position are
    exchanged; positions where ``codon_seq`` holds a space are left
    untouched.  Extra characters of the longer string are dropped.

    Returns the two resulting strings as a ``(codon_seq, target_seq)``
    tuple.
    """
    a, b = [], []
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == ' ':
            a.append(char1)
            b.append(char2)
        else:
            a.append(char2)
            b.append(char1)

    return ''.join(a), ''.join(b)
def _gaf10iterator(handle):
    """Read GAF 1.0 format files (PRIVATE).

    Yields one dict per data line of ``handle``, keyed by
    ``GAF10FIELDS``.  Comment lines (leading ``!``) and lines without
    any tab are ignored; the columns that can hold several values are
    turned into lists by splitting on ``|``.
    """
    for inline in handle:
        if inline.startswith('!'):
            # Header / comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # No tab at all (e.g. a blank line): nothing to parse.
            continue
        inrec[3] = inrec[3].split('|')  # Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
def segments2freebayes(segments, sample_name, ploidy, purity, is_reference_male,
                       is_sample_female):
    """Convert a copy number array to a BED-like format.

    This is a generator.  Absolute copy numbers are obtained from
    ``cna_absolutes`` and rounded with ``_round_to_integer`` (halves
    round to zero when ``purity`` is None).  For every segment whose
    rounded copy number differs from ``ploidy`` it yields the tuple
    ``(chromosome, start, end, sample_name, ncopies)``.
    """
    absolutes = cna_absolutes(segments, ploidy, purity, is_reference_male,
                              is_sample_female)
    for row, abs_val in zip(segments, absolutes):
        ncopies = _round_to_integer(abs_val, half_is_zero=purity is None)
        # Ignore regions of neutral copy number
        if ncopies != ploidy:
            yield (row["chromosome"], # reference sequence
                   row["start"], # start (0-indexed)
                   row["end"], # end
                   sample_name, # sample name
                   ncopies) # copy number
def merge_rows(rows):
    """Combine equivalent rows of coverage data across multiple samples.

    Check that probe info matches across all samples, then merge the log2
    coverage values.

    Input: a list of individual rows corresponding to the same probes from
    different coverage files.
    Output: a list starting with the single common Probe object, followed by the
    log2 coverage values from each sample, in order.
    """
    # Split every row into (probe info, coverage) and regroup column-wise.
    probe_infos, coverages = zip(*map(row_to_probe_coverage, rows))
    probe_info = core.check_unique(probe_infos, "probe Name")
    combined_row = [probe_info] + list(coverages)
    return combined_row
def merge_rows(rows):
    """Combine equivalent rows of coverage data across multiple samples.

    Check that probe info matches across all samples, then merge the log2
    coverage values.

    Input: a list of individual rows corresponding to the same probes from
    different coverage files.
    Output: a list starting with the single common Probe object, followed by the
    log2 coverage values from each sample, in order.
    """
    # Each row becomes a (probe info, coverage) pair; zip(*...) turns the
    # pairs into one tuple of probe infos and one tuple of coverages.
    probe_infos, coverages = zip(*map(row_to_probe_coverage, rows))
    # All samples must describe the same probe.
    probe_info = core.check_unique(probe_infos, "probe Name")
    return [probe_info] + list(coverages)
 def test_get_db_items(self):
     """Check list, keys, length etc."""
     db = self.db
     items = list(db.values())
     keys = list(db)
     length = len(items)
     self.assertEqual(length, len(db))
     self.assertEqual(length, len(list(db)))
     self.assertEqual(length, len(list(db.items())))
     self.assertEqual(length, len(list(db.keys())))
     self.assertEqual(length, len(list(db.values())))
     if sys.version_info[0] == 2:
         # Check legacy methods for Python 2 as well:
         self.assertEqual(length, len(list(db.iteritems())))  # noqa: B301
         self.assertEqual(length, len(list(db.iterkeys())))  # noqa: B301
         self.assertEqual(length, len(list(db.itervalues())))  # noqa: B301
     for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
         self.assertEqual(k1, k2)
         self.assertEqual(r1.id, r2.id)
     for k in keys:
         del db[k]
     self.assertEqual(0, len(db))
     with self.assertRaises(KeyError):
         del db["non-existant-name"]
    def __init__(self, sample_id, chromosomes, starts, ends, genes, coverages,
                 gc=None, rmask=None, spread=None, weight=None, probes=None):
        # Build ``self.data`` from parallel column sequences.
        #
        # The five required columns are zipped row-wise into a table and
        # converted with ``numpy.asarray`` using a dtype that starts from
        # ``self._dtype``.  ``self._xtra`` records the names of the optional
        # columns in use (empty when all of gc/rmask/spread/weight/probes
        # are None).  ``sample_id`` is stored unchanged.
        #
        # NOTE(review): this block looks truncated -- the ``else:`` branch
        # header, the bodies of the five ``if ... is not None:`` tests and the
        # end of the second ``zip(`` call are not present, so it does not
        # parse as shown.  Restore them from the original source.
        dtype = list(self._dtype)
        if all(x is None for x in (gc, rmask, spread, weight, probes)):
            self._xtra = ()
            table = list(zip(chromosomes, starts, ends, genes, coverages))
            # NOTE(review): an ``else:`` presumably belongs here; as written
            # the lines below would overwrite the values assigned just above.
            # XXX There Must Be a Better Way -- use **kwargs?
            xtra_names = []
            xtra_cols = []
            # NOTE(review): each test below presumably records the column name
            # in xtra_names, the values in xtra_cols and extends dtype --
            # the bodies are missing, confirm.
            if gc is not None:
            if rmask is not None:
            if spread is not None:
            if weight is not None:
            if probes is not None:

            self._xtra = tuple(xtra_names)
            table = list(zip(chromosomes, starts, ends, genes, coverages,
        self.data = numpy.asarray(table, dtype)
        self.sample_id = sample_id
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.

    Lines starting with ``!`` and lines containing no tab are skipped.
    Each remaining line is split on tabs, the multi-valued columns are
    split on ``|`` into lists, and a dict keyed by ``GPI10FIELDS`` is
    yielded.
    """
    for inline in handle:
        # startswith() also copes with an empty string, which the
        # single-column check below then discards.
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[5] = inrec[5].split('|')  # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|')  # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.

    Yields one dict per data line, keyed by ``GPI10FIELDS``.  Comment
    lines (leading ``!``) and lines without any tab are ignored; the
    synonym and target-set columns become lists split on ``|``.
    """
    for inline in handle:
        if inline.startswith('!'):
            # Header / comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # No tab at all (e.g. a blank line): nothing to parse.
            continue
        inrec[5] = inrec[5].split('|') # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|') # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
def segments2freebayes(segments, sample_name, ploidy, purity,
                       is_reference_male, is_sample_female):
    """Convert a copy number array to a BED-like format.

    This is a generator.  Absolute copy numbers are obtained from
    ``cna_absolutes`` and rounded with ``_round_to_integer`` (halves
    round to zero when ``purity`` is None).  For every segment whose
    rounded copy number differs from ``ploidy`` it yields the tuple
    ``(chromosome, start, end, sample_name, ncopies)``.
    """
    absolutes = cna_absolutes(segments, ploidy, purity, is_reference_male,
                              is_sample_female)
    for row, abs_val in zip(segments, absolutes):
        ncopies = _round_to_integer(abs_val, half_is_zero=purity is None)
        # Ignore regions of neutral copy number
        if ncopies != ploidy:
            yield (
                row["chromosome"],  # reference sequence
                row["start"],  # start (0-indexed)
                row["end"],  # end
                sample_name,  # sample name
                ncopies)  # copy number
 def test_get_db_items(self):
     """Check list, keys, length etc."""
     db = self.db
     items = list(db.values())
     keys = list(db)
     length = len(items)
     self.assertEqual(length, len(db))
     self.assertEqual(length, len(list(db)))
     self.assertEqual(length, len(list(db.items())))
     self.assertEqual(length, len(list(db.keys())))
     self.assertEqual(length, len(list(db.values())))
     if sys.version_info[0] == 2:
         # Check legacy methods for Python 2 as well:
         self.assertEqual(length, len(list(db.iteritems())))  # noqa: B301
         self.assertEqual(length, len(list(db.iterkeys())))  # noqa: B301
         self.assertEqual(length, len(list(db.itervalues())))  # noqa: B301
     for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
         self.assertEqual(k1, k2)
         self.assertEqual(r1.id, r2.id)
     for k in keys:
         del db[k]
     self.assertEqual(0, len(db))
     with self.assertRaises(KeyError):
         del db["non-existant-name"]
def _get_coords(filename):
    """Return the coordinates of the first and last bracketed lines (PRIVATE).

    The file is scanned for lines starting with ``[``.  The first such
    line is the start line and the last later one is the end line; both
    are converted with ``_alb_line2coords`` and the results regrouped as
    ``[(start0, end0), (start1, end1)]``.

    If fewer than two bracketed lines are present, ``[(0, 0), (0, 0)]``
    is returned.
    """
    start_line = None
    end_line = None

    # The context manager closes the file once it has been scanned.
    with open(filename) as alb:
        for line in alb:
            if line.startswith("["):
                if not start_line:
                    start_line = line  # rstrip not needed
                else:
                    end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    return list(zip(*map(_alb_line2coords, [start_line, end_line])))  # returns [(start0, end0), (start1, end1)]
def _get_coords(filename):
    """Extract start and end coordinates from a file (PRIVATE).

    Only lines beginning with ``[`` are considered.  The first one is
    kept as the start line, the last of the following ones as the end
    line.  Both are passed to ``_alb_line2coords`` and the two results
    are transposed into ``[(start0, end0), (start1, end1)]``.

    When there is no second bracketed line the function returns
    ``[(0, 0), (0, 0)]``.
    """
    start_line = None
    end_line = None

    # Open via ``with`` so the handle is released even if reading fails.
    with open(filename) as alb:
        for line in alb:
            if not line.startswith("["):
                continue
            if not start_line:
                start_line = line  # rstrip not needed
            else:
                end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    return list(zip(*map(_alb_line2coords, [start_line, end_line])))  # returns [(start0, end0), (start1, end1)]
def _gpi11iterator(handle):
    """Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format.

    Lines starting with ``!`` and lines containing no tab are skipped.
    Each remaining line is split on tabs, the multi-valued columns are
    split on ``|`` into lists, and a dict keyed by ``GPI11FIELDS`` is
    yielded.
    """
    for inline in handle:
        # startswith() also copes with an empty string, which the
        # single-column check below then discards.
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|') # DB_Object_Name
        inrec[3] = inrec[3].split('|') # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|') # DB_Xref(s)
        inrec[8] = inrec[8].split('|') # Properties
        yield dict(zip(GPI11FIELDS, inrec))
def _gpi11iterator(handle):
    """Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format.

    Yields one dict per data line, keyed by ``GPI11FIELDS``.  Comment
    lines (leading ``!``) and lines without any tab are ignored; the
    name, synonym, cross-reference and properties columns become lists
    split on ``|``.
    """
    for inline in handle:
        if inline.startswith('!'):
            # Header / comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # No tab at all (e.g. a blank line): nothing to parse.
            continue
        inrec[2] = inrec[2].split('|')  # DB_Object_Name
        inrec[3] = inrec[3].split('|')  # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|')  # DB_Xref(s)
        inrec[8] = inrec[8].split('|')  # Properties
        yield dict(zip(GPI11FIELDS, inrec))
 def from_columns(cls, sample_id, **columns):
     """Create an instance from named column sequences.

     ``columns`` must contain 'chromosome', 'start', 'end', 'gene' and
     'coverage', and may additionally contain any of 'gc', 'rmask',
     'spread', 'weight' and 'probes'.  The instance is created as
     ``cls(sample_id, <list of optional names given>)`` and its ``data``
     is then replaced by an array built row-wise from the columns, in the
     field order and dtype of the freshly created ``data``.

     Raises ValueError if a required column is missing or an unknown
     column name is supplied.
     """
     required = {'chromosome', 'start', 'end', 'gene', 'coverage'}
     optional = {'gc', 'rmask', 'spread', 'weight', 'probes'}
     given = set(columns)

     absent = sorted(required - given)
     if absent:
         raise ValueError("Missing required column(s): " + " ".join(absent))

     extras = given - required
     unknown = sorted(extras - optional)
     if unknown:
         raise ValueError("Unrecognized column name(s): " + " ".join(unknown))

     cnarr = cls(sample_id, list(extras))
     # XXX better way?
     layout = cnarr.data.dtype
     rows = list(zip(*(columns[field] for field in layout.names)))
     cnarr.data = numpy.asarray(rows, dtype=layout)
     return cnarr
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly. Rather
    use the gpa_iterator function.

    Lines starting with ``!`` and lines containing no tab are skipped.
    Each remaining line is split on tabs, the multi-valued columns are
    split on ``|`` into lists, and a dict keyed by ``GPA11FIELDS`` is
    yielded.
    """
    for inline in handle:
        # startswith() also copes with an empty string, which the
        # single-column check below then discards.
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # Qualifier
        inrec[4] = inrec[4].split('|')  # DB:Reference(s)
        inrec[6] = inrec[6].split('|')  # With
        inrec[10] = inrec[10].split('|')  # Annotation extension
        yield dict(zip(GPA11FIELDS, inrec))