def test_limits(self):
    """Check line graphs."""
    # TODO - Fix GD so that the same min/max is used for all three lines?
    points = 1000
    scale = math.pi * 2.0 / points
    data1 = [math.sin(x * scale) for x in range(points)]
    data2 = [math.cos(x * scale) for x in range(points)]
    data3 = [2 * math.sin(2 * x * scale) for x in range(points)]

    gdd = Diagram('Test Diagram', circular=False,
                  y=0.01, yt=0.01, yb=0.01,
                  x=0.01, xl=0.01, xr=0.01)
    gdt_data = gdd.new_track(1, greytrack=False)
    gds_data = gdt_data.new_set("graph")
    for data_values, name, color in zip([data1, data2, data3],
                                        ["sin", "cos", "2sin2"],
                                        ["red", "green", "blue"]):
        data = list(zip(range(points), data_values))
        gds_data.new_graph(data, "", style="line",
                           color=color, altcolor=color, center=0)

    gdd.draw(format='linear', tracklines=False,
             pagesize=(15 * cm, 15 * cm),
             fragments=1, start=0, end=points)
    gdd.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")
    # Circular diagram
    gdd.draw(tracklines=False,
             pagesize=(15 * cm, 15 * cm),
             circular=True,  # Data designed to be periodic
             start=0, end=points,
             circle_core=0.5)
    gdd.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
def test_get_db_items(self):
    """Check list, keys, length etc."""
    db = self.db
    items = list(db.values())
    keys = list(db)
    length = len(items)
    self.assertEqual(length, len(db))
    self.assertEqual(length, len(list(db)))
    self.assertEqual(length, len(list(db.items())))
    self.assertEqual(length, len(list(db.keys())))
    self.assertEqual(length, len(list(db.values())))
    if sys.version_info[0] == 2:
        # Check legacy methods for Python 2 as well:
        self.assertEqual(length, len(list(db.iteritems())))
        self.assertEqual(length, len(list(db.iterkeys())))
        self.assertEqual(length, len(list(db.itervalues())))
    for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
        self.assertEqual(k1, k2)
        self.assertEqual(r1.id, r2.id)
    for k in keys:
        del db[k]
    self.assertEqual(0, len(db))
    try:
        del db["non-existant-name"]
        assert False, "Should have raised KeyError"
    except KeyError:
        pass
def common_ancestor(self, targets, *more_targets):
    """Most recent common ancestor (clade) of all the given targets.

    Edge cases:
     - If no target is given, returns self.root
     - If 1 target is given, returns the target
     - If any target is not found in this tree, raises a ValueError

    """
    # Materialize the combined target list so validation covers every
    # target, including those passed via *more_targets
    all_targets = list(_combine_args(targets, *more_targets))
    paths = [self.get_path(t) for t in all_targets]
    # Validation -- otherwise izip throws a spooky error below
    for p, t in zip(paths, all_targets):
        if p is None:
            raise ValueError("target %s is not in this tree" % repr(t))
    mrca = self.root
    for level in zip(*paths):
        ref = level[0]
        for other in level[1:]:
            if ref is not other:
                break
        else:
            mrca = ref
        if ref is not mrca:
            break
    return mrca
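# Usage sketch (hedged): in Biopython this method lives on the TreeMixin
# shared by Tree and Clade, and string targets are matched against clade
# names, so the MRCA of two terminals can be looked up by name.
#
#     from io import StringIO
#     from Bio import Phylo
#     tree = Phylo.read(StringIO("((A,B),(C,D));"), "newick")
#     mrca = tree.common_ancestor("A", "B")  # clade grouping A and B only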
def get_fasta_stats(probes, fa_fname):
    """Calculate GC and RepeatMasker content of each bin in the FASTA genome."""
    fa_coords = zip(probes.chromosome, probes.start, probes.end)
    logging.info("Calculating GC and RepeatMasker content in %s ...", fa_fname)
    gc_rm_vals = [calculate_gc_lo(subseq)
                  for subseq in ngfrills.fasta_extract_regions(fa_fname,
                                                               fa_coords)]
    gc_vals, rm_vals = zip(*gc_rm_vals)
    return np.asfarray(gc_vals), np.asfarray(rm_vals)
def _get_inter_coords(coords, strand=1):
    """From the given pairs of coordinates, returns a list of pairs
    covering the intervening ranges."""
    # adapted from Python's itertools guide
    # if strand is -1, adjust coords so the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
    else:
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))
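# Worked example for _get_inter_coords above: given plus-strand blocks
# (0, 10), (15, 20) and (25, 30), the intervening ranges are the gaps
# between consecutive blocks:
#
#     >>> _get_inter_coords([(0, 10), (15, 20), (25, 30)])
#     [(10, 15), (20, 25)]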
def match_ref_to_probes(ref_pset, probes):
    """Filter the reference probes to match the target or antitarget probe set."""
    ref_lookup = dict(zip(ref_pset.labels(), ref_pset))
    ref_matched_rows = [tuple(ref_lookup[label]) for label in probes.labels()]
    ref_matched = ref_pset.to_rows(ref_matched_rows)
    return ref_matched
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    Arguments:
    starts -- List of integers, start coordinates.
    blksizes -- List of integers, block sizes.
    seqlen -- Integer of total sequence length.
    strand -- Integer denoting sequence strand.

    """
    assert len(starts) == len(blksizes), \
        "Unequal start coordinates and block sizes list (%r vs %r)" \
        % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize
                for start, blksize in zip(starts, blksizes)]
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    :param starts: start coordinates
    :type starts: list [int]
    :param blksizes: block sizes
    :type blksizes: list [int]
    :param seqlen: sequence length
    :type seqlen: int
    :param strand: sequence strand
    :type strand: int, choice of -1, 0, or 1

    """
    assert len(starts) == len(blksizes), \
        "Unequal start coordinates and block sizes list (%r vs %r)" \
        % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize
                for start, blksize in zip(starts, blksizes)]
def loop(self, filename, format):
    original_records = list(SeqIO.parse(filename, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    db_name = "test_loop_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(original_records)
    self.assertEqual(count, len(original_records))
    server.commit()
    # Now read them back...
    biosql_records = [db.lookup(name=rec.name)
                      for rec in original_records]
    # And check they agree
    self.assertTrue(compare_records(original_records, biosql_records))
    # Now write to a handle...
    handle = StringIO()
    SeqIO.write(biosql_records, handle, "gb")
    # Now read them back...
    handle.seek(0)
    new_records = list(SeqIO.parse(handle, "gb"))
    # And check they still agree
    self.assertEqual(len(new_records), len(original_records))
    for old, new in zip(original_records, new_records):
        # TODO - remove this hack because we don't write these (yet):
        for key in ["comment", "references", "db_source"]:
            if key in old.annotations and key not in new.annotations:
                del old.annotations[key]
        self.assertTrue(compare_record(old, new))
    # Done
    handle.close()
    server.close()
def format_phylip(self, handle):
    """Write data in Phylip format to a given file-like object or handle.

    The output stream is the input distance matrix format used with Phylip
    programs (e.g. 'neighbor'). See:
    http://evolution.genetics.washington.edu/phylip/doc/neighbor.html

    :Parameters:
        handle : file or file-like object
            A writeable file handle or other object supporting the 'write'
            method, such as StringIO or sys.stdout. On Python 3, should be
            open in text mode.

    """
    handle.write(" {0}\n".format(len(self.names)))
    # Phylip needs space-separated, vertically aligned columns
    name_width = max(12, max(map(len, self.names)) + 1)
    value_fmts = ("{" + str(x) + ":.4f}"
                  for x in range(1, len(self.matrix) + 1))
    row_fmt = "{0:" + str(name_width) + "s}" + " ".join(value_fmts) + "\n"
    for i, (name, values) in enumerate(zip(self.names, self.matrix)):
        # Mirror the matrix values across the diagonal
        mirror_values = (self.matrix[j][i]
                         for j in range(i + 1, len(self.matrix)))
        fields = itertools.chain([name], values, mirror_values)
        handle.write(row_fmt.format(*fields))
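# Usage sketch (hedged): assuming this method is bound to Biopython's
# DistanceMatrix (Bio.Phylo.TreeConstruction), a lower-triangular matrix
# can be written out in the square, space-aligned Phylip layout:
#
#     from io import StringIO
#     from Bio.Phylo.TreeConstruction import DistanceMatrix
#     dm = DistanceMatrix(["Alpha", "Beta", "Gamma"],
#                         [[0], [0.5, 0], [0.25, 0.75, 0]])
#     out = StringIO()
#     dm.format_phylip(out)
#     # out.getvalue() starts " 3\nAlpha       0.0000 0.5000 0.2500\n..."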
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Returns the length of introns between fragments."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT} >>>> Target Intron 1 >>>> {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
        else:
            intron_len = 0

        inter_lens.append(intron_len)

    return inter_lens
def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
    """Create a randomized bifurcating tree given a list of taxa.

    :param taxa: Either an integer specifying the number of taxa to create
        (automatically named taxon#), or an iterable of taxon names, as
        strings.

    :returns: a tree of the same type as this class.
    """
    if isinstance(taxa, int):
        taxa = ['taxon%s' % (i + 1) for i in range(taxa)]
    elif hasattr(taxa, '__iter__'):
        taxa = list(taxa)
    else:
        raise TypeError("taxa argument must be integer (# taxa) or "
                        "iterable of taxon names.")
    rtree = cls()
    terminals = [rtree.root]
    while len(terminals) < len(taxa):
        newsplit = random.choice(terminals)
        newsplit.split(branch_length=branch_length)
        newterms = newsplit.clades
        if branch_stdev:
            # Add some noise to the branch lengths
            for nt in newterms:
                nt.branch_length = max(0,
                        random.gauss(branch_length, branch_stdev))
        terminals.remove(newsplit)
        terminals.extend(newterms)
    # Distribute taxon labels randomly
    random.shuffle(taxa)
    for node, name in zip(terminals, taxa):
        node.name = name
    return rtree
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates (PRIVATE).

    :param starts: start coordinates
    :type starts: list [int]
    :param blksizes: block sizes
    :type blksizes: list [int]
    :param seqlen: sequence length
    :type seqlen: int
    :param strand: sequence strand
    :type strand: int, choice of -1, 0, or 1

    """
    if len(starts) != len(blksizes):
        raise RuntimeError("Unequal start coordinates and block sizes list"
                           " (%r vs %r)" % (len(starts), len(blksizes)))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize
                for start, blksize in zip(starts, blksizes)]
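# Worked example for _reorient_starts above: two blocks starting at 0 and 50
# (sizes 10 and 20) on a 100-base sequence land, on the minus strand, at
# plus-strand starts 100 - 0 - 10 and 100 - 50 - 20:
#
#     >>> _reorient_starts([0, 50], [10, 20], 100, -1)
#     [90, 30]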
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Return the length of introns between fragments (PRIVATE)."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT} >>>> Target Intron 1 >>>> {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
        else:
            intron_len = 0

        inter_lens.append(intron_len)

    return inter_lens
def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
    """Create a randomized bifurcating tree given a list of taxa.

    :param taxa: Either an integer specifying the number of taxa to create
        (automatically named taxon#), or an iterable of taxon names, as
        strings.

    :returns: a tree of the same type as this class.
    """
    if isinstance(taxa, int):
        taxa = ['taxon%s' % (i + 1) for i in range(taxa)]
    elif hasattr(taxa, '__iter__'):
        taxa = list(taxa)
    else:
        raise TypeError("taxa argument must be integer (# taxa) or "
                        "iterable of taxon names.")
    rtree = cls()
    terminals = [rtree.root]
    while len(terminals) < len(taxa):
        newsplit = random.choice(terminals)
        newsplit.split(branch_length=branch_length)
        newterms = newsplit.clades
        if branch_stdev:
            # Add some noise to the branch lengths
            for nt in newterms:
                nt.branch_length = max(
                    0, random.gauss(branch_length, branch_stdev))
        terminals.remove(newsplit)
        terminals.extend(newterms)
    # Distribute taxon labels randomly
    random.shuffle(taxa)
    for node, name in zip(terminals, taxa):
        node.name = name
    return rtree
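# Usage sketch (hedged): as a classmethod on Biopython's Tree this can build
# a random bifurcating tree directly from a taxon count or a list of names.
#
#     from Bio.Phylo.BaseTree import Tree
#     tree = Tree.randomized(5)  # terminals named taxon1..taxon5
#     noisy = Tree.randomized(["A", "B", "C"], branch_stdev=0.1)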
def get_edge(chrom, tgt_start, tgt_end, insert_size):
    """Quantify the "edge effect" of the target tile and its neighbors.

    The result is proportional to the change in the target's coverage due to
    these edge effects, i.e. the expected loss of coverage near the target
    edges and, if there are close neighboring tiles, gain of coverage due
    to "spill over" reads from the neighbor tiles.

    (This is not the actual change in coverage. This is just a tribute.)
    """
    margin_start = tgt_start - insert_size
    margin_end = tgt_end + insert_size
    tile_starts = chrom_tile_starts[chrom]
    tile_ends = chrom_tile_ends[chrom]
    target_size = (tgt_end - tgt_start)
    # Calculate coverage loss at (both) tile edges
    loss = edge_loss(target_size, insert_size)
    # For each neighbor tile, calculate coverage gain to the target
    gaps_left = []
    gaps_right = []
    # Find the leftmost tile in the margin
    left_idx = max(0, bisect.bisect_left(tile_ends, margin_start) - 1)
    for (tile_start, tile_end) in zip(tile_starts[left_idx:],
                                      tile_ends[left_idx:]):
        if tile_end <= margin_start:
            # No overlap on the 5' end -- keep moving forward
            continue
        if tile_start >= margin_end:
            # No overlap on the 3' end -- we're done
            break
        if tile_start == tgt_start and tile_end == tgt_end:
            # The target itself
            continue
        # Tile is within margins
        if margin_start <= tile_end <= tgt_start:
            # Left neighbor
            gaps_left.append(tgt_start - tile_end)
        elif tgt_end <= tile_start <= margin_end:
            # Right neighbor
            gaps_right.append(tile_start - tgt_end)
        elif tile_start < tgt_start and tile_end >= tgt_start:
            # Overlap on left side -- treat as adjacent
            gaps_left.append(0)
        elif tile_start <= tgt_end and tile_end > tgt_end:
            # Overlap on right side -- treat as adjacent
            gaps_right.append(0)
        else:
            # DBG: This should probably never happen
            echo("Oddly positioned tile (%s:%d-%d) vs. target (%d-%d)"
                 % (chrom, tile_start, tile_end, tgt_start, tgt_end))
            continue
    gain = 0
    if gaps_left:
        gain += edge_gain(target_size, insert_size, min(gaps_left))
    if gaps_right:
        gain += edge_gain(target_size, insert_size, min(gaps_right))
    return gain - loss
def get_edge(chrom, tgt_start, tgt_end, insert_size):
    """Quantify the "edge effect" of the target tile and its neighbors.

    The result is proportional to the change in the target's coverage due to
    these edge effects, i.e. the expected loss of coverage near the target
    edges and, if there are close neighboring tiles, gain of coverage due
    to "spill over" reads from the neighbor tiles.

    (This is not the actual change in coverage. This is just a tribute.)
    """
    margin_start = tgt_start - insert_size
    margin_end = tgt_end + insert_size
    tile_starts = chrom_tile_starts[chrom]
    tile_ends = chrom_tile_ends[chrom]
    target_size = (tgt_end - tgt_start)
    # Calculate coverage loss at (both) tile edges
    loss = edge_loss(target_size, insert_size)
    # For each neighbor tile, calculate coverage gain to the target
    gaps_left = []
    gaps_right = []
    # Find the leftmost tile in the margin
    left_idx = max(0, bisect.bisect_left(tile_ends, margin_start) - 1)
    for (tile_start, tile_end) in zip(tile_starts[left_idx:],
                                      tile_ends[left_idx:]):
        if tile_end <= margin_start:
            # No overlap on the 5' end -- keep moving forward
            continue
        if tile_start >= margin_end:
            # No overlap on the 3' end -- we're done
            break
        if tile_start == tgt_start and tile_end == tgt_end:
            # The target itself
            continue
        # Tile is within margins
        if margin_start <= tile_end <= tgt_start:
            # Left neighbor
            gaps_left.append(tgt_start - tile_end)
        elif tgt_end <= tile_start <= margin_end:
            # Right neighbor
            gaps_right.append(tile_start - tgt_end)
        elif tile_start < tgt_start and tile_end >= tgt_start:
            # Overlap on left side -- treat as adjacent
            gaps_left.append(0)
        elif tile_start <= tgt_end and tile_end > tgt_end:
            # Overlap on right side -- treat as adjacent
            gaps_right.append(0)
        else:
            # DBG: This should probably never happen
            logging.info("Oddly positioned tile (%s:%d-%d) vs. target (%d-%d)",
                         chrom, tile_start, tile_end, tgt_start, tgt_end)
            continue
    gain = 0
    if gaps_left:
        gain += edge_gain(target_size, insert_size, min(gaps_left))
    if gaps_right:
        gain += edge_gain(target_size, insert_size, min(gaps_right))
    return gain - loss
def test_limits(self):
    """Check line graphs."""
    # TODO - Fix GD so that the same min/max is used for all three lines?
    points = 1000
    scale = math.pi * 2.0 / points
    data1 = [math.sin(x * scale) for x in range(points)]
    data2 = [math.cos(x * scale) for x in range(points)]
    data3 = [2 * math.sin(2 * x * scale) for x in range(points)]

    gdd = Diagram('Test Diagram', circular=False,
                  y=0.01, yt=0.01, yb=0.01,
                  x=0.01, xl=0.01, xr=0.01)
    gdt_data = gdd.new_track(1, greytrack=False)
    gds_data = gdt_data.new_set("graph")
    for data_values, name, color in zip([data1, data2, data3],
                                        ["sin", "cos", "2sin2"],
                                        ["red", "green", "blue"]):
        data = list(zip(range(points), data_values))
        gds_data.new_graph(data, "", style="line",
                           color=color, altcolor=color, center=0)

    gdd.draw(format='linear', tracklines=False,
             pagesize=(15 * cm, 15 * cm),
             fragments=1, start=0, end=points)
    gdd.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")
    # Circular diagram
    gdd.draw(tracklines=False,
             pagesize=(15 * cm, 15 * cm),
             circular=True,  # Data designed to be periodic
             start=0, end=points, circle_core=0.5)
    gdd.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
def interval_coverages_count(bed_fname, bam_fname):
    """Calculate log2 coverages in the BAM file at each interval."""
    bamfile = pysam.Samfile(bam_fname, 'rb')
    # Parse the BED lines and group them by chromosome
    # (efficient if records are already sorted by chromosome)
    for chrom, rows_iter in groupby(ngfrills.parse_regions(bed_fname),
                                    key=lambda r: r[0]):
        # Thunk and reshape this chromosome's intervals
        echo("Processing chromosome", chrom, "of",
             os.path.basename(bam_fname))
        _chroms, starts, ends, names = zip(*rows_iter)
        counts_depths = [region_depth_count(bamfile, chrom, s, e)
                         for s, e in zip(starts, ends)]
        for start, end, name, (count, depth) in zip(starts, ends, names,
                                                    counts_depths):
            yield [count,
                   (chrom, start, end, name,
                    math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
def __init__(self, line=None):
    self.chromat_file = ''
    self.phd_file = ''
    self.time = ''
    self.chem = ''
    self.dye = ''
    self.template = ''
    self.direction = ''
    if line:
        tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE',
                'TEMPLATE', 'DIRECTION']
        poss = [line.find(x) for x in tags]
        tagpos = dict(zip(poss, tags))
        if -1 in tagpos:
            del tagpos[-1]
        ps = sorted(tagpos)  # the keys
        for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
            setattr(self, tagpos[p1].lower(),
                    line[p1 + len(tagpos[p1]) + 1:p2].strip())
def __init__(self, line=None):
    self.chromat_file = ''
    self.phd_file = ''
    self.time = ''
    self.chem = ''
    self.dye = ''
    self.template = ''
    self.direction = ''
    if line:
        tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE',
                'TEMPLATE', 'DIRECTION']
        poss = [line.find(x) for x in tags]
        tagpos = dict(zip(poss, tags))
        if -1 in tagpos:
            del tagpos[-1]
        ps = sorted(tagpos)  # the keys
        for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
            setattr(self, tagpos[p1].lower(),
                    line[p1 + len(tagpos[p1]) + 1:p2].strip())
def cnv_on_genome(axis, probes, segments, pad, do_trend=False, y_min=None,
                  y_max=None):
    """Plot coverages and CBS calls for all chromosomes on one plot."""
    # Group probes by chromosome (to calculate plotting coordinates)
    if probes:
        chrom_probe_centers = {chrom: 0.5 * (rows['start'] + rows['end'])
                               for chrom, rows in probes.by_chromosome()}
        chrom_sizes = chromosome_sizes(probes)
    else:
        chrom_sizes = chromosome_sizes(segments)
    # Same for segment calls
    chrom_seg_coords = {chrom: zip(rows['log2'], rows['start'], rows['end'])
                        for chrom, rows in segments.by_chromosome()
                       } if segments else {}

    x_starts = plot_x_dividers(axis, chrom_sizes, pad)
    x = []
    seg_lines = []  # y-val, x-start, x-end
    for chrom, curr_offset in x_starts.items():
        if probes:
            x.extend(chrom_probe_centers[chrom] + curr_offset)
        if chrom in chrom_seg_coords:
            seg_lines.extend((c[0], c[1] + curr_offset, c[2] + curr_offset)
                             for c in chrom_seg_coords[chrom])

    # Configure axes etc.
    axis.axhline(color='k')
    axis.set_ylabel("Copy ratio (log2)")
    if not (y_min and y_max):
        if segments:
            # Auto-scale y-axis according to segment mean-coverage values
            seg_auto_vals = segments[(segments.chromosome != 'chr6') &
                                     (segments.chromosome != 'chrY')]['log2']
            if not y_min:
                y_min = min(seg_auto_vals.min() - .2, -1.5)
            if not y_max:
                y_max = max(seg_auto_vals.max() + .2, 1.5)
        else:
            if not y_min:
                y_min = -2.5
            if not y_max:
                y_max = 2.5
    axis.set_ylim(y_min, y_max)

    # Plot points
    if probes:
        axis.scatter(x, probes['log2'], color=POINT_COLOR, edgecolor='none',
                     alpha=0.2, marker='.')
        # Add a local trend line
        if do_trend:
            axis.plot(x, _smooth_genome_log2(probes, smoothing.smoothed, 150),
                      color=POINT_COLOR, linewidth=2, zorder=-1)
    # Plot segments
    for seg_line in seg_lines:
        y1, x1, x2 = seg_line
        axis.plot((x1, x2), (y1, y1),
                  color=SEG_COLOR, linewidth=3, solid_capstyle='round')
def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE)."""
    reg = ''
    for i in zip(*codons):
        if len(set(i)) == 1:
            reg += ''.join(set(i))
        else:
            reg += '[' + ''.join(set(i)) + ']'
    return reg
def test_get_db_items(self):
    """Check list, keys, length etc."""
    db = self.db
    items = list(db.values())
    keys = list(db)
    length = len(items)
    self.assertEqual(length, len(db))
    self.assertEqual(length, len(list(db.items())))
    self.assertEqual(length, len(list(db)))
    self.assertEqual(length, len(list(db.values())))
    for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
        self.assertEqual(k1, k2)
        self.assertEqual(r1.id, r2.id)
    for k in keys:
        del db[k]
    self.assertEqual(0, len(db))
    with self.assertRaises(KeyError):
        del db["non-existant-name"]
def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE)."""
    reg = ""
    for i in zip(*codons):
        if len(set(i)) == 1:
            reg += "".join(set(i))
        else:
            reg += "[" + "".join(set(i)) + "]"
    return reg
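# Worked example for _codons2re above: the three standard stop codons share
# a leading T, so the remaining positions collapse into character classes.
# The order inside the brackets comes from a set, so "[AG]" may just as
# well appear as "[GA]":
#
#     >>> _codons2re(["TAA", "TAG", "TGA"])
#     'T[AG][AG]'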
def by_chromosome(self):
    """Iterate over probes grouped by chromosome name."""
    uniq_chrom, idx = numpy.unique(self.chromosome, True)
    sort_idx = idx.argsort()
    idx = idx.take(sort_idx)
    uniq_chrom = uniq_chrom.take(sort_idx)
    idx2 = numpy.concatenate((idx[1:], [len(self.data)]))
    for chrom, start, end in zip(uniq_chrom, idx, idx2):
        subarr = self.to_array(self.data[start:end])
        yield chrom, subarr
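# A minimal standalone sketch of the numpy.unique indexing trick used above
# (the chromosome names are made up): return_index=True yields each
# chromosome's first row index, argsort restores the original file order,
# and consecutive index pairs delimit each per-chromosome slice.
import numpy

chroms = numpy.array(["chr2", "chr2", "chr10", "chr10", "chr10", "chrX"])
uniq_chrom, idx = numpy.unique(chroms, True)  # sorted uniques + first indices
sort_idx = idx.argsort()                      # back to original order
uniq_chrom, idx = uniq_chrom.take(sort_idx), idx.take(sort_idx)
idx2 = numpy.concatenate((idx[1:], [len(chroms)]))
for chrom, start, end in zip(uniq_chrom, idx, idx2):
    print(chrom, start, end)  # chr2 0 2 / chr10 2 5 / chrX 5 6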
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
    """Convert copy number segments to VCF records."""
    out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
    abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0,
                                         is_reference_male, is_sample_female)
    out_dframe["ncopies"] = np.rint(abs_dframe["absolute"])
    idx_losses = (out_dframe["ncopies"] < abs_dframe["expect"])

    starts = segments.start.copy()
    starts[starts == 0] = 1
    out_dframe["start"] = starts

    svlen = segments.end - segments.start
    svlen[idx_losses] *= -1
    out_dframe["svlen"] = svlen

    out_dframe["svtype"] = "DUP"
    out_dframe.loc[idx_losses, "svtype"] = "DEL"

    out_dframe["format"] = "GT:GQ:CN:CNQ"
    out_dframe.loc[idx_losses, "format"] = "GT:GQ"  # :CN:CNQ ?

    # Reformat this data to create INFO and genotype
    # TODO be more clever about this
    for out_row, abs_row in zip(out_dframe.itertuples(index=False),
                                abs_dframe.itertuples(index=False)):
        if (out_row.ncopies == abs_row.expect or
            # Survive files from buggy v0.7.1 (#53)
            not str(out_row.probes).isdigit()):
            # Skip regions of neutral copy number
            continue

        # or "CNV" for subclonal?
        if out_row.ncopies > abs_row.expect:
            genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes)
        elif out_row.ncopies < abs_row.expect:
            # TODO XXX handle non-diploid ploidies, haploid chroms
            if out_row.ncopies == 0:
                # Complete deletion, 0 copies
                gt = "1/1"
            else:
                # Single copy deletion
                gt = "0/1"
            genotype = "%s:%d" % (gt, out_row.probes)

        info = ";".join([
            "IMPRECISE",
            "SVTYPE=%s" % out_row.svtype,
            "END=%d" % out_row.end,
            "SVLEN=%d" % out_row.svlen,
            # CIPOS=-56,20;CIEND=-10,62
        ])
        yield (out_row.chromosome, out_row.start, '.', 'N',
               "<%s>" % out_row.svtype, '.', '.', info, out_row.format,
               genotype)
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
    """Convert copy number segments to VCF records."""
    out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
    abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0,
                                         is_reference_male, is_sample_female)
    out_dframe["ncopies"] = np.rint(abs_dframe["absolute"])
    idx_losses = (out_dframe["ncopies"] < abs_dframe["expect"])

    starts = segments.start.copy()
    starts[starts == 0] = 1
    out_dframe["start"] = starts

    svlen = segments.end - segments.start
    svlen[idx_losses] *= -1
    out_dframe["svlen"] = svlen

    out_dframe["svtype"] = "DUP"
    out_dframe.loc[idx_losses, "svtype"] = "DEL"

    out_dframe["format"] = "GT:GQ:CN:CNQ"
    out_dframe.loc[idx_losses, "format"] = "GT:GQ"  # :CN:CNQ ?

    # Reformat this data to create INFO and genotype
    # TODO be more clever about this
    for out_row, abs_row in zip(out_dframe.itertuples(index=False),
                                abs_dframe.itertuples(index=False)):
        if (out_row.ncopies == abs_row.expect or
            # Survive files from buggy v0.7.1 (#53)
            not str(out_row.probes).isdigit()):
            # Skip regions of neutral copy number
            continue

        # or "CNV" for subclonal?
        if out_row.ncopies > abs_row.expect:
            genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes)
        elif out_row.ncopies < abs_row.expect:
            # TODO XXX handle non-diploid ploidies, haploid chroms
            if out_row.ncopies == 0:
                # Complete deletion, 0 copies
                gt = "1/1"
            else:
                # Single copy deletion
                gt = "0/1"
            genotype = "%s:%d" % (gt, out_row.probes)

        info = ";".join(["IMPRECISE",
                         "SVTYPE=%s" % out_row.svtype,
                         "END=%d" % out_row.end,
                         "SVLEN=%d" % out_row.svlen,
                         # CIPOS=-56,20;CIEND=-10,62
                        ])
        yield (out_row.chromosome, out_row.start, '.', 'N',
               "<%s>" % out_row.svtype, '.', '.', info, out_row.format,
               genotype)
def test_get_db_items(self):
    """Check list, keys, length etc."""
    db = self.db
    items = list(db.values())
    keys = list(db)
    length = len(items)
    self.assertEqual(length, len(db))
    self.assertEqual(length, len(list(db.items())))
    self.assertEqual(length, len(list(db)))
    self.assertEqual(length, len(list(db.values())))
    for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
        self.assertEqual(k1, k2)
        self.assertEqual(r1.id, r2.id)
    for k in keys:
        del db[k]
    self.assertEqual(0, len(db))
    try:
        del db["non-existant-name"]
        assert False, "Should have raised KeyError"
    except KeyError:
        pass
def test_get_db_items(self):
    """Check list, keys, length etc."""
    db = self.db
    items = list(db.values())
    keys = list(db.keys())
    length = len(items)
    self.assertEqual(length, len(db))
    self.assertEqual(length, len(list(db.items())))
    self.assertEqual(length, len(list(db.keys())))
    self.assertEqual(length, len(list(db.values())))
    for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
        self.assertEqual(k1, k2)
        self.assertEqual(r1.id, r2.id)
    for k in keys:
        del db[k]
    self.assertEqual(0, len(db))
    try:
        del db["non-existant-name"]
        assert False, "Should have raised KeyError"
    except KeyError:
        pass
def interval_coverages_count(bed_fname, bam_fname):
    """Calculate log2 coverages in the BAM file at each interval."""
    bamfile = pysam.Samfile(bam_fname, 'rb')
    # Parse the BED lines and group them by chromosome
    # (efficient if records are already sorted by chromosome)
    for chrom, rows_iter in groupby(ngfrills.parse_regions(bed_fname),
                                    key=lambda r: r[0]):
        # Thunk and reshape this chromosome's intervals
        echo("Processing chromosome", chrom, "of",
             os.path.basename(bam_fname))
        _chroms, starts, ends, names = zip(*rows_iter)
        counts_depths = [region_depth_count(bamfile, chrom, s, e)
                         for s, e in zip(starts, ends)]
        for start, end, name, (count, depth) in zip(starts, ends, names,
                                                    counts_depths):
            yield [count,
                   (chrom, start, end, name,
                    math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
def _gaf10iterator(handle):
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')  # Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
def __init__(self, line=None):
    """Initialize the class."""
    self.chromat_file = ""
    self.phd_file = ""
    self.time = ""
    self.chem = ""
    self.dye = ""
    self.template = ""
    self.direction = ""
    if line:
        tags = ["CHROMAT_FILE", "PHD_FILE", "TIME", "CHEM", "DYE",
                "TEMPLATE", "DIRECTION"]
        poss = [line.find(x) for x in tags]
        tagpos = dict(zip(poss, tags))
        if -1 in tagpos:
            del tagpos[-1]
        ps = sorted(tagpos)  # the keys
        for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
            setattr(self, tagpos[p1].lower(),
                    line[p1 + len(tagpos[p1]) + 1:p2].strip())
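# Usage sketch (hedged): this initializer parses a DS line from an ACE file
# by locating each known tag and slicing out the text up to the next tag.
# "PhdInfo" is a hypothetical name for the class this __init__ belongs to.
#
#     line = "CHROMAT_FILE: 425_103_f.scf PHD_FILE: 425_103_f.phd.1 CHEM: term"
#     info = PhdInfo(line)
#     info.chromat_file  # '425_103_f.scf'
#     info.phd_file      # '425_103_f.phd.1'
#     info.chem          # 'term'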
def _gaf20iterator(handle):
    for inline in handle:
        if inline[0] == "!":
            continue
        inrec = inline.rstrip("\n").split("\t")
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split("|")  # Qualifier
        inrec[5] = inrec[5].split("|")  # DB:reference(s)
        inrec[7] = inrec[7].split("|")  # With || From
        inrec[10] = inrec[10].split("|")  # Synonym
        inrec[12] = inrec[12].split("|")  # Taxon
        yield dict(zip(GAF20FIELDS, inrec))
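# Usage sketch (hedged): the handle can be any iterable of lines. Comment
# lines start with '!' and records are tab-separated GAF 2.0 columns; the
# identifiers below are made up for illustration, and the exact key names
# depend on Bio.UniProt.GOA's GAF20FIELDS.
#
#     fields = ["UniProtKB", "P12345", "GENE1", "", "GO:0005737", "PMID:1",
#               "IDA", "", "C", "", "", "protein", "taxon:9606",
#               "20240101", "UniProt", "", ""]
#     for rec in _gaf20iterator(["!gaf-version: 2.0\n",
#                                "\t".join(fields) + "\n"]):
#         print(rec["DB_Object_ID"])  # P12345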
def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another (PRIVATE)."""
    a, b = '', ''
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == ' ':
            a += char1
            b += char2
        else:
            a += char2
            b += char1

    return a, b
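# Worked example for _flip_codons above: characters swap between the two
# lines except where the codon line holds a space, which passes through
# unswapped:
#
#     >>> _flip_codons("AT GC", "12345")
#     ('12 45', 'AT3GC')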
def segments2freebayes(segments, sample_name, ploidy, purity,
                       is_reference_male, is_sample_female):
    """Convert a copy number array to a BED-like format."""
    absolutes = cna_absolutes(segments, ploidy, purity, is_reference_male,
                              is_sample_female)
    for row, abs_val in zip(segments, absolutes):
        ncopies = _round_to_integer(abs_val, half_is_zero=purity is None)
        # Ignore regions of neutral copy number
        if ncopies != ploidy:
            yield (row["chromosome"],  # reference sequence
                   row["start"],       # start (0-indexed)
                   row["end"],         # end
                   sample_name,        # sample name
                   ncopies)            # copy number
def merge_rows(rows):
    """Combine equivalent rows of coverage data across multiple samples.

    Check that probe info matches across all samples, then merge the log2
    coverage values.

    Input: a list of individual rows corresponding to the same probes from
    different coverage files.
    Output: a list starting with the single common Probe object, followed by
    the log2 coverage values from each sample, in order.
    """
    probe_infos, coverages = zip(*map(row_to_probe_coverage, rows))
    probe_info = core.check_unique(probe_infos, "probe Name")
    combined_row = [probe_info] + list(coverages)
    return combined_row
def test_get_db_items(self):
    """Check list, keys, length etc."""
    db = self.db
    items = list(db.values())
    keys = list(db)
    length = len(items)
    self.assertEqual(length, len(db))
    self.assertEqual(length, len(list(db)))
    self.assertEqual(length, len(list(db.items())))
    self.assertEqual(length, len(list(db.keys())))
    self.assertEqual(length, len(list(db.values())))
    if sys.version_info[0] == 2:
        # Check legacy methods for Python 2 as well:
        self.assertEqual(length, len(list(db.iteritems())))  # noqa: B301
        self.assertEqual(length, len(list(db.iterkeys())))  # noqa: B301
        self.assertEqual(length, len(list(db.itervalues())))  # noqa: B301
    for (k1, r1), (k2, r2) in zip(zip(keys, items), db.items()):
        self.assertEqual(k1, k2)
        self.assertEqual(r1.id, r2.id)
    for k in keys:
        del db[k]
    self.assertEqual(0, len(db))
    with self.assertRaises(KeyError):
        del db["non-existant-name"]
def __init__(self, sample_id, chromosomes, starts, ends, genes, coverages,
             gc=None, rmask=None, spread=None, weight=None, probes=None):
    dtype = list(self._dtype)
    if all(x is None for x in (gc, rmask, spread, weight, probes)):
        self._xtra = ()
        table = list(zip(chromosomes, starts, ends, genes, coverages))
    else:
        # XXX There Must Be a Better Way -- use **kwargs?
        xtra_names = []
        xtra_cols = []
        if gc is not None:
            xtra_names.append('gc')
            xtra_cols.append(gc)
            dtype.append(self._dtype_gc)
        if rmask is not None:
            xtra_names.append('rmask')
            xtra_cols.append(rmask)
            dtype.append(self._dtype_rmask)
        if spread is not None:
            xtra_names.append('spread')
            xtra_cols.append(spread)
            dtype.append(self._dtype_spread)
        if weight is not None:
            xtra_names.append('weight')
            xtra_cols.append(weight)
            dtype.append(self._dtype_weight)
        if probes is not None:
            xtra_names.append('probes')
            xtra_cols.append(probes)
            dtype.append(self._dtype_probes)
        self._xtra = tuple(xtra_names)
        table = list(zip(chromosomes, starts, ends, genes, coverages,
                         *xtra_cols))
    self.data = numpy.asarray(table, dtype)
    self.sample_id = sample_id
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[5] = inrec[5].split('|')  # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|')  # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
def segments2freebayes(segments, sample_name, ploidy, purity,
                       is_reference_male, is_sample_female):
    """Convert a copy number array to a BED-like format."""
    absolutes = cna_absolutes(segments, ploidy, purity, is_reference_male,
                              is_sample_female)
    for row, abs_val in zip(segments, absolutes):
        ncopies = _round_to_integer(abs_val, half_is_zero=purity is None)
        # Ignore regions of neutral copy number
        if ncopies != ploidy:
            yield (row["chromosome"],  # reference sequence
                   row["start"],       # start (0-indexed)
                   row["end"],         # end
                   sample_name,        # sample name
                   ncopies)            # copy number
def _get_coords(filename):
    start_line = None
    end_line = None

    # Use a context manager so the file handle is always closed
    with open(filename) as alb:
        for line in alb:
            if line.startswith("["):
                if not start_line:
                    start_line = line  # rstrip not needed
                else:
                    end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    return list(zip(*map(_alb_line2coords, [start_line, end_line])))
    # returns [(start0, end0), (start1, end1)]
def _gpi11iterator(handle):
    """Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # DB_Object_Name
        inrec[3] = inrec[3].split('|')  # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|')  # DB_Xref(s)
        inrec[8] = inrec[8].split('|')  # Properties
        yield dict(zip(GPI11FIELDS, inrec))
def from_columns(cls, sample_id, **columns):
    these_cols = set(columns)
    required_cols = {'chromosome', 'start', 'end', 'gene', 'coverage'}
    optional_cols = {'gc', 'rmask', 'spread', 'weight', 'probes'}
    missing_cols = required_cols - these_cols
    if missing_cols:
        raise ValueError("Missing required column(s): "
                         + " ".join(sorted(missing_cols)))
    extra_cols = these_cols - required_cols
    bogus_cols = extra_cols - optional_cols
    if bogus_cols:
        raise ValueError("Unrecognized column name(s): "
                         + " ".join(sorted(bogus_cols)))
    cnarr = cls(sample_id, list(extra_cols))
    # XXX better way?
    table = list(zip(*[columns[key] for key in cnarr.data.dtype.names]))
    cnarr.data = numpy.asarray(table, dtype=cnarr.data.dtype)
    return cnarr
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly. Rather,
    use the gpa_iterator function.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # Qualifier
        inrec[4] = inrec[4].split('|')  # DB:Reference(s)
        inrec[6] = inrec[6].split('|')  # With
        inrec[10] = inrec[10].split('|')  # Annotation extension
        yield dict(zip(GPA11FIELDS, inrec))