def _calculate_totals(table):
    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                set_in(table, (name, sample, library),
                       _calculate_totals_in(contigs))

            set_in(table, (name, sample, "*"),
                   _calculate_totals_in(libraries))
            # NB: 'library' still holds the last value from the loop above
            set_in(table, (name, sample, "*", "*", "Size"),
                   get_in(table, (name, sample, library, "*", "Size")))

        set_in(table, (name, "*", "*"),
               _calculate_totals_in(table))
        # NB: likewise, 'sample' holds the last value from the loop above
        set_in(table, (name, "*", "*", "*", "Size"),
               get_in(table, (name, sample, "*", "*", "Size")))
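Every snippet in this section leans on two small nested-dict helpers, get_in and set_in, imported from the project's utils module. Their actual implementation is not shown here; the following is a minimal sketch, assuming only the behaviour visible in the call sites and in the unit tests at the end of this section (keys may be any iterable, a missing key yields a default, and set_in creates intermediate dicts on demand):

def get_in(dictionary, keys, default=None):
    # Walk down one level per key; return 'default' as soon as a key is
    # missing. 'keys' may be any iterable, including an iterator.
    keys = list(keys)
    for key in keys[:-1]:
        try:
            dictionary = dictionary[key]
        except KeyError:
            return default
    return dictionary.get(keys[-1], default)


def set_in(dictionary, keys, value):
    # Create intermediate dicts as needed, then assign 'value' to the
    # innermost key.
    keys = list(keys)
    for key in keys[:-1]:
        dictionary = dictionary.setdefault(key, {})
    dictionary[keys[-1]] = value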
def _initialize_tables(cls, target_name, intervals, readgroups):
    subtables = {}
    # NB: the loop variable shadows the 'intervals' argument; iteritems()
    # is evaluated before the name is rebound
    for (name, intervals) in intervals.iteritems():
        size = sum((end - start) for (_, start, end) in intervals)
        subtables[name] = {"SE": 0, "PE_1": 0, "PE_2": 0, "Collapsed": 0,
                           "Hits": 0, "M": 0, "I": 0, "D": 0, "Size": size}

    tables, mapping = {}, {}
    for rg in readgroups.itervalues():
        subtbl_copy = get_in(tables, (target_name, rg["SM"], rg["LB"]), None)
        if not subtbl_copy:
            subtbl_copy = copy.deepcopy(subtables)
            set_in(tables, (target_name, rg["SM"], rg["LB"]), subtbl_copy)
        mapping[rg["ID"]] = subtbl_copy

    return tables, mapping
def create_or_get_subtable(table, subtable_key, size):
    subtable = get_in(table, subtable_key)
    if subtable is None:
        subtable = dict(READGROUP_TEMPLATE)
        subtable["Size"] = size
        set_in(table, subtable_key, subtable)
    return subtable
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()
    keys = (properties["gene_biotype"],
            properties["gene_id"],
            properties["transcript_id"],
            int(properties["exon_number"]),
            gtf.feature)

    record = {"contig": contig_prefix + gtf.contig,
              "start": gtf.start,
              # In pysam, 'end' equals the past-the-end position
              "end": gtf.end - 1,
              "strand": gtf.strand,
              "feature": gtf.feature,
              "transcript": properties["transcript_id"]}

    if record["contig"] in scaffolds:
        contig = scaffolds[record["contig"]]
        record["contig"] = contig["chrom"]
        record["start"] += int(contig["chromStart"])
        record["end"] += int(contig["chromStart"])

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
def _read_coverage_tables(cls, key, filenames):
    hits = nts = 0
    for filename in filenames:
        subtable = {}
        read_coverage_table(subtable, filename)
        for contigtable in get_in(subtable, key).itervalues():
            hits += contigtable["Hits"]
            nts += contigtable["M"]
    return hits, nts
def _collect_clade_from(self, cache, p_node, c_node):
    c_clade = get_in(cache, (p_node, c_node), set())
    if not c_clade:
        if self.is_leaf(c_node):
            c_clade.add(c_node)

        for n_node in self.connections[c_node]:
            if n_node != p_node:
                c_clade.update(self._collect_clade_from(cache,
                                                        c_node, n_node))
        set_in(cache, (p_node, c_node), frozenset(c_clade))
    return c_clade
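To make the edge-keyed memoization above concrete, here is a self-contained toy run. The adjacency dict and the degree-based leaf test are stand-ins invented for illustration (the real class tracks self.connections and self.is_leaf); the cache maps a directed edge (p_node, c_node) to the frozen set of leaves reachable through it:

def collect_clade_from(connections, cache, p_node, c_node):
    # Leaves reachable by entering 'c_node' from 'p_node', memoized per edge.
    c_clade = get_in(cache, (p_node, c_node), set())
    if not c_clade:
        neighbors = connections[c_node] - set([p_node])
        if not neighbors:  # only the parent is connected, so this is a leaf
            c_clade.add(c_node)
        for n_node in neighbors:
            c_clade.update(collect_clade_from(connections, cache,
                                              c_node, n_node))
        set_in(cache, (p_node, c_node), frozenset(c_clade))
    return c_clade


# Unrooted tree: A - x - y - B, with C also attached to x
connections = {"A": set(["x"]), "B": set(["y"]), "C": set(["x"]),
               "x": set(["A", "C", "y"]), "y": set(["x", "B"])}
cache = {}
print(collect_clade_from(connections, cache, "y", "x"))  # leaves A and C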
def _read_raw_bam_stats(self, table):
    for ((genome, target, sample, library), filenames) \
            in self._in_raw_bams.iteritems():
        subtable = {}
        for filename in filenames:
            read_coverage_table(subtable, filename)

        key = (target, sample, library)
        hits = 0
        for contigtable in get_in(subtable, key).itervalues():
            hits += contigtable["Hits"]

        value = (
            hits,
            "# Total number of hits (prior to PCR duplicate filtering)")
        set_in(table, (target, sample, library, genome,
                       "hits_raw(%s)" % genome), value)
def _read_table(self, key, table, filename):
    all_key = "all" if self._intervals else "genome"
    with open(filename) as handle:
        for line in handle:
            fields = line.split("\t")
            # 'all' is generated by coverageBed, as a catchall group
            if (fields[0] == all_key) and (len(fields) == 5):
                name = None
            elif self._max_contigs_reached:
                continue
            elif len(fields) > (3 + 4):
                # Probably a BED6 file, 4 columns are from bedTools
                name = fields[3]
            else:
                name = fields[0]

            ckey = key + (name,)
            if not get_in(table, ckey, None):
                set_in(table, ckey, [0] * (_MAX_DEPTH + 1))

            depth = min(_MAX_DEPTH, int(fields[-4]))
            get_in(table, ckey)[depth] += int(fields[-3])
def convert_reads(config, destination, record, sink_cache):
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        key = (name, reads_type)
        if not get_in(sink_cache, key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, key,
                   ReadSink.open(config.destination, filename))
        return key

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Single")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key,
                       PEReadSink.open(config.destination, destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                def _keep_record(record):
                    return (record.qual >= config.min_quality) and \
                        (len(record.seq) >= config.min_length)

                sink.write_records(record for record in handle
                                   if _keep_record(record))
def _read_lib_bam_stats(self, table):
    for ((genome, target, sample, library), filenames) \
            in self._in_lib_bams.iteritems():
        subtable = {}
        for filename in filenames:
            read_coverage_table(subtable, filename)

        key = (target, sample, library)
        hits = nts = 0
        for contigtable in get_in(subtable, key).itervalues():
            hits += contigtable["Hits"]
            nts += contigtable["M"]

        value = (
            hits,
            "# Total number of hits (excluding any PCR duplicates)")
        set_in(table, (target, sample, library, genome,
                       "hits_unique(%s)" % genome), value)
        set_in(table, (target, sample, library, genome,
                       "hits_unique_nts(%s)" % genome), (nts, None))
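Taken together, _read_raw_bam_stats and _read_lib_bam_stats populate a single summary table keyed by target, sample, library, and genome, with each statistic stored as a (value, comment) tuple; the '#'-prefixed comment strings presumably end up as annotations when the summary is written out. The following illustration of that layout is a sketch; the target/sample/library names and the counts are invented:

table = {}
set_in(table, ("target1", "sample1", "lib1", "hg19", "hits_raw(hg19)"),
       (1000, "# Total number of hits (prior to PCR duplicate filtering)"))
set_in(table, ("target1", "sample1", "lib1", "hg19", "hits_unique(hg19)"),
       (800, "# Total number of hits (excluding any PCR duplicates)"))
set_in(table, ("target1", "sample1", "lib1", "hg19", "hits_unique_nts(hg19)"),
       (72000, None))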
def read_table(table, filename):
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key, {"Size": int(record["Size"])})
            assert int(subtable["Size"]) == int(record["Size"])

            for field in ("Hits", "SE", "PE_1", "PE_2",
                          "Collapsed", "M", "I", "D"):
                subtable[field] = subtable.get(field, 0) \
                    + int(record.get(field, 0))

            set_in(table, key, subtable)
def read_table(table, filename):
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])
            for field in READGROUP_TEMPLATE:
                if field != "Size":
                    subtable[field] += int(record.get(field, 0))
def _read_coverage_tables(cls, key, filenames):
    hits = nts = 0
    for filename in filenames:
        subtable = {}
        read_coverage_table(subtable, filename)
        contigtables = get_in(subtable, key)

        if contigtables is None:
            raise NodeError("Error reading table %r; row not found:"
                            "\n %s ...\n\nIf files have been renamed "
                            "during the run, then please remove this file "
                            "so that it may be re-generated.\nHowever, "
                            "note that read-group tags in the BAM files "
                            "may not be correct!" % (filename, " ".join(key)))

        for contigtable in contigtables.itervalues():
            hits += contigtable["Hits"]
            nts += contigtable["M"]
    return hits, nts
def test_get_in__get_default_one_keyword():
    assert_equal(utils.get_in({1: 2}, [2]), None)


def test_get_in__get_value_three_keywords():
    assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 3]), 4)


def test_get_in__get_value_two_keywords():
    assert_equal(utils.get_in({1: {2: 3}}, [1, 2]), 3)


def test_get_in__get_value_one_keyword():
    assert_equal(utils.get_in({1: 2}, [1]), 2)


def test_get_in__get_default_three_keywords_fail_at_third_with_default():
    assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 4], "other"), "other")
def test_get_in__get_default_one_keyword_with_default():
    assert_equal(utils.get_in({1: 2}, [2], "other"), "other")


def test_get_in__get_default_three_keywords_fail_at_third():
    assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 4]), None)
def test_get_in__iterator_keywords():
    assert_equal(utils.get_in({1: {2: {3: 4}}}, iter([1, 2, 3])), 4)
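Finally, a few REPL-style examples of the behaviour the tests above pin down, assuming the sketch of get_in/set_in given near the top of this section:

>>> table = {}
>>> set_in(table, ("Target", "Sample", "Library"), {"Hits": 1})
>>> get_in(table, ("Target", "Sample", "Library"))
{'Hits': 1}
>>> get_in(table, ("Target", "Sample", "Other"), "missing")
'missing'
>>> get_in(table, iter(["Target", "Sample"]))
{'Library': {'Hits': 1}}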