def create_or_get_subtable(table, subtable_key, size):
    """Return the subtable stored at `subtable_key`, creating it if missing.

    When `get_in` yields None for `subtable_key`, a fresh copy of
    READGROUP_TEMPLATE is made, its "Size" entry is set to `size`, and the
    copy is stored in `table` via `set_in` before being returned. An
    existing entry is returned as-is; `size` is not consulted in that case.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    created = dict(READGROUP_TEMPLATE)
    created["Size"] = size
    set_in(table, subtable_key, created)
    return created
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Add one GTF row to the nested `table`.

    The row is stored under the key path
    (gene type, gene_id, transcript_id, exon_number, feature). The gene type
    is taken from "gene_biotype", falling back to "gene_type" and finally to
    the literal "unknown_genetype".

    If the prefixed contig name is found in `scaffolds`, the record is moved
    onto the scaffold's "chrom" and both coordinates are shifted by its
    "chromStart".

    Raises AssertionError (carrying the key path) if a truthy entry already
    exists at that key path.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()

    biotype = properties.get("gene_biotype")
    gene_type = (properties.get("gene_type", "unknown_genetype")
                 if biotype is None else biotype)

    keys = (
        gene_type,
        properties["gene_id"],
        properties["transcript_id"],
        int(properties["exon_number"]),
        gtf.feature,
    )

    record = {
        "contig": contig_prefix + gtf.contig,
        "start": gtf.start,
        # In pysam, 'end' equals the past-the-end position
        "end": gtf.end - 1,
        "strand": gtf.strand,
        "feature": gtf.feature,
        "transcript": properties["transcript_id"],
    }

    contig_name = record["contig"]
    if contig_name in scaffolds:
        scaffold = scaffolds[contig_name]
        record["contig"] = scaffold["chrom"]
        # Both coordinates are shifted by the same scaffold offset
        offset = int(scaffold["chromStart"])
        record["start"] += offset
        record["end"] += offset

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Register a single GTF entry in `table`.

    The entry is keyed by gene type, gene ID, transcript ID, exon number
    (converted to int) and feature name, in that order. The stored record
    holds the contig (with `contig_prefix` prepended), start, inclusive end,
    strand, feature and transcript ID.

    Contigs listed in `scaffolds` are translated: the contig is replaced by
    the scaffold's "chrom" value and "chromStart" is added to start and end.

    An AssertionError is raised if the key path already holds a truthy value.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()

    gene_type = properties.get("gene_biotype")
    if gene_type is None:
        gene_type = properties.get("gene_type", "unknown_genetype")

    gene_id = properties["gene_id"]
    transcript_id = properties["transcript_id"]
    exon_number = int(properties["exon_number"])
    keys = (gene_type, gene_id, transcript_id, exon_number, gtf.feature)

    record = dict(
        contig=contig_prefix + gtf.contig,
        start=gtf.start,
        # In pysam, 'end' equals the past-the-end position
        end=gtf.end - 1,
        strand=gtf.strand,
        feature=gtf.feature,
        transcript=properties["transcript_id"],
    )

    if record["contig"] in scaffolds:
        placement = scaffolds[record["contig"]]
        record["contig"] = placement["chrom"]
        # Shift both coordinates by the scaffold's position on the chromosome
        shift = int(placement["chromStart"])
        record["start"] = record["start"] + shift
        record["end"] = record["end"] + shift

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
def create_or_get_subtable(table, subtable_key, size):
    """Return the ReadGroup stored at `subtable_key`, creating it if missing.

    When `get_in` yields None for `subtable_key`, a new ReadGroup is
    created, its `Size` attribute is set to `size`, and it is stored in
    `table` via `set_in` before being returned. An existing entry is
    returned as-is; `size` is not consulted in that case.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    created = ReadGroup()
    created.Size = size
    set_in(table, subtable_key, created)
    return created
def create_or_get_subtable(table, subtable_key, size):
    """Fetch the subtable at `subtable_key`, inserting a new one on a miss.

    A miss is a None result from `get_in`. The new subtable is a ReadGroup
    whose `Size` attribute is `size`; it is written into `table` with
    `set_in`. On a hit the stored object is returned and `size` is unused.
    """
    subtable = get_in(table, subtable_key)
    if subtable is not None:
        return subtable

    # Nothing stored yet; build, register and hand back a new read group
    new_group = ReadGroup()
    new_group.Size = size
    set_in(table, subtable_key, new_group)
    return new_group
def _collect_clade_from(self, cache, p_node, c_node):
    """Collect the leaf nodes reachable from `c_node` without passing `p_node`.

    Results are memoised in `cache` under the key (p_node, c_node). On a
    cache hit with a non-empty value, the cached value is returned. Otherwise
    the clade is built: `c_node` itself is added if `self.is_leaf` says it is
    a leaf, and every neighbour in `self.connections[c_node]` other than
    `p_node` contributes its own clade recursively.

    The result is stored in the cache as a frozenset; the freshly built
    (mutable) set is what is returned on that first call.
    """
    cache_key = (p_node, c_node)
    clade = get_in(cache, cache_key, set())
    if clade:
        return clade

    if self.is_leaf(c_node):
        clade.add(c_node)

    for neighbour in self.connections[c_node]:
        # Never walk back along the edge we arrived by
        if neighbour != p_node:
            sub_clade = self._collect_clade_from(cache, c_node, neighbour)
            clade.update(sub_clade)

    set_in(cache, cache_key, frozenset(clade))
    return clade
def convert_reads(config, destination, record, sink_cache):
    """Write the filtered reads from the BAM files of `record` to read sinks.

    Sinks are opened on demand and cached in `sink_cache` under the key
    (source name, reads type), where the source name is the "PU_src" tag of
    `record`. Output is placed in a sub-folder of `destination` named after
    that source name, relative to `config.destination`.

    "Paired" and "Processed" read types are written to a PEReadSink cached
    under (name, "Paired"), and a "Singleton" single-end sink is opened
    alongside it. All other read types get their own single-end ReadSink.

    Only records for which `qual >= config.min_quality` and
    `len(seq) >= config.min_length` are passed on to the sink.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        # Open the single-end sink for this read type unless already cached
        sink_key = (name, reads_type)
        if not get_in(sink_cache, sink_key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            se_sink = ReadSink.open(config.destination, filename)
            set_in(sink_cache, sink_key, se_sink)
        return sink_key

    def _keep_record(read):
        # Both the quality and the length threshold must be met
        if read.qual >= config.min_quality:
            return len(read.seq) >= config.min_length
        return False

    for (reads_type, bam_files) in record.bams.iteritems():
        if reads_type not in ("Paired", "Processed"):
            key = _open_se_sink(reads_type)
        else:
            # Processed reads are pre-aligned BAMs which have been cleaned up

            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                pe_sink = PEReadSink.open(config.destination, destination)
                set_in(sink_cache, key, pe_sink)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                sink.write_records(read for read in handle
                                   if _keep_record(read))
def convert_reads(config, destination, record, sink_cache):
    """Stream reads from each BAM file of `record` into cached read sinks.

    The "PU_src" tag of `record` names both the output sub-folder (below
    `destination`, created under `config.destination`) and the first part of
    every `sink_cache` key. Sinks are opened at most once per key.

    Read types "Paired" and "Processed" share the PEReadSink stored under
    (name, "Paired"); a "Singleton" ReadSink is opened for them as well.
    Any other read type is written to a ReadSink keyed by that type.

    Records are written only if `qual >= config.min_quality` and
    `len(seq) >= config.min_length`.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    output_folder = os.path.join(config.destination, destination)
    make_dirs(output_folder)

    def _open_se_sink(reads_type):
        # Returns the cache key; the sink itself is created on first use
        cache_key = (name, reads_type)
        if not get_in(sink_cache, cache_key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, cache_key,
                   ReadSink.open(config.destination, filename))
        return cache_key

    def _passes_filters(candidate):
        # Quality is checked first; length only if the quality is sufficient
        return (candidate.qual >= config.min_quality) and \
            (len(candidate.seq) >= config.min_length)

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        is_paired = reads_type in ("Paired", "Processed")
        if is_paired:
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key,
                       PEReadSink.open(config.destination, destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                kept = (candidate for candidate in handle
                        if _passes_filters(candidate))
                sink.write_records(kept)
def read_table(table, filename):
    """Accumulate the rows of a padded table file into the nested `table`.

    Each row is keyed by its (Name, Sample, Library, Contig) columns; rows
    in which any of those equals "*" are skipped. The first row seen for a
    key creates a copy of READGROUP_TEMPLATE with "Size" taken from the row.
    Every row then adds its integer values to all template fields except
    "Size"; fields missing from the row count as 0.

    Raises AssertionError if a row's "Size" disagrees with the size already
    stored for its key.
    """
    with open(filename) as table_file:
        for row in parse_padded_table(table_file):
            key = (row["Name"], row["Sample"], row["Library"], row["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(row["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(row["Size"])
            for field in READGROUP_TEMPLATE:
                if field == "Size":
                    # Size is a fixed property of the contig, not a counter
                    continue
                subtable[field] += int(row.get(field, 0))
def read_table(table, filename):
    """Accumulate the rows of a padded table file into the nested `table`.

    Each row is keyed by its (Name, Sample, Library, Contig) columns; rows
    in which any of those equals "*" are skipped. The first row seen for a
    key creates a ReadGroup whose `Size` is taken from the row. Every row
    then adds its integer values to each field named in
    `ReadGroup.__slots__` except "Size"; fields missing from the row count
    as 0.

    Raises AssertionError if a row's "Size" disagrees with the size already
    stored for its key.
    """
    with open(filename) as table_file:
        for row in parse_padded_table(table_file):
            key = (row["Name"], row["Sample"], row["Library"], row["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = ReadGroup()
                subtable.Size = int(row["Size"])
                set_in(table, key, subtable)

            assert int(subtable.Size) == int(row["Size"])
            for field in ReadGroup.__slots__:
                if field == "Size":
                    # Size is a fixed property of the contig, not a counter
                    continue
                subtable[field] += int(row.get(field, 0))
def _read_coverage_tables(cls, key, filenames):
    """Sum the "Hits" and "M" columns for `key` across coverage tables.

    Every file in `filenames` is read into a fresh table with
    `read_coverage_table`; the per-contig entries found at `key` are then
    added up. `cls` is not used by the body.

    Returns a (hits, nts) tuple; both are 0 if `filenames` is empty.

    Raises NodeError if a file contains no entry for `key`.
    """
    total_hits = 0
    total_nts = 0
    for filename in filenames:
        coverage = {}
        read_coverage_table(coverage, filename)

        contigtables = get_in(coverage, key)
        if contigtables is None:
            message = ("Error reading table %r; row not found:"
                       "\n %s ...\n\nIf files have been renamed "
                       "during the run, then please remove this file "
                       "in that it may be re-generated.\nHowever, "
                       "note that read-group tags in the BAM files "
                       "may not be correct!")
            raise NodeError(message % (filename, " ".join(key)))

        for per_contig in contigtables.itervalues():
            total_hits += per_contig["Hits"]
            total_nts += per_contig["M"]

    return total_hits, total_nts
def read_table(table, filename):
    """Merge the read-group counts found in `filename` into `table`.

    Rows are grouped by the tuple (Name, Sample, Library, Contig). A row is
    ignored when any of these four values is "*". A ReadGroup is created for
    a group the first time it is seen, with `Size` set from the row; each
    row then increments every slot of the ReadGroup other than "Size" by the
    row's integer value for that slot (0 if the column is absent).

    Raises AssertionError if a row reports a "Size" that differs from the
    one stored for its group.
    """
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            group_key = tuple(
                record[column]
                for column in ("Name", "Sample", "Library", "Contig")
            )
            if "*" in group_key:
                continue

            size = int(record["Size"])
            group = get_in(table, group_key)
            if group is None:
                group = ReadGroup()
                group.Size = size
                set_in(table, group_key, group)

            assert int(group.Size) == size
            for slot in ReadGroup.__slots__:
                if slot != "Size":
                    group[slot] += int(record.get(slot, 0))
def test_get_in__get_value_two_keywords():
    """A two-element key path returns the value nested two levels deep."""
    nested = {1: {2: 3}}
    result = utils.get_in(nested, [1, 2])
    assert result == 3
def _open_se_sink(reads_type):
    """Ensure a single-end sink exists for `reads_type`; return its cache key.

    The key is (name, reads_type). If `sink_cache` holds no truthy entry for
    it, a ReadSink is opened under `config.destination`, using the filename
    derived from `destination` and the lower-cased read type, and cached.
    `name`, `destination`, `config` and `sink_cache` come from the enclosing
    scope.
    """
    key = (name, reads_type)
    if get_in(sink_cache, key):
        return key

    filename = ReadSink.get_filename(destination, reads_type.lower())
    sink = ReadSink.open(config.destination, filename)
    set_in(sink_cache, key, sink)
    return key
def test_get_in__get_default_three_keywords_fail_at_first():
    """A key path whose first element is absent yields the default, None."""
    nested = {1: {2: {3: 4}}}
    result = utils.get_in(nested, [2, 2, 4])
    assert result is None
def test_get_in__get_default_one_keyword():
    """A single missing key yields the default, None."""
    flat = {1: 2}
    result = utils.get_in(flat, [2])
    assert result is None
def test_get_in__get_value_one_keyword():
    """A single present key returns its value."""
    flat = {1: 2}
    result = utils.get_in(flat, [1])
    assert result == 2
def _open_se_sink(reads_type):
    """Return the cache key of the single-end sink for `reads_type`.

    The sink is opened and stored in `sink_cache` only when `get_in` finds
    no truthy entry under (name, reads_type). Its filename comes from
    `ReadSink.get_filename(destination, reads_type.lower())` and it is
    opened relative to `config.destination`. `name`, `destination`, `config`
    and `sink_cache` are taken from the enclosing scope.
    """
    cache_key = (name, reads_type)
    already_open = get_in(sink_cache, cache_key)
    if not already_open:
        sink_filename = ReadSink.get_filename(destination, reads_type.lower())
        set_in(sink_cache, cache_key,
               ReadSink.open(config.destination, sink_filename))
    return cache_key