示例#1
0
def _calculate_totals(table):
    """Add "*" (total) entries to a name -> sample -> library nested table.

    For every library, set_in is called with the key (name, sample, library)
    and the result of _calculate_totals_in(contigs); per-sample totals are
    stored under the library key "*", and per-name totals under the sample
    and library keys "*" / "*". In addition, a "Size" value is copied from a
    lower level into each of the "*" entries.

    The table is updated through set_in and nothing is returned.
    """
    # sorted() builds a new list from the items, so each loop iterates over a
    # snapshot rather than over the dicts that are handed to set_in.
    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                set_in(table, (name, sample, library),
                       _calculate_totals_in(contigs))
            set_in(table, (name, sample, "*"), _calculate_totals_in(libraries))
            # 'library' is the value left behind by the loop above, i.e. the
            # last library in sorted order; it is unbound (or stale from a
            # previous sample) if 'libraries' was empty.
            # NOTE(review): presumably "Size" is identical for all libraries,
            # so that any one of them may be used -- confirm.
            set_in(table, (name, sample, "*", "*", "Size"),
                   get_in(table, (name, sample, library, "*", "Size")))
        # NOTE(review): the whole table is passed here, rather than 'samples';
        # confirm that this is what _calculate_totals_in expects.
        set_in(table, (name, "*", "*"), _calculate_totals_in(table))
        # As above, 'sample' is the last sample visited by the loop over
        # 'samples', and is unbound / stale if 'samples' was empty.
        set_in(table, (name, "*", "*", "*", "Size"),
               get_in(table, (name, sample, "*", "*", "Size")))
示例#2
0
 def _open_se_sink(reads_type):
     """Ensure that a ReadSink is cached for `reads_type`; return its key.

     The sink is looked up in `sink_cache` under (name, reads_type), and a
     new sink is only opened if no truthy entry is found there. `name`,
     `destination`, `config` and `sink_cache` are free variables that are
     resolved from the surrounding scope.
     """
     key = (name, reads_type)
     if get_in(sink_cache, key):
         return key

     filename = ReadSink.get_filename(destination, reads_type.lower())
     sink = ReadSink.open(config.destination, filename)
     set_in(sink_cache, key, sink)
     return key
示例#3
0
    def _initialize_tables(cls, target_name, intervals, readgroups):
        """Build zeroed counter tables for a set of readgroups.

        `intervals` maps names to sequences of (_, start, end) tuples; for
        each name a table of zeroed counters is created, with "Size" set to
        the sum of (end - start) over its tuples.

        Returns a (tables, mapping) tuple: `tables` is filled through set_in
        using (target_name, rg["SM"], rg["LB"]) as key, and `mapping` maps
        each rg["ID"] to the table used for that readgroup. A deep copy of
        the counter tables is made whenever get_in finds no (truthy) table
        for the readgroup's key.
        """
        counters = ("SE", "PE_1", "PE_2", "Collapsed", "Hits", "M", "I", "D")

        subtables = {}
        for (name, regions) in intervals.iteritems():
            subtable = dict.fromkeys(counters, 0)
            subtable["Size"] = sum((end - start)
                                   for (_, start, end) in regions)
            subtables[name] = subtable

        tables = {}
        mapping = {}
        for readgroup in readgroups.itervalues():
            table_key = (target_name, readgroup["SM"], readgroup["LB"])

            library_table = get_in(tables, table_key, None)
            if not library_table:
                library_table = copy.deepcopy(subtables)

            set_in(tables, table_key, library_table)
            mapping[readgroup["ID"]] = library_table

        return tables, mapping
示例#4
0
def create_or_get_subtable(table, subtable_key, size):
    """Return the subtable at `subtable_key`, creating it if it is missing.

    A missing subtable (get_in returns None) is created as a shallow copy
    of READGROUP_TEMPLATE with "Size" set to `size`, and is stored in
    `table` using set_in before being returned.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    created = dict(READGROUP_TEMPLATE)
    created["Size"] = size
    set_in(table, subtable_key, created)
    return created
示例#5
0
def create_or_get_subtable(table, subtable_key, size):
    """Fetch the subtable stored at `subtable_key`, adding one if needed.

    If get_in returns None for the key, a new dict is built from
    READGROUP_TEMPLATE, its "Size" is set to `size`, and it is registered
    in `table` via set_in. The (existing or new) subtable is returned.
    """
    result = get_in(table, subtable_key)
    if result is None:
        # Shallow copy, so that the shared template itself is never modified
        result = dict(READGROUP_TEMPLATE, Size=size)
        set_in(table, subtable_key, result)

    return result
示例#6
0
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Add the coordinates of a single GTF entry to a nested table.

    The entry is stored under the key (gene_biotype, gene_id, transcript_id,
    exon_number, feature), with exon_number converted to an int. The contig
    name is prefixed with `contig_prefix`; if the resulting name is found in
    `scaffolds`, the contig is replaced by the scaffold's "chrom" and the
    scaffold's "chromStart" is added to both coordinates.

    An AssertionError is raised if get_in already returns a truthy value
    for the key.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()

    keys = (
        properties["gene_biotype"],
        properties["gene_id"],
        properties["transcript_id"],
        int(properties["exon_number"]),
        gtf.feature,
    )

    contig_name = contig_prefix + gtf.contig
    start = gtf.start
    # In pysam, 'end' equals the past-the-end position
    end = gtf.end - 1
    strand = gtf.strand
    feature = gtf.feature
    transcript = properties["transcript_id"]

    if contig_name in scaffolds:
        # Translate scaffold-relative coordinates to chromosome coordinates
        scaffold = scaffolds[contig_name]
        contig_name = scaffold["chrom"]
        offset = int(scaffold["chromStart"])
        start += offset
        end += offset

    record = {
        "contig": contig_name,
        "start": start,
        "end": end,
        "strand": strand,
        "feature": feature,
        "transcript": transcript,
    }

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
示例#7
0
 def _read_coverage_tables(cls, key, filenames):
     """Sum the "Hits" and "M" counts found under `key` in coverage tables.

     Each file is read into its own, initially empty, table using
     read_coverage_table, and the values of the entry found at `key` are
     treated as per-contig tables.

     Returns a (hits, nts) tuple containing the summed "Hits" and "M" values.
     """
     total_hits = 0
     total_nts = 0
     for filename in filenames:
         coverage = {}
         read_coverage_table(coverage, filename)

         contigtables = get_in(coverage, key)
         for contigtable in contigtables.itervalues():
             total_hits += contigtable["Hits"]
             total_nts += contigtable["M"]

     return total_hits, total_nts
示例#8
0
    def _collect_clade_from(self, cache, p_node, c_node):
        """Collect the leaves reachable from `c_node`, not passing `p_node`.

        Results are looked up in `cache` under (p_node, c_node). If nothing
        (or an empty clade) is found there, the clade is built by recursing
        into every node connected to `c_node` except `p_node`, and a
        frozenset copy of it is stored in the cache.

        Returns the cached value if one was found, and otherwise the newly
        built set.
        """
        cache_key = (p_node, c_node)
        clade = get_in(cache, cache_key, set())
        if clade:
            return clade

        if self.is_leaf(c_node):
            clade.add(c_node)

        for neighbour in self.connections[c_node]:
            if neighbour == p_node:
                # Never walk back in the direction we came from
                continue

            clade.update(self._collect_clade_from(cache, c_node, neighbour))

        set_in(cache, cache_key, frozenset(clade))
        return clade
示例#9
0
    def _collect_clade_from(self, cache, p_node, c_node):
        """Return the leaf nodes found when walking from `p_node` to `c_node`.

        The (p_node, c_node) pair is used as the key into `cache`; a
        non-empty cached clade is returned as is. Otherwise `c_node` is
        added if self.is_leaf reports it to be a leaf, the clades of all
        nodes connected to `c_node` (other than `p_node`) are merged in
        recursively, and a frozenset of the result is written to the cache.
        """
        edge = (p_node, c_node)
        leaves = get_in(cache, edge, set())
        if leaves:
            return leaves

        if self.is_leaf(c_node):
            leaves.add(c_node)

        # Skip 'p_node' so that the walk never doubles back on itself
        children = [node for node in self.connections[c_node]
                    if node != p_node]
        for child in children:
            leaves.update(self._collect_clade_from(cache, c_node, child))

        set_in(cache, edge, frozenset(leaves))
        return leaves
示例#10
0
    def _read_raw_bam_stats(self, table):
        """Store the total number of raw hits for each library in `table`.

        For every (genome, target, sample, library) entry in
        self._in_raw_bams, the listed coverage tables are read into one
        shared table, and the "Hits" of all contig tables found under
        (target, sample, library) are summed. The sum is stored, together
        with a description, under the key "hits_raw(<genome>)".
        """
        for (group, filenames) in self._in_raw_bams.iteritems():
            (genome, target, sample, library) = group

            coverage = {}
            for filename in filenames:
                read_coverage_table(coverage, filename)

            contigtables = get_in(coverage, (target, sample, library))
            hits = sum(contigtable["Hits"]
                       for contigtable in contigtables.itervalues())

            stat_key = (target, sample, library, genome,
                        "hits_raw(%s)" % genome)
            stat = (hits, "# Total number of hits (prior to PCR duplicate filtering)")
            set_in(table, stat_key, stat)
示例#11
0
    def _read_table(self, key, table, filename):
        """Add the depth histogram(s) found in `filename` to `table`.

        Each line is split on tabs; the fourth-to-last column is used as the
        depth (capped at _MAX_DEPTH), and the third-to-last column as the
        count that is added for that depth. Counts are kept in lists of
        _MAX_DEPTH + 1 integers, stored at `key + (name,)`, where `name` is
        None for the catch-all rows, the 4th column for rows with more than
        7 columns, and the 1st column for all other rows.

        Rows other than the catch-all rows are skipped for as long as
        self._max_contigs_reached is true.
        """
        if self._intervals:
            all_key = "all"
        else:
            all_key = "genome"

        with open(filename) as handle:
            for line in handle:
                fields = line.split("\t")
                num_fields = len(fields)

                # 'all' is generated by coverageBed, as a catchall group
                if (num_fields == 5) and (fields[0] == all_key):
                    name = None
                elif self._max_contigs_reached:
                    continue
                elif num_fields > (3 + 4):
                    # Probably a BED6 file, 4 columns are from bedTools
                    name = fields[3]
                else:
                    name = fields[0]

                histogram_key = key + (name,)
                if not get_in(table, histogram_key, None):
                    # One slot for each depth in the range 0 .. _MAX_DEPTH
                    set_in(table, histogram_key, [0] * (_MAX_DEPTH + 1))

                depth = min(_MAX_DEPTH, int(fields[-4]))
                count = int(fields[-3])
                get_in(table, histogram_key)[depth] += count
示例#12
0
    def _read_table(self, key, table, filename):
        """Merge the per-depth counts listed in `filename` into `table`.

        Lines are split on tabs. Rows with exactly 5 columns, of which the
        first equals "all" (if self._intervals is set) or "genome" (if not),
        are recorded under the name None; any other row is ignored while
        self._max_contigs_reached is true, and is otherwise named after its
        4th column (more than 7 columns) or its 1st column.

        For each row, the count in the third-to-last column is added to the
        list stored at `key + (name,)`, at the index given by the
        fourth-to-last column, capped at _MAX_DEPTH.
        """
        all_key = "all" if self._intervals else "genome"
        empty_histogram_size = _MAX_DEPTH + 1

        with open(filename) as handle:
            for line in handle:
                columns = line.split("\t")
                # 'all' is generated by coverageBed, as a catchall group
                is_catchall = (columns[0] == all_key) and (len(columns) == 5)

                if is_catchall:
                    name = None
                elif self._max_contigs_reached:
                    continue
                elif len(columns) > (3 + 4):
                    # Probably a BED6 file, 4 columns are from bedTools
                    name = columns[3]
                else:
                    name = columns[0]

                ckey = key + (name,)
                if not get_in(table, ckey, None):
                    set_in(table, ckey, [0] * empty_histogram_size)

                depth = min(_MAX_DEPTH, int(columns[-4]))
                get_in(table, ckey)[depth] += int(columns[-3])
示例#13
0
def convert_reads(config, destination, record, sink_cache):
    """Write the reads in the BAM files of `record` to cached read sinks.

    Output is placed in a sub-folder of `destination` named after the
    "PU_src" tag of the record; the folder is created below
    config.destination. Sinks are kept in `sink_cache`, keyed by the source
    name and the type of reads, and are only opened when no sink is cached.

    BAMs of type "Paired" or "Processed" are written to a paired-end sink
    (a "Single" sink is opened alongside it), while every other type is
    written to a single-end sink of that type. Only records passing the
    config.min_quality and config.min_length checks are written.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        key = (name, reads_type)
        if not get_in(sink_cache, key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            sink = ReadSink.open(config.destination, filename)
            set_in(sink_cache, key, sink)
        return key

    def _open_pe_sink():
        key = (name, "Paired")
        if not get_in(sink_cache, key):
            sink = PEReadSink.open(config.destination, destination)
            set_in(sink_cache, key, sink)
        return key

    def _keep_record(read):
        # NOTE(review): 'qual' is compared with config.min_quality as is;
        # confirm that this is the intended attribute.
        if not (read.qual >= config.min_quality):
            return False
        return len(read.seq) >= config.min_length

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Single")
            key = _open_pe_sink()
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                sink.write_records(read for read in handle
                                   if _keep_record(read))
示例#14
0
文件: remap.py 项目: CarlesV/paleomix
def convert_reads(config, destination, record, sink_cache):
    """Copy reads from the BAM files of `record` into (cached) read sinks.

    The "PU_src" tag of the record is used to name the output folder below
    `destination`, which is created relative to config.destination. Sinks
    are stored in `sink_cache` under (source name, reads type), so that an
    already opened sink is re-used.

    "Paired" and "Processed" BAMs go to the paired-end sink, with a "Single"
    sink being opened as well; all other types go to a single-end sink named
    after the type. Records failing the config.min_quality or
    config.min_length checks are not written.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        key = (name, reads_type)
        if get_in(sink_cache, key):
            return key

        filename = ReadSink.get_filename(destination, reads_type.lower())
        set_in(sink_cache, key, ReadSink.open(config.destination, filename))
        return key

    def _select_records(handle):
        # NOTE(review): 'qual' is compared with config.min_quality as is;
        # confirm that this is the intended attribute.
        for read in handle:
            if (read.qual >= config.min_quality) and \
                    (len(read.seq) >= config.min_length):
                yield read

    paired_key = (name, "Paired")
    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type not in ("Paired", "Processed"):
            key = _open_se_sink(reads_type)
        else:
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Single")

            key = paired_key
            if not get_in(sink_cache, key):
                pe_sink = PEReadSink.open(config.destination, destination)
                set_in(sink_cache, key, pe_sink)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                sink.write_records(_select_records(handle))
示例#15
0
    def _read_lib_bam_stats(self, table):
        """Store the number of unique hits and nucleotides for each library.

        For every (genome, target, sample, library) entry in
        self._in_lib_bams, the listed coverage tables are read into one
        shared table, and the "Hits" and "M" values of all contig tables
        found under (target, sample, library) are summed. The sums are
        stored in `table` under the keys "hits_unique(<genome>)" and
        "hits_unique_nts(<genome>)" respectively.
        """
        for (group, filenames) in self._in_lib_bams.iteritems():
            (genome, target, sample, library) = group

            coverage = {}
            for filename in filenames:
                read_coverage_table(coverage, filename)

            hits = 0
            nts = 0
            contigtables = get_in(coverage, (target, sample, library))
            for contigtable in contigtables.itervalues():
                hits += contigtable["Hits"]
                nts += contigtable["M"]

            prefix = (target, sample, library, genome)
            hits_stat = (hits, "# Total number of hits (excluding any PCR duplicates)")
            set_in(table, prefix + ("hits_unique(%s)" % genome,), hits_stat)
            set_in(table, prefix + ("hits_unique_nts(%s)" % genome,),
                   (nts, None))
示例#16
0
def read_table(table, filename):
    """Add the counts listed in a padded table file to `table`.

    Records are parsed using parse_padded_table, and are stored under the
    key (Name, Sample, Library, Contig); records in which any of these
    fields equals "*" are skipped. The counter fields of a record are added
    to the values of the subtable found at that key, with missing values
    being treated as 0.

    An AssertionError is raised if the "Size" of a record differs from the
    "Size" of the subtable it is added to.
    """
    counters = ("Hits", "SE", "PE_1", "PE_2", "Collapsed", "M", "I", "D")

    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"],
                   record["Sample"],
                   record["Library"],
                   record["Contig"])
            if "*" in key:
                continue

            size = int(record["Size"])
            subtable = get_in(table, key, {"Size": size})
            assert int(subtable["Size"]) == size

            for field in counters:
                previous = subtable.get(field, 0)
                subtable[field] = previous + int(record.get(field, 0))

            set_in(table, key, subtable)
示例#17
0
def read_table(table, filename):
    """Add the counts listed in a padded table file to `table`.

    Records are parsed using parse_padded_table, and are stored under the
    key (Name, Sample, Library, Contig); records in which any of these
    fields equals "*" are skipped. If get_in returns None for the key, a new
    subtable is created as a copy of READGROUP_TEMPLATE with "Size" taken
    from the record. The value of every template field except "Size" is
    then incremented by the corresponding value in the record (0 if absent).

    An AssertionError is raised if the "Size" of a record differs from the
    "Size" of the subtable it is added to.
    """
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"],
                   record["Sample"],
                   record["Library"],
                   record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])
            for field in READGROUP_TEMPLATE:
                if field == "Size":
                    # Sizes are compared above, not accumulated
                    continue

                subtable[field] += int(record.get(field, 0))
示例#18
0
def read_table(table, filename):
    """Accumulate the records of a padded table file in `table`.

    Each record returned by parse_padded_table is stored under the key
    (Name, Sample, Library, Contig), unless one of these fields equals "*",
    in which case the record is ignored. Subtables that are not yet present
    (get_in returns None) are created from READGROUP_TEMPLATE, using the
    "Size" of the record. All template fields other than "Size" are
    increased by the value found in the record, defaulting to 0.

    An AssertionError is raised if the "Size" of a record does not match
    the "Size" of its subtable.
    """
    counter_fields = [field for field in READGROUP_TEMPLATE
                      if field != "Size"]

    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"], record["Sample"], record["Library"],
                   record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])
            for field in counter_fields:
                subtable[field] += int(record.get(field, 0))
示例#19
0
    def _read_raw_bam_stats(self, table):
        """Add the number of raw hits per library to `table`.

        Iterates over self._in_raw_bams, which maps (genome, target, sample,
        library) tuples to lists of filenames. The coverage tables in these
        files are read into a single table, after which the "Hits" of every
        contig table found at (target, sample, library) are added up and
        stored, along with a description, as "hits_raw(<genome>)".
        """
        for (group, filenames) in self._in_raw_bams.iteritems():
            (genome, target, sample, library) = group

            subtable = {}
            for filename in filenames:
                read_coverage_table(subtable, filename)

            hits = 0
            library_key = (target, sample, library)
            for contigtable in get_in(subtable, library_key).itervalues():
                hits += contigtable["Hits"]

            description = \
                "# Total number of hits (prior to PCR duplicate filtering)"
            destination_key = library_key + (genome, "hits_raw(%s)" % genome)
            set_in(table, destination_key, (hits, description))
示例#20
0
    def _read_coverage_tables(cls, key, filenames):
        """Sum the "Hits" and "M" counts found under `key` in coverage tables.

        Each file is read into its own, initially empty, table using
        read_coverage_table, and the values of the entry found at `key` are
        treated as per-contig tables.

        Returns a (hits, nts) tuple containing the summed "Hits" and "M"
        values. A NodeError is raised if get_in returns None for `key` in
        any of the tables.
        """
        total_hits = 0
        total_nts = 0
        for filename in filenames:
            coverage = {}
            read_coverage_table(coverage, filename)

            contigtables = get_in(coverage, key)
            if contigtables is None:
                message = ("Error reading table %r; row not found:"
                           "\n   %s   ...\n\nIf files have been renamed "
                           "during the run, then please remove this file "
                           "in that it may be re-generated.\nHowever, "
                           "note that read-group tags in the BAM files "
                           "may not be correct!"
                           % (filename, "   ".join(key)))
                raise NodeError(message)

            for contigtable in contigtables.itervalues():
                total_hits += contigtable["Hits"]
                total_nts += contigtable["M"]

        return total_hits, total_nts
示例#21
0
    def _read_lib_bam_stats(self, table):
        """Add the number of unique hits and nucleotides per library.

        Iterates over self._in_lib_bams, which maps (genome, target, sample,
        library) tuples to lists of filenames. The coverage tables in these
        files are read into a single table, after which the "Hits" and "M"
        values of every contig table found at (target, sample, library) are
        added up. The totals are stored as "hits_unique(<genome>)", along
        with a description, and as "hits_unique_nts(<genome>)".
        """
        for (group, filenames) in self._in_lib_bams.iteritems():
            (genome, target, sample, library) = group

            subtable = {}
            for filename in filenames:
                read_coverage_table(subtable, filename)

            library_key = (target, sample, library)
            hits = 0
            nts = 0
            for contigtable in get_in(subtable, library_key).itervalues():
                hits += contigtable["Hits"]
                nts += contigtable["M"]

            description = \
                "# Total number of hits (excluding any PCR duplicates)"
            hits_key = library_key + (genome, "hits_unique(%s)" % genome)
            nts_key = library_key + (genome, "hits_unique_nts(%s)" % genome)

            set_in(table, hits_key, (hits, description))
            set_in(table, nts_key, (nts, None))
示例#22
0
    def _read_coverage_tables(cls, key, filenames):
        """Return the total "Hits" and "M" counts stored under `key`.

        Every file in `filenames` is read into a fresh table by
        read_coverage_table; the entry found at `key` is expected to hold
        one table per contig, whose "Hits" and "M" values are added up
        across all files.

        Returns the totals as a (hits, nts) tuple, and raises a NodeError
        naming the file if get_in returns None for `key`.
        """
        hits = 0
        nts = 0
        for filename in filenames:
            file_table = {}
            read_coverage_table(file_table, filename)

            per_contig = get_in(file_table, key)
            if per_contig is not None:
                for counts in per_contig.itervalues():
                    hits += counts["Hits"]
                    nts += counts["M"]
                continue

            raise NodeError(
                "Error reading table %r; row not found:"
                "\n   %s   ...\n\nIf files have been renamed "
                "during the run, then please remove this file "
                "in that it may be re-generated.\nHowever, "
                "note that read-group tags in the BAM files "
                "may not be correct!" % (filename, "   ".join(key))
            )

        return hits, nts
示例#23
0
def test_get_in__get_default_one_keyword():
    # A missing key yields None when no explicit default is given
    table = {1: 2}
    result = utils.get_in(table, [2])
    assert_equal(result, None)
示例#24
0
def test_get_in__get_value_three_keywords():
    # Three keys are followed through three levels of nesting
    table = {1: {2: {3: 4}}}
    result = utils.get_in(table, [1, 2, 3])
    assert_equal(result, 4)
示例#25
0
def test_get_in__get_value_two_keywords():
    # Two keys are followed through two levels of nesting
    table = {1: {2: 3}}
    result = utils.get_in(table, [1, 2])
    assert_equal(result, 3)
示例#26
0
def test_get_in__get_value_one_keyword():
    # A single key behaves like a plain dictionary lookup
    table = {1: 2}
    result = utils.get_in(table, [1])
    assert_equal(result, 2)
示例#27
0
def test_get_in__get_default_three_keywords_fail_at_third_with_default():
    # The explicit default is returned when only the last key is missing
    table = {1: {2: {3: 4}}}
    result = utils.get_in(table, [1, 2, 4], "other")
    assert_equal(result, "other")
示例#28
0
def test_get_in__get_default_one_keyword():
    # Looking up a key that is not present returns None by default
    keys = [2]
    expected = None
    assert_equal(utils.get_in({1: 2}, keys), expected)
示例#29
0
def test_get_in__get_default_one_keyword_with_default():
    # Looking up a key that is not present returns the supplied default
    keys = [2]
    expected = "other"
    assert_equal(utils.get_in({1: 2}, keys, "other"), expected)
示例#30
0
def test_get_in__get_default_three_keywords_fail_at_third():
    # None is returned when the first two keys match but the third does not
    keys = [1, 2, 4]
    expected = None
    assert_equal(utils.get_in({1: {2: {3: 4}}}, keys), expected)
示例#31
0
def test_get_in__get_default_three_keywords_fail_at_third_with_default():
    # The supplied default is returned when the third key does not match
    keys = [1, 2, 4]
    expected = "other"
    assert_equal(utils.get_in({1: {2: {3: 4}}}, keys, "other"), expected)
示例#32
0
def test_get_in__iterator_keywords():
    # Keys may be given as a one-shot iterator rather than as a sequence
    keys = iter([1, 2, 3])
    expected = 4
    assert_equal(utils.get_in({1: {2: {3: 4}}}, keys), expected)
示例#33
0
def test_get_in__get_default_one_keyword_with_default():
    # A missing key yields the explicit default
    table = {1: 2}
    result = utils.get_in(table, [2], "other")
    assert_equal(result, "other")
示例#34
0
def test_get_in__get_default_three_keywords_fail_at_third():
    # None is returned when only the last of three keys is missing
    table = {1: {2: {3: 4}}}
    result = utils.get_in(table, [1, 2, 4])
    assert_equal(result, None)
示例#35
0
def test_get_in__get_value_three_keywords():
    # The innermost value is returned when all three keys are present
    keys = [1, 2, 3]
    expected = 4
    assert_equal(utils.get_in({1: {2: {3: 4}}}, keys), expected)
示例#36
0
def test_get_in__iterator_keywords():
    # The keys are supplied as an iterator instead of a list
    table = {1: {2: {3: 4}}}
    result = utils.get_in(table, iter([1, 2, 3]))
    assert_equal(result, 4)
示例#37
0
def test_get_in__get_value_two_keywords():
    # The inner value is returned when both keys are present
    keys = [1, 2]
    expected = 3
    assert_equal(utils.get_in({1: {2: 3}}, keys), expected)
示例#38
0
def test_get_in__get_value_one_keyword():
    # The value is returned for a single key that is present
    keys = [1]
    expected = 2
    assert_equal(utils.get_in({1: 2}, keys), expected)
示例#39
0
文件: remap.py 项目: CarlesV/paleomix
 def _open_se_sink(reads_type):
     """Open a ReadSink for `reads_type` unless one is cached; return its key.

     The cache key is (name, reads_type). When get_in finds no truthy entry
     for it in `sink_cache`, a sink is opened below config.destination, using
     the filename returned by ReadSink.get_filename for the lower-cased type,
     and is stored via set_in. `name`, `destination`, `config` and
     `sink_cache` are free variables resolved from the surrounding scope.
     """
     key = (name, reads_type)
     already_open = get_in(sink_cache, key)
     if not already_open:
         filename = ReadSink.get_filename(destination, reads_type.lower())
         new_sink = ReadSink.open(config.destination, filename)
         set_in(sink_cache, key, new_sink)

     return key