Example #1
def WASP(chrom, gpos, genotype, snps, tagval, read, debug=False):
    """
    Do the WASP-style allele swap
    """

    name = read.query_name
    pos = read.query_alignment_start
    name = name + "_-_" + chrom + ":" + str(gpos[pos])

    # Read attributes
    q = pysam.array_to_qualitystring(read.query_qualities)
    r = read.query_sequence

    chrn = re.sub("^[Cc]hr", "", chrom)
    if (tagval == 1):
        for i in range(len(genotype)):
            if gpos[i] is not None:
                if (str(chrn), int(gpos[i]),
                        '1') in snps.keys() and (str(chrn), int(
                            gpos[i]), '2') in snps.keys():
                    r = replace_str_index(r, i,
                                          snps[(str(chrn), int(gpos[i]), '2')])
    if (tagval == 2):
        for i in range(len(genotype)):
            if gpos[i] is not None:
                if (str(chrn), int(gpos[i]),
                        '1') in snps.keys() and (str(chrn), int(
                            gpos[i]), '2') in snps.keys():
                    r = replace_str_index(r, i,
                                          snps[(str(chrn), int(gpos[i]), '1')])

    o = "@" + name + "\n" + r + "\n+\n" + q + "\n"
    return (o)
Example #2
def out2fq(reads, fout):
    if reads.is_reverse:
        fout.write("@{0}\n{1}\n+\n{2}\n".format(
            reads.qname, reads.get_forward_sequence(),
            pysam.array_to_qualitystring(reads.get_forward_qualities())))
    else:
        fout.write("@{0}\n{1}\n+\n{2}\n".format(reads.qname, reads.query,
                                                reads.qqual))
Example #3
    def Clip(self, readStart, readEnd):
        """
        Clip this read to [readStart:readEnd), and return a copy of
        pysam.calignmentfile.AlignedSegment object.
        Assume that read.bamRecord.peer is an unmapped AlignedSegment.
        """
        new_query_name = "%s/%s/%d_%d" % (self.movieName, self.holeNumber,
                                          readStart, readEnd)
        if not (readStart >= self.readStart and readStart <= readEnd and
                readEnd <= self.readEnd):
            raise ValueError("Unable to clip subread %s from read %s." %
                             (new_query_name, self.readName))

        s, e = readStart - self.readStart, readEnd - self.readStart
        QV_TAGS = ["iq", "dq", "dt", "st", "sq", "mq", "ip", "pw"]

        # Create an unaligned pysam.AlignedSegment object.
        ret = pysam.AlignedSegment()
        ret.query_name = new_query_name

        peer = self.bamRecord.peer
        ret.query_sequence = peer.query_sequence[s:e]
        ret.flag = peer.flag

        assert peer.reference_id == -1
        assert peer.reference_start == -1
        assert peer.cigartuples is None
        assert peer.next_reference_id == -1
        assert peer.next_reference_start == -1
        assert peer.template_length == 0

        ret.reference_id = peer.reference_id
        ret.reference_start = peer.reference_start
        ret.cigar = []
        ret.next_reference_id = peer.next_reference_id
        ret.next_reference_start = peer.next_reference_start
        ret.template_length = peer.template_length

        ret.mapping_quality = peer.mapping_quality
        if peer.query_qualities is None:
            ret.query_qualities = None
        else:
            ret.query_qualities = pysam.array_to_qualitystring(peer.query_qualities)[s:e]

        tags = peer.tags[::]
        for index, (tag_name, tag_val) in enumerate(tags):
            if tag_name in QV_TAGS:
                if self.__len__() != len(tag_val):
                    raise ValueError("%s's %s length %d ! = sequence length %d" %
                                     (peer.query_name, tag_name, len(tag_val), self.__len__()))
                tags[index] = (tag_name, tag_val[s:e])
            elif tag_name == 'qs':
                tags[index] = (tag_name, int(readStart))
            elif tag_name == 'qe':
                tags[index] = (tag_name, int(readEnd))

        ret.tags = tags
        return ret
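Example #4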
    def test_slideseq2(self):
        bam_fp = pipeline.convert_fastqs_to_unmapped_bam(
            self.fastq_slideseq2_fps,
            "slideseq2",
            tempfile.mkdtemp(),
            name="test",
        )
        with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
            alignments = list(f.fetch(until_eof=True))
        self.assertEqual(2, len(alignments))
        self.assertEqual(
            [
                "NB501583:801:H7JLTBGXH:1:11101:20912:1050",
                "NB501583:801:H7JLTBGXH:1:11101:8670:1050",
            ],
            [al.query_name for al in alignments],
        )
        self.assertEqual(
            [
                read.sequence
                for read in ngs.fastq.Fastq(self.fastq_slideseq2_fps[1])
            ],
            [al.query_sequence for al in alignments],
        )
        self.assertEqual(
            [
                read.qualities.string
                for read in ngs.fastq.Fastq(self.fastq_slideseq2_fps[1])
            ],
            [
                pysam.array_to_qualitystring(al.query_qualities)
                for al in alignments
            ],
        )
        self.assertEqual(
            {
                ("UR", "TTTTTTTTT"),
                ("UY", "EEEEEEEEE"),
                ("CR", "CTTTGNTCAATGTT"),
                ("CY", "AAAAA#EEAEEEEE"),
                ("RG", "test"),
            },
            set(alignments[0].get_tags()),
        )
        self.assertEqual(
            {
                ("UR", "AGTGTCTCA"),
                ("UY", "EAEAEAEEE"),
                ("CR", "CTCTTNATCCTCAT"),
                ("CY", "AAAAA#EEE/EAE/"),
                ("RG", "test"),
            },
            set(alignments[1].get_tags()),
        )
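Example #5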
    def test_indropsv3(self):
        bam_fp = pipeline.convert_fastqs_to_unmapped_bam(
            self.fastq_indropsv3_fps,
            "indropsv3",
            tempfile.mkdtemp(),
            name="test",
        )
        with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
            alignments = list(f.fetch(until_eof=True))
        self.assertEqual(2, len(alignments))
        self.assertEqual(
            [
                "M03718:773:000000000-JKHP3:1:1101:18272:1693",
                "M03718:773:000000000-JKHP3:1:1101:17963:1710",
            ],
            [al.query_name for al in alignments],
        )
        self.assertEqual(
            [
                read.sequence
                for read in ngs.fastq.Fastq(self.fastq_10xv3_fps[1])
            ],
            [al.query_sequence for al in alignments],
        )
        self.assertEqual(
            [
                read.qualities.string
                for read in ngs.fastq.Fastq(self.fastq_10xv3_fps[1])
            ],
            [
                pysam.array_to_qualitystring(al.query_qualities)
                for al in alignments
            ],
        )
        self.assertEqual(
            {
                ("UR", "CCAAAA"),
                ("UY", "FFBFGG"),
                ("CR", "TACGTCATCTCCTACG"),
                ("CY", "1111AFAF1111AFAF"),
                ("RG", "test"),
            },
            set(alignments[0].get_tags()),
        )
        self.assertEqual(
            {
                ("UR", "TTAGAA"),
                ("UY", "FAAAFF"),
                ("CR", "TTAGATCGTTAGATCG"),
                ("CY", "1>>11DFA1>>11DFA"),
                ("RG", "test"),
            },
            set(alignments[1].get_tags()),
        )
Example #6
def read_bam(file):

    if file.endswith(".bam"):
        fh = pysam.AlignmentFile(file, "rb", check_sq=False)
    elif file.endswith(".sam"):
        fh = pysam.AlignmentFile(file, 'r')
    else:
        raise Exception("%r file format error" % file)

    for line in fh:
        #yield [line.qname, line.seq, pysam.array_to_qualitystring(line.query_qualities), line.get_tag('rq')]
        yield line.qname, line.seq, pysam.array_to_qualitystring(
            line.query_qualities)

    fh.close()
Example #7
def convert_bam_to_df(data_fp: str) -> pd.DataFrame:
    """Converts a BAM file to a Pandas dataframe.

    Args:
        data_fp: The input filepath for the BAM file to be converted.

    Returns:
        A Pandas dataframe containing the BAM information.
    """
    als = []
    with pysam.AlignmentFile(data_fp, ignore_truncation=True,
                             check_sq=False) as bam_fh:
        for al in bam_fh:
            cellBC, UMI, readCount, grpFlag = al.query_name.split("_")
            seq = al.query_sequence
            qual = al.query_qualities
            encode_qual = pysam.array_to_qualitystring(qual)
            als.append([
                cellBC,
                UMI,
                int(readCount),
                grpFlag,
                seq,
                encode_qual,
                al.query_name,
            ])
    return pd.DataFrame(
        als,
        columns=[
            "cellBC",
            "UMI",
            "readCount",
            "grpFlag",
            "seq",
            "qual",
            "readName",
        ],
    )
Example #8
    def Clip(self, readStart, readEnd):
        """
        Clip this read to [readStart:readEnd), and return a copy of
        pysam.calignmentfile.AlignedSegment object.
        Assume that read.bamRecord.peer is an unmapped AlignedSegment.
        """
        new_query_name = "%s/%s/%d_%d" % (self.movieName, self.holeNumber,
                                          readStart, readEnd)
        if not (readStart >= self.readStart and readStart <= readEnd
                and readEnd <= self.readEnd):
            raise ValueError("Unable to clip subread %s from read %s." %
                             (new_query_name, self.readName))

        s, e = readStart - self.readStart, readEnd - self.readStart
        QV_TAGS = ["iq", "dq", "dt", "st", "sq", "mq", "ip", "pw"]

        # Create an unaligned pysam.AlignedSegment object.
        ret = pysam.AlignedSegment()
        ret.query_name = new_query_name

        peer = self.bamRecord.peer
        ret.query_sequence = peer.query_sequence[s:e]
        ret.flag = peer.flag

        assert peer.reference_id == -1
        assert peer.reference_start == -1
        assert peer.cigartuples is None
        assert peer.next_reference_id == -1
        assert peer.next_reference_start == -1
        assert peer.template_length == 0

        ret.reference_id = peer.reference_id
        ret.reference_start = peer.reference_start
        ret.cigar = []
        ret.next_reference_id = peer.next_reference_id
        ret.next_reference_start = peer.next_reference_start
        ret.template_length = peer.template_length

        ret.mapping_quality = peer.mapping_quality
        if peer.query_qualities is None:
            ret.query_qualities = None
        else:
            ret.query_qualities = pysam.array_to_qualitystring(
                peer.query_qualities)[s:e]

        tags = peer.tags[::]
        for index, (tag_name, tag_val) in enumerate(tags):
            if tag_name in QV_TAGS:
                if self.__len__() != len(tag_val):
                    raise ValueError(
                        "%s's %s length %d ! = sequence length %d" %
                        (peer.query_name, tag_name, len(tag_val),
                         self.__len__()))
                tags[index] = (tag_name, tag_val[s:e])
            elif tag_name == 'qs':
                tags[index] = (tag_name, int(readStart))
            elif tag_name == 'qe':
                tags[index] = (tag_name, int(readEnd))

        ret.tags = tags
        return ret
Example #9
def reverse_converter(input_file,
                      output_dir,
                      verbose=False,
                      quiet=False,
                      progress=False,
                      **kwargs):
    """CRAM to Fast5 reverse conversion tool"""
    try:
        # Define logger with appropriate verbosity
        logger = get_logger(name="ont2cram_reverse_converter",
                            verbose=verbose,
                            quiet=quiet)

        logger.debug("Check input files")
        readable_file(input_file)
        #writable_dir(output_dir)
        check_destination_exists(input_file, output_dir)

        class Attribute:
            path = ''
            type = ''
            value = None
            is_col = False

        attr_dict = {}
        with pysam.AlignmentFile(input_file, "rc", check_sq=False) as samfile:
            logger.debug("Read CRAM header")
            for comment in samfile.header["CO"]:
                if not comment.startswith(("ATR:", "COL:")): continue

                parts = shlex.split(comment)
                part0 = parts[0].split(':')

                a = Attribute()
                a.path = part0[1]
                a.type = part0[2]
                a.value = parts[2][3:] if len(parts) == 3 else None
                a.is_col = comment.startswith("COL:")

                tag = parts[1][3:]

                attr_dict[tag] = a

            def is_hex_str(obj, expected_len, is_null_term=False):
                bytes_to_remove_end = 1 if is_null_term else 0
                return isinstance(obj, bytes) and len(obj) == expected_len and (not is_null_term or obj[-1:] == b"\0") \
                       and STR_HEX_PATTERN.match(obj[:expected_len-bytes_to_remove_end].decode())

            def get_path(hdf_path, read_number_long, read_number_short):
                if read_number_short:
                    hdf_path = hdf_path.replace("Read_YYY", read_number_short)
                if read_number_long:
                    hdf_path = hdf_path.replace("read_XXXXX", read_number_long)
                return hdf_path

            def get_path_from_dummy(hdf_path, read_number_long,
                                    read_number_short):
                p = get_path(hdf_path, read_number_long, read_number_short)
                return p.replace("/dummy_attr", "")

            def write_hdf_attr(hdf5_file, attr_path, attr_value, attr_type):
                #print(f"path={attr_path}, val={attr_value}, type={attr_type}")
                group_name, _, attr_name = attr_path.rpartition('/')
                if attr_name == "noname": raise
                try:
                    group = hdf5_file[group_name]
                except KeyError:
                    group = hdf5_file.create_group(group_name)

                try:
                    group.attrs.create(attr_name, attr_value, dtype=attr_type)
                except (TypeError, ValueError):
                    group.attrs[attr_name] = convert_type(
                        attr_value, attr_type)

            logger.debug("CRAM header")
            for read in tqdm.tqdm(samfile.fetch(until_eof=True),
                                  unit=" Reads",
                                  unit_scale=True,
                                  disable=not progress):
                COUNTER["Reads written"] += 1
                fast5_filename = read.get_tag(FILENAME_TAG)
                read_number_long = None
                try:
                    read_number_long = "read_" + str(
                        read.get_tag(READ_NUM_TAG_LONG))
                except KeyError:
                    pass

                read_number_short = None
                try:
                    read_number_short = "Read_" + str(
                        read.get_tag(READ_NUM_TAG_SHORT))
                except KeyError:
                    pass

                output_file = os.path.join(output_dir, fast5_filename)
                dir = os.path.dirname(output_file)
                if not os.path.exists(dir) and len(dir) > 0:
                    os.makedirs(dir)

                with h5py.File(output_file, "a") as f:
                    if read.query_name != "nofastq":
                        fastq_lines = np.string_("\n".join([
                            read.query_name, read.query_sequence, '+',
                            pysam.array_to_qualitystring(read.query_qualities)
                            + '\n'
                        ]))
                        f.create_dataset(
                            "/Analyses/Basecall_1D_000/BaseCalled_template/Fastq",
                            data=fastq_lines)

                    DSETS = {}
                    for tag_name, tag_val in read.get_tags():
                        if tag_name in RESERVED_TAGS: continue
                        a = attr_dict[tag_name]
                        if a.is_col:
                            dset_name, _, col_name = get_path(
                                a.path, read_number_long,
                                read_number_short).rpartition('/')

                            if dset_name.endswith(
                                    "Fastq") and col_name == "noname":
                                continue
                            if dset_name not in DSETS:
                                DSETS[dset_name] = []
                            dset = DSETS[dset_name]

                            if col_name == "noname":
                                dset.append(tag_val)
                            else:
                                dset.append(
                                    np.array(list(tag_val.split('\x03'))
                                             if a.type.startswith(
                                                 ('S', 'U')) else tag_val,
                                             dtype=[(col_name, a.type)]))
                    for dset_name, columns in DSETS.items():
                        d = columns[0] if len(
                            columns) == 1 else rfn.merge_arrays(
                                columns, flatten=True, usemask=False)
                        f.create_dataset(dset_name, data=d)

                    # write constant values stored in cram header
                    for a in attr_dict.values():
                        if a.is_col:
                            continue
                        if a.value:
                            write_hdf_attr(
                                f,
                                get_path(a.path, read_number_long,
                                         read_number_short), a.value, a.type)

                    # write tags stored in cram records
                    for tag_name, tag_val in read.get_tags():
                        if tag_name in RESERVED_TAGS: continue
                        a = attr_dict[tag_name]
                        if is_empty_hdf_node(a.path):
                            f.create_group(
                                get_path_from_dummy(a.path, read_number_long,
                                                    read_number_short))
                            continue
                        if a.is_col: continue
                        if a.value != tag_val:
                            write_hdf_attr(
                                f,
                                get_path(a.path, read_number_long,
                                         read_number_short), tag_val, a.type)

    finally:
        logger.info(dict_to_str(COUNTER))
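Example #10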
for aligned_segment in alignment_file:
    """ @type aligned_segment: pysam.libcalignedsegment.AlignedSegment """
    if vendor_filter and aligned_segment.is_qcfail:
        continue

    # Assign the AlignedSegment to its ReadGroup-specific GzipFile.
    if aligned_segment.is_read1:
        fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][0]
    else:
        fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][1]

    fifo_queue.put(
        '@' + aligned_segment.query_name + '\n' +
        aligned_segment.query_sequence + '\n' +
        '+\n' +
        pysam.array_to_qualitystring(aligned_segment.query_qualities) + '\n'
    )

    if aligned_segment.has_tag('BC'):
        # Assign the AlignedSegment to its ReadGroup-specific GzipFile.
        fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][2]

        fifo_queue.put(
            '@' + aligned_segment.query_name + '\n' +
            aligned_segment.get_tag('BC') + '\n' +
            '+\n' +
            aligned_segment.get_tag('QT') + '\n'
        )

for read_group_id in fifo_queue_dict.iterkeys():
    for fifo_queue in fifo_queue_dict[read_group_id]:
Example #11
File: network.py Project: js2264/metaTOR
def retrieve_reads_contig_wise(sam_merged, contig_data, output_dir):

    contig_dict = dict()
    opened_files = dict()

    def close_all_files():
        for my_file in opened_files.values():
            my_file.close()

    with open("contig_data.txt") as contig_data:
        for line in contig_data:
            fields = line.split()
            node = fields[0]
            core = fields[-3]
            contig_dict[node] = core

    query_getter = operator.attrgetter("query_name")

    with pysam.AlignmentFile(sam_merged, "rb") as sam_handle:
        for (my_read_name,
             alignment_pool) in itertools.groupby(sam_handle, query_getter):
            my_read_set = dict()
            my_core_set = set()
            while "Reading alignments from alignment pool":
                try:
                    my_alignment = next(alignment_pool)
                    # print(contig_dict[my_alignment.reference_name])

                    is_reverse = my_alignment.is_reverse

                    my_seq = my_alignment.query_sequence

                    my_qual = my_alignment.query_qualities
                    my_qual_string = pysam.array_to_qualitystring(my_qual)

                    if is_reverse:
                        my_seq_string = str(Seq(my_seq).reverse_complement())
                        my_qual_string = my_qual_string[::-1]
                    else:
                        my_seq_string = my_seq

                    my_seq_tuple = (my_seq_string, my_qual_string)

                    if len(my_read_set) > 2:
                        logger.warning(
                            "Something's gone wrong with read set %s, as "
                            "there are %s of them",
                            my_read_name,
                            len(my_read_set),
                        )
                    elif len(my_read_set) == 0:
                        my_read_set[my_seq_tuple] = "forward"
                    elif my_seq_tuple not in my_read_set.keys():
                        my_read_set[my_seq_tuple] = "reverse"
                    try:
                        ref = contig_dict[my_alignment.reference_name]
                        my_core_set.add(ref)
                    except KeyError:
                        my_core_set.add("unknown")
                except StopIteration:
                    if len(my_read_set) == 2:
                        for core_name in my_core_set:
                            for my_tuple, file_id in my_read_set.items():
                                if file_id == "forward":
                                    file_end = ".end1"
                                elif file_id == "reverse":
                                    file_end = ".end2"

                                basename = "{core_name}{file_end}".format(
                                    core_name, file_end)
                                filename = os.path.join(output_dir, basename)
                                try:
                                    file_to_write = opened_files[filename]
                                except KeyError:
                                    file_to_write = open(filename, "w")
                                    opened_files[filename] = file_to_write
                                except IOError:
                                    logger.error(
                                        "Error when trying to handle"
                                        "%s. Maybe there are too many opened"
                                        "files at once: %s",
                                        filename,
                                        len(opened_files),
                                    )
                                    raise

                                seq, qual = my_tuple
                                line = "@{}\n{}\n+\n{}\n".format(
                                    my_read_name, seq, qual)
                                file_to_write.write(line)
                    elif len(my_read_set) == 1:
                        for core_name in my_core_set:
                            file_end = ".end"
                            basename = "{}{}".format(core_name, file_end)
                            filename = os.path.join(output_dir, basename)
                            try:
                                file_to_write = opened_files[filename]
                            except KeyError:
                                file_to_write = open(filename, "w")
                                opened_files[filename] = file_to_write
                            except IOError:
                                logger.error(
                                    "Error when trying to handle"
                                    "%s. Maybe there are too many opened"
                                    "files at once: %s",
                                    filename,
                                    len(opened_files),
                                )
                                raise
                            seq, qual = list(my_read_set.keys())[0]
                            line = "@{}\n{}\n+\n{}\n".format(
                                my_read_name, seq, qual)
                            file_to_write.write(line)
                    else:
                        logger.warning(
                            "Something's gone wrong with read set %s, as "
                            "there are %s of them",
                            my_read_name,
                            len(my_read_set),
                        )
                    break

    close_all_files()
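
All of the examples above share one pattern: AlignedSegment.query_qualities is a numeric array, and pysam.array_to_qualitystring() turns it into the ASCII quality string needed for FASTQ output. Below is a minimal, self-contained sketch of that round trip, not taken from any of the projects above; "reads.bam" and "reads.fastq" are hypothetical file names, and pysam.qualitystring_to_array() is the documented inverse.

import pysam

with pysam.AlignmentFile("reads.bam", "rb", check_sq=False) as bam, \
        open("reads.fastq", "w") as fq:
    for read in bam.fetch(until_eof=True):
        if read.query_qualities is None:
            continue
        # Convert the numeric quality array to a FASTQ quality string.
        qual = pysam.array_to_qualitystring(read.query_qualities)
        fq.write("@{}\n{}\n+\n{}\n".format(read.query_name,
                                           read.query_sequence, qual))
        # The reverse conversion restores the array form expected by
        # AlignedSegment.query_qualities:
        # read.query_qualities = pysam.qualitystring_to_array(qual)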