示例#1
0
def test_get_blocked_alignment():
    """Exercise ``get_blocked_alignment`` on a plain read and on a split read.

    The first half uses a 50 bp read at position 0 and checks the result for
    both strands, with and without ``is_rf``.  The second half uses a
    soft-clipped read carrying an ``SA`` tag and checks the result with and
    without ``is_rf``.
    """
    blocks = [
        GenomeInterval('1', 0, 100),
        GenomeInterval('1', 110, 210),
        GenomeInterval('1', 210, 2000)
    ]
    # NOTE(review): the BAM path is specific to one developer's machine, so
    # this test can only run where that file exists.
    # The file is opened as a context manager so the handle is released even
    # when one of the assertions below fails.
    with pysam.AlignmentFile(
            '/home/jgarthur/sv/analysis/alignments/bwa_mem/short-reads/jun_jul.mdup.merge.mdup.bam',
            'rb') as bam:
        # Simple read: 50 matching bases starting at position 0.
        aln = pysam.AlignedSegment()
        aln.pos = 0
        aln.cigarstring = '50M'
        aln.seq = 'A' * 50
        aln.is_reverse = False
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert (get_blocked_alignment(aln, blocks, 0, bam) == ([1], 0))
        assert (get_blocked_alignment(aln, blocks, 0, bam,
                                      is_rf=True) == ([0], 50))
        # Flipping the strand is expected to swap the two results above.
        aln.is_reverse = True
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert (get_blocked_alignment(aln, blocks, 0, bam) == ([0], 50))
        assert (get_blocked_alignment(aln, blocks, 0, bam,
                                      is_rf=True) == ([1], 0))

        # Split read: 20 aligned bases at position 90 followed by a 20 bp
        # soft clip, with the remainder described by the SA tag.
        aln = pysam.AlignedSegment()
        aln.rname = 0
        aln.pos = 90
        aln.seq = 'A' * 40
        aln.cigarstring = '20M20S'
        aln.set_tag('SA', '1,191,-,20M20S,60,0;', 'Z')
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert (get_blocked_alignment(aln, blocks, 0, bam) == ([1, 2], -90))
        assert (get_blocked_alignment(aln, blocks, 0, bam,
                                      is_rf=True) == ([3, 0], -80))
示例#2
0
def pair_up(read_pair):
    """Return copies of two reads rewritten so that they point at each other.

    :param read_pair: two-item sequence ``(read1, read2)``; both reads must
        share the same query name.
    :returns: tuple ``(r1_cp, r2_cp)`` of deep copies whose FLAG, RNEXT,
        PNEXT and TLEN fields have been updated.  The input reads are not
        modified.

    The process exits with status 1 when the two query names differ.  A
    ``KeyError`` propagates when the pair of input flags has no entry in
    ``flag_table_proper``.
    """
    r1_cp = copy.deepcopy(read_pair[0])
    r2_cp = copy.deepcopy(read_pair[1])
    if r1_cp.query_name != r2_cp.query_name:
        # Diagnostics go to stderr so they cannot mix with data on stdout.
        print("Error: read name unmatched.\n", file=sys.stderr)
        sys.exit(1)

    # Replace both flags with the paired variant looked up from the table.
    new_flags = flag_table_proper[(r1_cp.flag, r2_cp.flag)]
    r1_cp.flag = new_flags[0]
    r2_cp.flag = new_flags[1]

    # RNEXT: "=" when both reads are on the same reference, otherwise the
    # name of the reference the mate is aligned to.
    if r1_cp.reference_name == r2_cp.reference_name:
        r1_cp.next_reference_name = r2_cp.next_reference_name = "="
    else:
        r1_cp.next_reference_name = r2_cp.reference_name
        r2_cp.next_reference_name = r1_cp.reference_name

    # PNEXT: each read records the start of its mate.
    r1_cp.next_reference_start = r2_cp.reference_start
    r2_cp.next_reference_start = r1_cp.reference_start

    # TLEN: signed distance between the two start positions.
    # NOTE(review): this is also computed when the reads are on different
    # references, where a positional difference is not meaningful - confirm
    # whether 0 is wanted in that case.
    r1_cp.template_length = r1_cp.next_reference_start - r1_cp.reference_start
    r2_cp.template_length = -r1_cp.template_length
    return (r1_cp, r2_cp)
示例#3
0
 def setUp(self):
     """Create the shared fixture: a reference, thresholds and three reads.

     ``self.args`` is an empty argparse namespace on which ``min_mq``,
     ``min_bq`` and ``cons_cov`` are set.  ``self.aln1``..``self.aln3`` are
     5 bp reads (CIGAR ``5M``, mapping quality 30, base quality 30) that
     start at reference positions 10, 13 and 15.
     """
     def _make_read(name, start, bases):
         # Qualities are assigned after the sequence, as in the original
         # fixture.
         read = pysam.AlignedSegment()
         read.reference_start = start
         read.query_name = name
         read.mapping_quality = 30
         read.query_sequence = bases
         read.query_qualities = [30] * 5
         read.cigarstring = '5M'
         return read

     self.ref = 'AAAAAAAAAAAAAAAAAAAA'

     options = argparse.ArgumentParser().parse_args([])
     options.min_mq = 30
     options.min_bq = 30
     options.cons_cov = 2
     self.args = options

     self.aln1 = _make_read('read1', 10, "AAAAA")
     self.aln2 = _make_read('read2', 13, "AAAAA")
     # The third read carries a T as its first base.
     self.aln3 = _make_read('read3', 15, "TAAAA")
示例#4
0
    def setUp(self):
        """Create quality thresholds and the three reads stored in ``self.alns``.

        * ``read1``: 16 bp, CIGAR ``16M``, starts at 10, mapping quality 30,
          all base qualities 30.
        * ``read2``: 12 bp, CIGAR ``5M2D7M``, starts at 12, mapping quality
          20, base qualities 33 except a 20 at index 3.
        * ``read3``: only a name and a mapping quality of 0.
        """
        self.mq = 30
        self.bq = 30

        first = pysam.AlignedSegment()
        first.reference_start = 10
        first.query_name = 'read1'
        first.mapping_quality = 30
        first.query_sequence = "AAAAATAAAATAAAAT"
        first.query_qualities = [30] * 16
        first.cigarstring = '16M'

        second = pysam.AlignedSegment()
        second.reference_start = 12
        second.query_name = 'read2'
        second.mapping_quality = 20
        second.query_sequence = "AAAGAAGAAAAG"
        # One lower-quality base at index 3, everything else at 33.
        second.query_qualities = [20 if idx == 3 else 33 for idx in range(12)]
        second.cigarstring = '5M2D7M'

        third = pysam.AlignedSegment()
        third.mapping_quality = 0
        third.query_name = 'read3'

        self.alns = [first, second, third]
示例#5
0
def simulate_read_pair(sequence, start, length=150, isize=400, flip=False):
    """Build a ``ReadPair`` of two reads cut from ``sequence``.

    :param sequence: sequence the reads are sliced from
    :param start: index in ``sequence`` where the first read begins
    :param length: number of bases taken for each read
    :param isize: distance from ``start`` to the end of the second read
    :param flip: when true, the two reads swap places in the pair
    :returns: ``ReadPair`` wrapping both reads as ``Alignment`` objects,
        built with the module-level ``read_stats``
    """
    fragment_end = start + isize

    leading = pysam.AlignedSegment()
    leading.query_sequence = sequence[start:start + length]

    # The second read is the reverse complement of the last ``length``
    # bases of the fragment.
    trailing = pysam.AlignedSegment()
    trailing.query_sequence = reverse_comp(
        sequence[fragment_end - length:fragment_end])

    first, second = (trailing, leading) if flip else (leading, trailing)
    return ReadPair(Alignment(first), Alignment(second), read_stats)
    def test_iterable_molecule_iter(self):
        """Feed a plain list of reads to ``MoleculeIterator`` and check grouping.

        Three reads at the same position with the same ``RX`` tag are built;
        two carry ``SM`` = ``CELL_1`` and one carries ``CELL_2``.  The test
        expects two molecules, of sizes 2 and 1, and expects ``write_tags``
        to set the ``TF`` tag to 2 on the first read of the first molecule.
        """
        from singlecellmultiomics.molecule import MoleculeIterator
        from singlecellmultiomics.fragment import Fragment

        def make_read(header, sample, sequence, cigar):
            # Same attribute order as the hand-written reads this replaces.
            read = pysam.AlignedSegment(header)
            read.set_tag('SM', sample)
            read.set_tag('RX', 'CAT')
            read.reference_name = 'chr1'
            read.reference_start = 100
            read.query_sequence = sequence
            read.cigarstring = cigar
            read.mapping_quality = 60
            return read

        try:
            with pysam.AlignmentFile('test.sam',
                                     'w',
                                     reference_names=['chr1', 'chr2'],
                                     reference_lengths=[1000, 1000]) as test_sam:
                reads = [
                    make_read(test_sam.header, 'CELL_1', 'ATCGGG', '6M'),
                    make_read(test_sam.header, 'CELL_1', 'ATCGG', '5M'),
                    make_read(test_sam.header, 'CELL_2', 'ATCGG', '5M'),
                ]
                molecules = list(MoleculeIterator(reads, yield_invalid=True))

                self.assertEqual(len(molecules), 2)
                self.assertEqual(max((len(m) for m in molecules)), 2)
                self.assertEqual(min((len(m) for m in molecules)), 1)

                # Test tags:
                a = molecules[0]
                a.write_tags()
                self.assertEqual(a[0][0].get_tag('TF'), 2)
        finally:
            # Remove the scratch file even when an assertion above failed.
            if os.path.exists('test.sam'):
                os.remove('test.sam')
示例#7
0
def initialise_alignment(query_name,
                         reference_id,
                         reference_start,
                         query_sequence,
                         cigarstring,
                         flag,
                         mapping_quality=60,
                         query_qualities=None,
                         tags=None,
                         header=None):
    """Build a `pysam.AlignedSegment` from individual field values.

    :param query_name: name of the query sequence
    :param reference_id: index to the reference name
    :param reference_start: 0-based leftmost reference coordinate
    :param query_sequence: read bases, including soft-clipped ones
    :param cigarstring: CIGAR string describing how the query aligns to the
        reference
    :param flag: bitwise SAM flag
    :param mapping_quality: optional mapping quality
    :param query_qualities: optional base qualities, including those of
        soft-clipped bases; left unset when ``None``
    :param tags: optional mapping of tag name to tag value; every entry is
        applied with ``set_tag``
    :param header: optional `pysam.AlignmentHeader`; when given, the segment
        is created with it so its ``reference_name`` attribute can be used

    :returns: the populated `pysam.AlignedSegment`
    """
    tag_map = {} if tags is None else tags
    segment = (pysam.AlignedSegment()
               if header is None else pysam.AlignedSegment(header))

    segment.query_name = query_name
    segment.reference_id = reference_id
    segment.reference_start = reference_start
    segment.query_sequence = query_sequence
    segment.cigarstring = cigarstring
    segment.flag = flag
    segment.mapping_quality = mapping_quality
    if query_qualities is not None:
        segment.query_qualities = query_qualities

    for name, value in tag_map.items():
        segment.set_tag(name, value)

    return segment
示例#8
0
    def package(self):
        """
        Convert ``self.rec_1`` and ``self.rec_2`` from ``pysam.AlignedSegment`` to ``str``

        The user may want to implement multiprocessing to decrease the amount of time to classify all
        reads in a SAM/BAM file. ``self.rec_1`` and ``self.rec_2``, like all ``pysam.AlignedSegment`` objects,
        are not pickleable and cannot be passed through a ``multiprocessing.Queue``. Instead of directly handling
        BAM/SAM strings, users can choose to create a ``Pair``, call ``package`` to convert the records to strings
        using the ``to_string()`` function from ``pysam`` and pass the ``Pair`` object through a ``Queue``.

        Nothing happens unless both records are ``pysam.AlignedSegment``
        instances, so calling ``package`` a second time is a no-op.
        """
        # isinstance avoids building two throw-away AlignedSegment objects
        # just to obtain their type.
        if isinstance(self.rec_1, pysam.AlignedSegment) and isinstance(
                self.rec_2, pysam.AlignedSegment):
            self.rec_1 = self.rec_1.to_string()
            self.rec_2 = self.rec_2.to_string()
示例#9
0
def build_read(query_name="read_28833_29006_6945",
               query_sequence="AGCTTAGCTA",
               flag=99,
               reference_id=0,
               reference_start=32,
               mapping_quality=20,
               cigar=None,
               next_reference_id=0,
               next_reference_start=199,
               template_length=167,
               query_qualities=None):
    """Build a test read and return it wrapped in a ``MicroMock``.

    :param query_name: read name
    :param query_sequence: read bases
    :param flag: bitwise SAM flag
    :param reference_id: index of the reference the read is aligned to
    :param reference_start: 0-based leftmost reference coordinate
    :param mapping_quality: mapping quality
    :param cigar: CIGAR as ``(operation, length)`` tuples; defaults to a
        single operation 0 covering the whole sequence
    :param next_reference_id: reference index of the mate
    :param next_reference_start: start position of the mate
    :param template_length: template length
    :param query_qualities: base qualities; defaults to 27 for every base
    :returns: ``MicroMock`` whose ``aligned_segment`` attribute is the new
        ``pysam.AlignedSegment``
    """
    #pylint: disable=no-member,too-many-arguments
    a = pysam.AlignedSegment()
    a.query_name = query_name
    a.query_sequence = query_sequence
    a.flag = flag
    a.reference_id = reference_id
    a.reference_start = reference_start
    a.mapping_quality = mapping_quality
    if cigar is None:
        a.cigar = ((0, len(query_sequence)), )
    else:
        a.cigar = cigar
    a.next_reference_id = next_reference_id
    a.next_reference_start = next_reference_start
    a.template_length = template_length
    # Caller-supplied qualities used to be dropped silently; apply them, and
    # fall back to the constant default only when none were given.
    if query_qualities is None:
        query_qualities = [27] * len(query_sequence)
    a.query_qualities = query_qualities
    return MicroMock(aligned_segment=a)
示例#10
0
def print_as_BAM(linked, header, path):
    """Write every group of linked introns to a BAM file, one record each.

    :param linked: iterable of intron groups; every intron is a 4-item
        ``(chrom, start, end, strand)`` record, where ``chrom`` is used
        directly as the reference id
    :param header: header handed to ``pysam.AlignmentFile``
    :param path: path of the BAM file to create

    All records of group ``n`` share the query name ``linked<n>``.  Each
    record names the next intron of its group as its mate; the last record
    points back to the first one and carries the negated template length.
    """
    with pysam.AlignmentFile(path, 'wb', header=header) as bam_out:
        for group_no, group in enumerate(linked):
            ordered = sort_by_pos(group)
            group_size = len(ordered)
            # Span from the start of the first intron to the end of the
            # last one; zero when the group has fewer than two introns.
            span = ordered[-1][2] - ordered[0][1] + 1 if group_size > 1 else 0
            final_index = group_size - 1

            # Every intron becomes its own BAM record.
            for index, (chrom, start, end, _strand) in enumerate(ordered):
                width = end - start + 1
                if index < final_index:
                    mate_start = ordered[index + 1][1]
                    record_tlen = span
                else:
                    mate_start = ordered[0][1]
                    record_tlen = -span

                record = pysam.AlignedSegment()
                record.query_name = 'linked' + str(group_no)
                record.query_sequence = 'N' * width
                record.flag = 0
                record.reference_id = chrom
                record.reference_start = start
                record.mapping_quality = 60  # 60 = uniquely mapped for HISAT2
                record.cigartuples = [(0, width)]
                record.next_reference_id = chrom
                record.next_reference_start = mate_start
                record.template_length = record_tlen
                record.query_qualities = pysam.qualitystring_to_array(
                    '/' * width)
                record.tags = [('XN', mate_start + 1), ('XI', group_size)]
                bam_out.write(record)
示例#11
0
def block_parser_handle_hanging(opts, aln, bam, g, blocks, block_ends,
                                insert_ranges, cached_dist, map_models,
                                block_idx):
    """Handle a read without a mate record by synthesising a placeholder mate.

    A fresh ``pysam.AlignedSegment`` is filled from the mate fields stored
    on ``aln`` and both reads are passed to ``block_parser_handle_pair``.
    When ``opts['use_mate_tags']`` is set, the mate's read length and mean
    quality come from the ``ZR`` and ``ZQ`` tags of ``aln``; otherwise the
    length of ``aln`` and a mean quality of 0 are used.
    """
    placeholder = pysam.AlignedSegment()
    placeholder.is_unmapped = aln.mate_is_unmapped
    # An unmapped mate is placed at the position of ``aln`` so that its
    # reference id never ends up as -1.
    if placeholder.is_unmapped:
        ref_id, ref_pos = aln.rname, aln.pos
    else:
        ref_id, ref_pos = aln.mrnm, aln.mpos
    placeholder.rname = ref_id
    placeholder.pos = ref_pos
    placeholder.mapq = 0

    if opts['use_mate_tags']:
        mate_rlen = aln.get_tag('ZR')
        mate_qmean = aln.get_tag('ZQ')
    else:
        # The exact values are irrelevant here because nothing is
        # conditioned on qmean/rlen in this mode.
        mate_rlen = aln.query_length
        mate_qmean = 0
    placeholder.query_sequence = 'A' * mate_rlen

    block_parser_handle_pair(opts, aln, placeholder, bam, g, blocks,
                             block_ends, insert_ranges, cached_dist,
                             map_models,
                             block_idx1=block_idx,
                             qmean2=mate_qmean)
 def get_chic_read(header,
                   qname,
                   contig='chr1',
                   start=100,
                   sequence='ATCGGG',
                   cigar=None,
                   umi='CAT',
                   sample='CELL_1',
                   is_reverse=False,
                   read1=True,
                   paired=False,
                   proper_pair=True):
     """Create a read tagged as ``scCHIC`` for use in tests.

     :param header: header the ``pysam.AlignedSegment`` is created with
     :param qname: query name
     :param contig: reference name
     :param start: reference start
     :param sequence: read bases
     :param cigar: CIGAR string; defaults to all of ``sequence`` matching
     :param umi: value stored in the ``RX`` tag
     :param sample: value stored in the ``SM`` tag
     :param is_reverse: whether the read is flagged as reverse strand
     :param read1: true for read 1, false for read 2
     :param paired: when true, the read is flagged as paired and
         ``proper_pair`` is applied
     :param proper_pair: proper-pair flag, only used when ``paired``
     :returns: the populated ``pysam.AlignedSegment``
     """
     segment = pysam.AlignedSegment(header)
     # SM holds the sample the read belongs to, RX holds the UMI.
     segment.set_tag('SM', sample)
     segment.set_tag('RX', umi)
     segment.set_tag('MX', 'scCHIC')
     # Molecule assignment is by default based on where read 1 maps.
     segment.reference_name = contig
     segment.reference_start = start
     segment.query_name = qname
     segment.query_sequence = sequence
     segment.is_reverse = is_reverse
     segment.cigarstring = cigar if cigar is not None else f'{len(sequence)}M'

     # Exactly one of the read1/read2 flags is set.
     segment.is_read1 = bool(read1)
     segment.is_read2 = not read1

     if paired:
         segment.is_paired = True
         segment.is_proper_pair = proper_pair
     return segment
示例#13
0
def bamFile(tmpdir_factory):
    """Create and index a small BAM file holding three reads.

    :param tmpdir_factory: factory used to create the directory the file is
        written to
    :returns: path object of the indexed ``test.bam``

    The reads are written in the order ``read3`` (flag 16, start 28),
    ``read2`` (flag 0, start 32) and ``read1`` (start 32, CIGAR containing
    clips, a deletion and an insertion).  No base qualities are set.
    """
    header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1000, 'SN': 'ref'}]}
    p = tmpdir_factory.mktemp('test').join('test.bam')
    # The context manager closes the file even if a write raises, so the
    # index is never built from a half-written handle.
    with pysam.AlignmentFile(str(p), "wb", header=header) as outFile:
        a = pysam.AlignedSegment()
        a.query_name = "read3"
        a.query_sequence = "GGGGAAAAAT"
        a.reference_start = 28
        a.reference_id = 0
        a.mapping_quality = 20
        a.cigar = ((0, 10), )
        a.flag = 16
        outFile.write(a)

        # The same segment object is reused; only the fields that differ
        # from the previous read are overwritten.
        a.query_name = "read2"
        a.reference_start = 32
        a.query_sequence = "AAAAATTTTT"
        a.flag = 0
        outFile.write(a)

        a.query_name = "read1"
        a.query_sequence = "TTAAAAACCCCCGGC"
        a.cigar = ((5, 5), (4, 2), (0, 10), (2, 2), (0, 1), (1, 1), (0, 1))
        outFile.write(a)
    pysam.index(str(p))
    return p
示例#14
0
    def test_simple(self):
        """Crop a 100 bp all-match alignment to a window inside it.

        Cropping to 150..159 is expected to leave 10 matched bases with the
        rest soft-clipped; cropping to 150..158 must give something else.
        """
        source = pysam.AlignedSegment()
        source.reference_start = 100
        source.cigar = [(MATCH, 100)]
        source.seq = 'A' * 100

        expected = pysam.AlignedSegment()
        expected.reference_start = 150
        expected.cigar = [(SOFT_CLIP, 50), (MATCH, 10), (SOFT_CLIP, 40)]
        expected.seq = 'A' * 100

        self.assertEqual(sam.crop_al_to_ref_int(source, 150, 159), expected)
        self.assertNotEqual(sam.crop_al_to_ref_int(source, 150, 158), expected)
示例#15
0
 def test_starts_just_before_deletion(self):
     """Crop so that the window begins one base before a deletion.

     Starting at 149 is expected to keep a single matched base ahead of
     the deletion; starting at 150 must give a different result.
     """
     source = pysam.AlignedSegment()
     source.reference_start = 100
     source.cigar = [(MATCH, 50), (DEL, 10), (MATCH, 50)]
     source.seq = 'A' * 100

     expected = pysam.AlignedSegment()
     expected.reference_start = 149
     expected.cigar = [(SOFT_CLIP, 49), (MATCH, 1), (DEL, 10), (MATCH, 50)]
     expected.seq = 'A' * 100

     self.assertEqual(sam.crop_al_to_ref_int(source, 149, 210), expected)
     self.assertNotEqual(sam.crop_al_to_ref_int(source, 150, 210), expected)
示例#16
0
def create_Bam(alignments, outbam):
    """Write ``alignments`` to ``outbam``, then sort and index the result.

    :param alignments: iterable of records exposing ``flag``, ``Rname``,
        ``seq`` and ``basequal`` and, for mapped records, ``contig``,
        ``pos``, ``cigar``, ``mapq``, ``MDtag`` and ``cstag``
    :param outbam: path of the unsorted BAM file to write

    The header is built from the sequences in ``chr1.fa``.  Records are
    written ordered by ``(contig, pos)``.  A record whose flag equals 4 is
    written as unmapped with only name, sequence and qualities; a flag of
    16 marks the read as reverse strand.  The sorted copy is written to
    ``test.srt.bam`` and indexed.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}
    alignmentsSorted = sorted(alignments, key=attrgetter('contig', 'pos'))

    # The context manager guarantees the BAM is flushed and closed before
    # it is sorted, and that the handle is released if a write fails.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for subreads in alignmentsSorted:
            unmapped = subreads.flag == 4
            s = pysam.AlignedSegment(fh.header)
            s.is_unmapped = unmapped
            s.query_name = subreads.Rname
            s.query_sequence = subreads.seq
            if not unmapped:
                s.reference_name = subreads.contig
                s.reference_start = subreads.pos
                s.cigarstring = subreads.cigar
                s.is_reverse = subreads.flag == 16
                s.mapping_quality = subreads.mapq
                s.set_tags([("MD", subreads.MDtag, "Z"),
                            ("cs", subreads.cstag, "Z")])
            # Qualities are decoded from Phred+33 characters and assigned
            # after the sequence.
            s.query_qualities = np.array(
                [ord(x) - 33 for x in subreads.basequal])
            fh.write(s)

    # Sort the file that was just written; the input used to be the
    # hard-coded "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", str(outbam))
    pysam.index("test.srt.bam")
示例#17
0
    def test_process_reads_read_obs_paired_end_overlap_1bad_base_qual(self):
        """Add a second ``read1`` alignment whose first base has quality 5.

        The extra alignment starts at position 20 and is appended to
        ``self.alns`` before ``process_reads`` is called with variant
        positions 15, 20, 25 and 35.
        """
        mate = pysam.AlignedSegment()
        mate.reference_start = 20
        mate.query_name = 'read1'
        mate.mapping_quality = 20
        mate.query_sequence = "AAAAATAAAACAAAAC"
        # Only the first base falls below the other qualities of 30.
        mate.query_qualities = [5] + [30] * 15
        mate.cigarstring = '16M'
        self.alns.append(mate)

        observed = preprocess.process_reads(self.alns, [15, 20, 25, 35], 20,
                                            10)
        expected = {
            'read1': {15: 'T', 20: 'T', 25: 'T', 35: 'C'},
            'read2': {15: 'G', 20: 'G', 25: 'G'},
        }
        self.assertEqual(observed, expected)
示例#18
0
def test_pysam():
    """Write a one-read BAM with pysam and check that it can be sorted."""
    import pysam

    # Build a BAM file from scratch, following
    # https://pysam.readthedocs.io/en/latest/usage.html#creating-bam-cram-sam-files-from-scratch
    references = [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}]
    header = {'HD': {'VN': '1.0'}, 'SQ': references}

    file_name = "out.bam"
    with pysam.AlignmentFile(file_name, "wb", header=header) as bam_out:
        read = pysam.AlignedSegment()
        read.query_name = "read_28833_29006_6945"
        read.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
        read.flag = 99
        read.reference_id = 0
        read.reference_start = 32
        read.mapping_quality = 20
        read.cigar = ((0, 10), (2, 1), (0, 25))
        read.next_reference_id = 0
        read.next_reference_start = 199
        read.template_length = 167
        read.query_qualities = pysam.qualitystring_to_array(
            "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
        read.tags = (("NM", 1), ("RG", "L1"))
        bam_out.write(read)

    # The output file must exist.
    assert os.path.isfile(file_name)

    # Sorting goes through samtools and fails when the file is not a valid
    # BAM file.
    pysam.sort("-o", "sorted.bam", file_name)
    assert os.path.isfile("sorted.bam")
示例#19
0
def make_unmapped_mate(mate, template, add_tag=True):
    """Create an unmapped mate record for ``mate`` from ``template``.

    :param mate: mapped read; supplies the reference id and start (used for
        both the record's own position and its mate position), the
        secondary flag and the ``YT`` tag
    :param template: read supplying query name, flag, sequence and base
        qualities
    :param add_tag: when true, the tag ``ZT`` = ``"MP"`` is added to mark
        the record as a mock pair
    :return: new ``pysam.AlignedSegment`` flagged as paired, not properly
        paired and unmapped, with mapping quality 0.  CIGAR and template
        length are left unset.
    :raises KeyError: if ``mate`` has no ``YT`` tag
    """
    a = pysam.AlignedSegment()
    a.query_name = template.query_name
    a.flag = template.flag
    # The unmapped record is placed at the position of its mapped mate.
    a.reference_id = mate.reference_id
    a.reference_start = mate.reference_start
    a.mapping_quality = 0
    a.next_reference_id = mate.reference_id
    a.next_reference_start = mate.reference_start
    a.query_sequence = template.query_sequence
    a.query_qualities = template.query_qualities
    # Flag bits are adjusted after the bulk copy of ``template.flag`` above.
    a.is_secondary = mate.is_secondary
    a.is_paired = True
    a.is_proper_pair = False
    a.is_unmapped = True
    # ``set_tag`` replaces the legacy ``setTag`` alias and matches the
    # ``get_tag`` call used to read the value.
    a.set_tag('YT', mate.get_tag('YT'))
    # This tag indicates the segment is a "mock pair"
    if add_tag:
        a.set_tag('ZT', "MP")
    return a
示例#20
0
def convert_to_AlignedSegment(header, sequence, quality, barcode_sequence,
                              umi_sequence):
    """Build an unaligned ``pysam.AlignedSegment`` from raw read fields.

    The barcode and UMI are stored as tags, together with a fixed read
    group:
        Tag  Value
        "B0" barcode_sequence
        "B3" umi_sequence
        "RG" "0"

    :param header: string with the header information; only the part before
        the first whitespace is used as the query name
    :param sequence: string with the DNA/RNA sequence
    :param quality: string with the base calling quality values
    :param barcode_sequence: string with the barcode sequence
    :param umi_sequence: string with the unique molecular identifier sequence
    :returns: the populated segment with the unmapped flag set
    """
    record = pysam.AlignedSegment()

    # The query name must not contain whitespace.
    record.query_name = header.split()[0]
    record.query_sequence = sequence
    record.query_qualities = pysam.qualitystring_to_array(quality)

    # Mark the record as unmapped without touching other flag bits.
    record.flag = record.flag | pysam.FUNMAP

    tag_values = (
        ('B0', barcode_sequence),
        ('B3', umi_sequence),
        ('RG', '0'),
    )
    for tag_name, tag_value in tag_values:
        record.set_tag(tag_name, tag_value)

    return record
示例#21
0
def SPARKcreateBam(DataFrame, outbam):
    """Write the rows of ``DataFrame`` to ``outbam``, then sort and index it.

    :param DataFrame: object providing ``count()`` and ``take(n)``; each row
        exposes ``flag``, ``Rname``, ``seq`` and ``QUAL`` and, for mapped
        rows, ``contig``, ``pos``, ``cigar``, ``mapq``, ``MDtag`` and
        ``cstag``
    :param outbam: path of the unsorted BAM file to write

    The header is built from the sequences in ``chr1.fa``.  A row whose
    flag equals 4 is written as unmapped with only name, sequence and
    qualities; a flag of 16 marks the read as reverse strand.  The sorted
    copy is written to ``test.srt.bam`` and indexed.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}
    # ``count()`` is evaluated once; the rows are then iterated directly
    # instead of counting a second time and indexing.
    rows = DataFrame.take(DataFrame.count())

    # The context manager guarantees the BAM is flushed and closed before
    # it is sorted, and that the handle is released if a write fails.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for row in rows:
            unmapped = row.flag == 4
            s = pysam.AlignedSegment(fh.header)
            s.is_unmapped = unmapped
            s.query_name = row.Rname
            s.query_sequence = row.seq
            if not unmapped:
                s.reference_name = row.contig
                s.reference_start = row.pos
                s.cigarstring = row.cigar
                s.is_reverse = row.flag == 16
                s.mapping_quality = row.mapq
                s.set_tags([("MD", row.MDtag, "Z"), ("cs", row.cstag, "Z")])
            # Qualities are decoded from Phred+33 characters and assigned
            # after the sequence.
            s.query_qualities = np.array([ord(x) - 33 for x in row.QUAL])
            fh.write(s)

    # Sort the file that was just written; the input used to be the
    # hard-coded "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", str(outbam))
    pysam.index("test.srt.bam")
示例#22
0
def write_reads(sam_path, gene, alleles, reads, out_path):
    """Write re-positioned reads plus the reads of the CNV region to a BAM.

    :param sam_path: alignment file used as template and as the source of
        the CNV-region reads
    :param gene: object whose ``cnv_region`` is ``(chromosome, start, end)``
    :param alleles: mapping from reference name to a structure whose
        ``[0][0]`` entry is the offset added to read positions
    :param reads: mapping whose values are 2-item sequences; only the first
        item of each is written
    :param out_path: BAM file to create; it is indexed with ``samtools``
        afterwards
    """
    def _shifted_position(entry):
        # Order by offset-corrected start, ties broken by read name.
        lead = entry[0]
        offset = alleles[lead.reference_name][0][0]
        return (offset + lead.reference_start, lead.query_name)

    ordered = sorted(reads.values(), key=_shifted_position)

    with pysam.AlignmentFile(sam_path) as sam, \
            pysam.AlignmentFile(out_path, "wb", template=sam) as out:
        for source, _ in ordered:
            record = pysam.AlignedSegment()
            record.query_name = source.query_name.split("/")[0]
            record.query_sequence = source.query_sequence
            record.flag = source.flag
            # Every re-positioned read is placed on chr22.
            record.reference_id = sam.get_tid("chr22")
            record.reference_start = (
                source.reference_start + alleles[source.reference_name][0][0])
            record.mapping_quality = source.mapping_quality
            record.cigar = source.cigar
            record.next_reference_start = source.next_reference_start
            record.template_length = source.template_length
            record.query_qualities = source.query_qualities
            record.tags = source.tags
            out.write(record)

        # Append the reads of the CNV region, starting 500 bases upstream.
        cnv_chromosome, cnv_start, cnv_end = gene.cnv_region
        region = "chr{}:{}-{}".format(cnv_chromosome, cnv_start - 500,
                                      cnv_end + 1)
        for read in sam.fetch(region=region):
            out.write(read)

    cmd("samtools index {}".format(out_path))
示例#23
0
 def generate_read(self, read_length, query_name, cb, ub):
     """Create a perfectly matching read at a random genomic position.

     A chromosome is drawn at random from ``self.chromosome2length`` and a
     start position is drawn so that the read fits on it.  The read bases
     are copied from ``self.chromosome2sequence``.

     :param read_length: number of bases in the read
     :param query_name: name given to the read
     :param cb: value stored in the ``CB`` tag
     :param ub: value stored in the ``UB`` tag
     :returns: the populated ``pysam.AlignedSegment``
     """
     reference_id = np.random.randint(len(self.chromosome2length))
     contigs = list(self.chromosome2length.items())
     chromosome, chr_length = contigs[reference_id]
     contig_bases = self.chromosome2sequence[chromosome]
     start = np.random.randint(0, chr_length - read_length)

     # Straight mapping: every base matches the reference.
     read = pysam.AlignedSegment()
     read.query_name = query_name
     read.query_sequence = ''.join(contig_bases[start:start + read_length])
     # Flag value copied from the pysam example; its bits were not analysed.
     read.flag = 99
     read.reference_id = reference_id
     read.reference_start = start
     read.mapping_quality = 255
     read.cigar = ((0, read_length), )
     # The mate fields (next_reference_id / next_reference_start) are
     # deliberately left unset.
     read.template_length = read_length
     read.query_qualities = pysam.qualitystring_to_array("<" * read_length)
     read.tags = (
         ("NM", 1),
         ("RG", "L1"),
         ("NH", 1),
         # Normally the number of mutations relative to the reference
         # should be reflected here as well.
         ("AS", read_length - 2),
         ("CB", cb),
         ("UB", ub),
     )
     return read
示例#24
0
    def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
        """Write one alignment record and one summary line for a read.

        The record goes to ``map_fp`` and the tab-separated summary (read
        id, percent matched, aligned, matched, deleted and inserted
        lengths) goes to ``summ_fp``; both come from the enclosing scope.

        :param read_id: query name
        :param q_seq: full query sequence; only ``q_seq[q_st:q_en]`` is used
        :param chrm: reference name, resolved with ``map_fp.get_tid``
        :param strand: 1 for forward; anything else is written as reverse
            (flag 16) with the sequence reverse-complemented
        :param r_st: reference start
        :param q_st: first query index of the aligned part
        :param q_en: end query index (exclusive) of the aligned part
        :param cigar: sequence of ``(length, operation)`` pairs
        """
        aligned_seq = q_seq[q_st:q_en]
        is_forward = strand == 1

        record = pysam.AlignedSegment()
        record.query_name = read_id
        record.query_sequence = (aligned_seq
                                 if is_forward else mh.revcomp(aligned_seq))
        record.flag = 0 if is_forward else 16
        record.reference_id = map_fp.get_tid(chrm)
        record.reference_start = r_st
        # pysam expects (operation, length), the reverse of ``cigar``.
        record.cigartuples = [(op, op_l) for op_l, op in cigar]
        record.template_length = q_en - q_st
        map_fp.write(record)

        # Tally alignment statistics per CIGAR operation code.
        nalign = nmatch = ndel = nins = 0
        for op_len, op in cigar:
            # Clipped bases (codes 4 and 5) do not count as aligned.
            if op != 4 and op != 5:
                nalign += op_len
            if op == 0 or op == 7:
                nmatch += op_len
            elif op == 2 or op == 3:
                ndel += op_len
            elif op == 1:
                nins += op_len

        summary = '{}\t{:.2f}\t{}\t{}\t{}\t{}\n'.format(
            read_id, 100 * nmatch / float(nalign), nalign, nmatch, ndel, nins)
        summ_fp.write(summary)
        summ_fp.flush()
示例#25
0
 def test_ends_in_deletion(self):
     """Crop so that the window ends inside a deletion.

     Ending at 155 is expected to drop the deletion and soft-clip the rest
     of the read; ending at 160 must give a different result.
     """
     source = pysam.AlignedSegment()
     source.reference_start = 100
     source.cigar = [(MATCH, 50), (DEL, 10), (MATCH, 50)]
     source.seq = 'A' * 100

     expected = pysam.AlignedSegment()
     expected.reference_start = 100
     expected.cigar = [(MATCH, 50), (SOFT_CLIP, 50)]
     expected.seq = 'A' * 100

     self.assertEqual(sam.crop_al_to_ref_int(source, 100, 155), expected)
     self.assertNotEqual(sam.crop_al_to_ref_int(source, 100, 160), expected)
示例#26
0
def sam_to_bam(sam_file, bam_file, check_sq=False):
    """
    Convert a SAM file to a BAM file.

    Every record is copied field by field into a fresh
    ``pysam.AlignedSegment`` and written with the header of the input.

    :param sam_file: input SAM filename
    :param bam_file: output BAM filename
    :param check_sq: passed to ``pysam.AlignmentFile`` when opening the
        input
    """
    # Both handles are context-managed so they are closed even when a
    # record fails to convert; records are streamed one at a time instead
    # of loading the whole input into memory first.
    with pysam.AlignmentFile(sam_file, 'r', check_sq=check_sq) as in_f, \
            pysam.AlignmentFile(bam_file, 'wb', header=in_f.header) as out_f:
        for seg in in_f.fetch(until_eof=True):
            a = pysam.AlignedSegment()
            a.query_name = seg.query_name
            a.query_sequence = seg.query_sequence
            a.flag = seg.flag
            a.reference_id = seg.reference_id
            a.reference_start = seg.reference_start
            a.mapping_quality = seg.mapping_quality
            a.cigar = seg.cigar
            a.next_reference_id = seg.next_reference_id
            a.next_reference_start = seg.next_reference_start
            a.template_length = seg.template_length
            a.query_qualities = seg.query_qualities
            a.tags = seg.tags
            out_f.write(a)
示例#27
0
 def test_ends_just_before_insertion(self):
     """Crop so that the window ends on the last base before an insertion.

     Ending at 149 is expected to soft-clip the insertion and everything
     after it; ending at 150 must give a different result.
     """
     source = pysam.AlignedSegment()
     source.reference_start = 100
     source.cigar = [(MATCH, 50), (INS, 10), (MATCH, 50)]
     source.seq = 'A' * 110

     expected = pysam.AlignedSegment()
     expected.reference_start = 100
     expected.cigar = [(MATCH, 50), (SOFT_CLIP, 60)]
     expected.seq = 'A' * 110

     self.assertEqual(sam.crop_al_to_ref_int(source, 100, 149), expected)
     self.assertNotEqual(sam.crop_al_to_ref_int(source, 100, 150), expected)
示例#28
0
 def make_unaligned(read):
     """Turn a FASTQ-style read into an unmapped ``pysam.AlignedSegment``.

     :param read: object exposing ``name``, ``seq`` and ``qual``
     :returns: segment flagged as unmapped, carrying the read's name and
         sequence and the qualities decoded with ``fastq.decode_sanger``
     """
     segment = pysam.AlignedSegment()
     segment.query_name = read.name
     segment.is_unmapped = True
     # Qualities are assigned after the sequence.
     segment.query_sequence = read.seq
     decoded_quals = fastq.decode_sanger(read.qual)
     segment.query_qualities = decoded_quals
     return segment
示例#29
0
def test_goodFiles(tmpdir, bamFile):
    """Check ``getStartsInFile`` on a one-read BAM and on the ``bamFile`` fixture.

    A BAM holding a single forward read at 0-based start 32 is written and
    indexed; its first reported record is expected to span 33..42 on ``+``
    of ``ref``.  The ``bamFile`` fixture is then read with ``maxGaps=10``
    and with the default settings and compared with the expected starts,
    strands and ends.
    """
    d = tmpdir.mkdir('dir')
    p = d.join('test.bam')
    header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1000, 'SN': 'ref'}]}
    # The context manager closes the file before indexing, also when the
    # write raises.  No base qualities are set on the read.
    with pysam.AlignmentFile(str(p), "wb", header=header) as outFile:
        a = pysam.AlignedSegment()
        a.query_name = "read1"
        a.query_sequence = "AAAAATTTTT"
        a.reference_id = 0
        a.reference_start = 32
        a.mapping_quality = 20
        a.cigar = ((0, 10), )
        outFile.write(a)
    pysam.index(str(p))

    out = next(getstartends.getStartsInFile(str(p)))
    assert out['start'] == 33
    assert out['end'] == 42
    assert out['strand'] == '+'
    assert out['ref'] == 'ref'

    # NOTE(review): zip stops at the shortest input, so these loops do not
    # detect a generator that yields fewer (or more) records than listed.
    for read, start, strand, end in zip(
            getstartends.getStartsInFile(str(bamFile), maxGaps=10),
        [29, 33, 33], ['-', '+', '+'], [38, 42, 46]):
        assert read['start'] == start
        assert read['strand'] == strand
        assert read['end'] == end
    for read, start, strand, end in zip(
            getstartends.getStartsInFile(str(bamFile)), [29, 33], ['-', '+'],
        [38, 42]):
        assert read['start'] == start
        assert read['strand'] == strand
        assert read['end'] == end
示例#30
0
 def toAlignedSegment(cls, read, targetIds):
     """Convert ``read`` into a ``pysam.AlignedSegment``.

     :param read: source read; its fragment name, aligned sequence and
         quality, alignment position, mapping quality, mate position and
         fragment length are copied into the segment
     :param targetIds: mapping from reference name to reference id
     :returns: the populated segment; FLAG, CIGAR and tags come from
         ``cls.toSamFlag``, ``cls.toCigar`` and ``cls.toTags``
     """
     own_position = read.alignment.position
     mate_position = read.nextMatePosition

     segment = pysam.AlignedSegment()
     # QNAME and SEQ are encoded with the class-level encoding.
     segment.query_name = read.fragmentName.encode(cls._encoding)
     segment.query_sequence = read.alignedSequence.encode(cls._encoding)
     # FLAG
     segment.flag = cls.toSamFlag(read)
     # RNAME / POS
     segment.reference_id = targetIds[own_position.referenceName]
     segment.reference_start = int(own_position.position)
     # MAPQ
     segment.mapping_quality = read.alignment.mappingQuality
     # CIGAR
     segment.cigar = cls.toCigar(read)
     # RNEXT / PNEXT
     segment.next_reference_id = targetIds[mate_position.referenceName]
     segment.next_reference_start = int(mate_position.position)
     # TLEN
     segment.template_length = read.fragmentLength
     # QUAL, assigned after SEQ
     segment.query_qualities = read.alignedQuality
     segment.tags = cls.toTags(read)
     return segment