예제 #1
0
파일: test_mutate.py 프로젝트: rtcz/varlock
 def test_mutate_02(self):
     """
     EOF VAC case: mutate with the ``input_02`` VAC fixture, then check
     the mutator statistics, the mutated alignments and the produced DIFF
     against the ``desired_02`` fixtures.
     """
     res = self.RESOURCE_PATH
     vac_path = res + 'input_02.vac'
     out_bam_path = res + 'output_02.bam'
     out_sam_path = res + 'output_02.sam'
     out_diff_path = res + 'output_02.diff.txt'

     # build the binary VAC from its text form before mutating
     Vac.text2vac(res + 'input_02.vac.txt', vac_path)
     bdiff_file = self._mut.mutate(
         vac_filename=vac_path,
         mut_bam_filename=out_bam_path,
         secret=self.SECRET,
         mut_p=0,
         rng=self._rnd,
     )

     # counters collected by the mutator during the run
     stat = self._mut.stat
     self.assertEqual(21, stat(BamMutator.STAT_ALIGNMENT_COUNT))
     self.assertEqual(13, stat(BamMutator.STAT_COVERING_COUNT))
     self.assertEqual(7, stat(BamMutator.STAT_VAC_COUNT))
     self.assertEqual(6, stat(BamMutator.STAT_MUT_COUNT))
     self.assertEqual(4, stat(BamMutator.STAT_DIFF_COUNT))
     self.assertEqual(6, stat(BamMutator.STAT_ALIGNMENT_MUT_COUNT))

     # mutated BAM, converted to SAM, must equal the desired SAM
     cmn.bam2sam(out_bam_path, out_sam_path)
     sam_matches = filecmp.cmp(res + 'desired_02.sam', out_sam_path)
     self.assertTrue(sam_matches)

     # DIFF, dumped as text, must equal the desired DIFF text
     BdiffIO.to_text_file(bdiff_file, out_diff_path)
     diff_matches = filecmp.cmp(res + 'desired_02.diff.txt', out_diff_path)
     self.assertTrue(diff_matches)
예제 #2
0
    def test1_encrypt(self):
        """
        Prepare the ``encrypt/`` fixtures (indexed BAM and binary VAC),
        run the locker's ``encrypt`` with the admin key pair, then index
        the mutated BAM and dump it as SAM.
        """
        enc_dir = self.RESOURCE_PATH + 'encrypt/'
        in_bam = enc_dir + 'input.bam'
        in_vac = enc_dir + 'input.vac'
        out_bam = enc_dir + 'output.mut.bam'

        # build the binary inputs from their text counterparts
        cmn.sam2bam(enc_dir + 'input.sam', in_bam)
        pysam.index(in_bam)
        Vac.text2vac(enc_dir + 'input.vac.txt', in_vac)

        # the key files are only needed while the keys are being imported
        with open(self.RESOURCE_PATH + 'admin', 'r') as key_file, \
                open(self.RESOURCE_PATH + 'admin.pub', 'r') as pub_key_file:
            rsa_key = RSA.importKey(key_file.read(), passphrase=self.KEY_PASS)
            rsa_pub_key = RSA.importKey(pub_key_file.read())

        # creates DIFF with secret
        self.locker.encrypt(
            rsa_sign_key=rsa_key,
            rsa_enc_key=rsa_pub_key,
            bam_filename=in_bam,
            vac_filename=in_vac,
            out_bam_filename=out_bam,
            out_enc_diff_filename=enc_dir + 'output.diff.enc',
            mut_p=0,
        )

        pysam.index(out_bam)
        cmn.bam2sam(out_bam, enc_dir + 'output.mut.sam')
예제 #3
0
    def create_vac(self, bam_filename: str, vcf_filename: str,
                   out_vac_filename: str, ref_fasta_filename: str,
                   skip_indels: bool):
        """
        Create a VAC file from a VCF file.

        BAM and VCF should use same reference genome.
        VCF must contain INFO column with sub-fields AC and AN.
        :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
        :param vcf_filename: filename of the input VCF file
        :param out_vac_filename: filename of the output VAC file
        :param ref_fasta_filename: filename of the reference FASTA file;
        when None, no reference FASTA is loaded and None is handed to the conversion
        :param skip_indels: whether to skip indels and keep only SNPs
        """
        # TODO use fasta index / vcf header instead of BAM header

        # load the reference FASTA
        ref_fasta = None
        if ref_fasta_filename is not None:
            if self._verbose:
                print('--- Loading Reference Fasta ---')
            ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

        try:
            # open all files and create the VAC file
            if self._verbose:
                print('--- Processing VCF %s ---' % vcf_filename)
            with pysam.VariantFile(vcf_filename) as vcf_file, \
                    open_bam(bam_filename, 'rb') as sam_file, \
                    open(out_vac_filename, 'wb') as out_vac_file:
                vac = Vac(FastaIndex.from_bam(sam_file), self._verbose)
                vac.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
        finally:
            # release the FASTA handle even when the conversion fails
            if ref_fasta is not None:
                ref_fasta.close()
예제 #4
0
def vac2df(filename: str) -> pd.DataFrame:
    """
    Dump a VAC file to text and load that text as a DataFrame.

    The text file is written next to the input, using the input's name with
    its extension replaced by ``.txt``.
    :param filename: filename of the VAC file
    :return: result of reading the tab-separated text dump with the first
    two lines skipped, column names ``ref_id`` and ``counts``, and the first
    column used as the index
    """
    stem, _ext = os.path.splitext(filename)
    text_filename = stem + '.txt'
    Vac.vac2text(filename, text_filename)

    read_options = dict(
        sep='\t',
        skiprows=2,
        index_col=0,
        names=['ref_id', 'counts'],
    )
    return pd.read_csv(text_filename, **read_options)
예제 #5
0
    def test_mask(self):
        """
        EOF BAM case: mutate with the ``input`` VAC fixture and compare the
        mutated alignments and the produced DIFF with the desired fixtures.
        """
        res = self.RESOURCE_PATH
        vac_path = res + 'input.vac'
        out_bam_path = res + 'output.bam'
        out_sam_path = res + 'output.sam'
        out_diff_path = res + 'output.diff.txt'

        # build the binary VAC from its text form before mutating
        Vac.text2vac(res + 'input.vac.txt', vac_path)
        bdiff_file = self._mut.mutate(
            vac_filename=vac_path,
            mut_bam_filename=out_bam_path,
            secret=self.SECRET,
            mut_p=0,
            rng=self._rng,
        )

        # produce the text forms of both outputs before any comparison
        cmn.bam2sam(out_bam_path, out_sam_path)
        BdiffIO.to_text_file(bdiff_file, out_diff_path)

        sam_matches = filecmp.cmp(res + 'desired.sam', out_sam_path)
        self.assertTrue(sam_matches)

        diff_matches = filecmp.cmp(res + 'desired.diff.txt', out_diff_path)
        self.assertTrue(diff_matches)
예제 #6
0
 def test_io(self):
     """
     Read back the in-memory VAC file: header first, then the three SNV
     records, then the three INDEL records, and finally expect EOFError
     when one more record is requested.
     """
     vac_file = self.__build_vac_file()
     read_snv = Vac.read_snv_record
     read_indel = Vac.read_indel_record

     # header
     self.assertTupleEqual((3, 3), Vac.read_header(vac_file))

     # SNV records
     self.assertTupleEqual((2000000000, 0, [3, 2, 1, 0]), read_snv(vac_file))
     self.assertTupleEqual((2000000002, 1, [0, 1, 2, 3]), read_snv(vac_file))
     self.assertTupleEqual((2000000004, 2, [1, 1, 1, 1]), read_snv(vac_file))

     # INDEL records
     self.assertTupleEqual(
         (2000000001, [10, 1], ['A', 'ATCG']),
         read_indel(vac_file))
     self.assertTupleEqual(
         (2000000003, [10, 1], ['AT', 'ATCGT']),
         read_indel(vac_file))
     self.assertTupleEqual(
         (2000000005, [10, 10, 0], ['AAAA', 'ATCG', 'A']),
         read_indel(vac_file))

     # nothing is left to read
     with self.assertRaises(EOFError):
         read_snv(vac_file)
예제 #7
0
파일: variant.py 프로젝트: rtcz/varlock
    def _read_snv(self) -> po.VariantOccurrence:
        """
        Read the next SNV record from the SNV file handle.

        :return: the record as a VariantOccurrence of type SNV, or None when
        the remaining SNV count is exhausted (the SNV file handle is closed
        in that case)
        """
        if self._snv_count <= 0:
            # no SNV records left
            self._snv_file.close()
            return None

        self._snv_count -= 1
        index, ref_id, freqs = Vac.read_snv_record(self._snv_file)
        ref_name, ref_pos = self._fai.index2pos(index)
        position = po.GenomicPosition(index, ref_name, ref_pos)

        # ref_id selects the reference allele among BASES
        return po.VariantOccurrence(
            position=position,
            vtype=po.VariantType.SNV,
            freqs=freqs,
            alleles=BASES,
            ref_allele=BASES[ref_id],
        )
예제 #8
0
파일: variant.py 프로젝트: rtcz/varlock
    def _read_indel(self) -> po.VariantOccurrence:
        """
        Read the next INDEL record from the INDEL file handle.

        :return: the record as a VariantOccurrence of type INDEL, or None
        when the remaining INDEL count is exhausted (the INDEL file handle
        is closed in that case)
        """
        if self._indel_count <= 0:
            # no INDEL records left
            self._indel_file.close()
            return None

        self._indel_count -= 1
        index, counts, seqs = Vac.read_indel_record(self._indel_file)
        ref_name, ref_pos = self._fai.index2pos(index)
        position = po.GenomicPosition(index, ref_name, ref_pos)

        # the first sequence of the record is used as the reference allele
        return po.VariantOccurrence(
            position=position,
            vtype=po.VariantType.INDEL,
            freqs=counts,
            alleles=seqs,
            ref_allele=seqs[0],
        )
예제 #9
0
파일: variant.py 프로젝트: rtcz/varlock
    def __init__(self, vac_filename: str, fai: FastaIndex):
        """
        Open a VAC file through two independent handles, one positioned at
        the first SNV record and one at the first INDEL record, and read the
        first variant of each kind.

        :param vac_filename: filename of the VAC file to read
        :param fai: FASTA index kept on the instance for the record readers
        """
        self._fai = fai
        self._counter = 0

        # read header
        with open(vac_filename, 'rb') as header_file:
            snv_count, indel_count = Vac.read_header(header_file)
        self._snv_count = snv_count
        self._indel_count = indel_count

        # SNV records start right after the header,
        # INDEL records right after the fixed-size SNV records
        snv_offset = Vac.HEADER_SIZE
        indel_offset = snv_offset + snv_count * Vac.SNV_RECORD_SIZE

        self._snv_file = open(vac_filename, 'rb')
        self._snv_file.seek(snv_offset)

        self._indel_file = open(vac_filename, 'rb')
        self._indel_file.seek(indel_offset)

        # init iteration
        self._snv_variant = self._read_snv()
        self._indel_variant = self._read_indel()
예제 #10
0
    def test_is_indel(self):
        """Check Vac.is_indel against representative allele lists."""
        is_indel = Vac.is_indel

        # only single-base alleles
        self.assertFalse(is_indel(['A', 'T']))
        self.assertFalse(is_indel(['A', 'T', 'G', 'C']))

        # at least one multi-base allele
        self.assertTrue(is_indel(['A', 'TT', 'G', 'C']))
        self.assertTrue(is_indel(['A', 'T', 'GG', 'CC']))
        self.assertTrue(is_indel(['AA', 'T', 'G', 'C']))
        self.assertTrue(is_indel(['AA', 'TT', 'GG', 'CC']))

        # '.' allele
        self.assertFalse(is_indel(['AA', '.']))

        # '*' allele at any position
        self.assertFalse(is_indel(['AA', '*']))
        self.assertFalse(is_indel(['AA', '*', 'T']))
        self.assertFalse(is_indel(['AA', 'T', '*']))
        self.assertFalse(is_indel(['A', '*', 'TT']))
        self.assertFalse(is_indel(['A', 'TT', '*']))

        # alleles containing 'N'
        self.assertFalse(is_indel(['N', 'AA']))
        self.assertFalse(is_indel(['AA', 'N']))
        self.assertFalse(is_indel(['AA', 'T', 'N']))
        self.assertFalse(is_indel(['AA', 'N', 'T']))
        self.assertFalse(is_indel(['A', 'TT', 'N']))
        self.assertFalse(is_indel(['A', 'N', 'TT']))
        self.assertFalse(is_indel(['A', 'TN', 'G']))
        self.assertFalse(is_indel(['A', 'G', 'TN']))

        # angle-bracketed alleles
        self.assertFalse(is_indel(['AA', '<CN1>', '<CN2>']))
        self.assertFalse(is_indel(['AA', '<INS:ME:ALU>']))
예제 #11
0
 def test_parse_an(self):
     """Vac.parse_an returns the AN value of the INFO sub-fields as an int."""
     info_fields = ['AC=5,3', 'AN=5008', 'NS=2504']
     allele_number = Vac.parse_an(info_fields)
     self.assertEqual(5008, allele_number)
예제 #12
0
 def test_parse_ac(self):
     """Vac.parse_ac returns the AC values of the INFO sub-fields as a tuple of ints."""
     info_fields = ['AC=5,3', 'AN=5008', 'NS=2504']
     allele_counts = Vac.parse_ac(info_fields)
     self.assertTupleEqual((5, 3), allele_counts)
예제 #13
0
 def __build_vac_file(cls):
     """
     Build an in-memory VAC file holding a header, three SNV records and
     three INDEL records.

     :return: io.BytesIO rewound to the beginning
     """
     # (index, ref_id, counts)
     snv_records = (
         (2000000000, 0, (3, 2, 1, 0)),
         (2000000002, 1, (0, 1, 2, 3)),
         (2000000004, 2, (1, 1, 1, 1)),
     )
     # (index, ((count, sequence), ...))
     indel_records = (
         (2000000001, ((10, 'A'), (1, 'ATCG'))),
         (2000000003, ((10, 'AT'), (1, 'ATCGT'))),
         (2000000005, ((10, 'AAAA'), (10, 'ATCG'), (0, 'A'))),
     )

     buffer = io.BytesIO()
     Vac.write_header(buffer, len(snv_records), len(indel_records))
     # all SNV records are written before the INDEL records
     for index, ref_id, counts in snv_records:
         Vac._write_snv_record(buffer, index, ref_id, counts)
     for index, allele_counts in indel_records:
         Vac._write_indel_record(buffer, index, allele_counts)

     buffer.seek(0)
     return buffer
예제 #14
0
 def vac(cls):
     """
     Build a Vac from the FASTA index derived from the ``input.sam``
     resource.
     """
     sam_path = cls.RESOURCE_PATH + 'input.sam'
     with open_bam(sam_path, 'rb') as sam_file:
         fai = FastaIndex.from_bam(sam_file)
         return Vac(fai)