Пример #1
0
    def test_empty_input_succeed(self):
        ''' refine_assembly should succeed when the reference FASTA is an
            empty file and the reads come from the 'empty.bam' fixture,
            leaving behind a zero-byte output FASTA.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # make the input fasta empty
        inFasta = util.file.mkstempfname('.input.fasta')
        util.file.touch(inFasta)
        novoalign.index_fasta(inFasta)

        inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')

        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [
            inFasta, inBam, outFasta, "--chr_names", 'G5012.3',
            "--min_coverage", '3', "--novo_params",
            "-r Random -l 30 -g 40 -x 20 -t 502 -c {}".format(_CPUS)
        ]
        args = assembly.parser_refine_assembly(
            argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        # the expected output is an empty fasta file
        self.assertTrue(os.path.isfile(outFasta))
        # assertEqual reports the actual size when the check fails
        self.assertEqual(os.path.getsize(outFasta), 0)
Пример #2
0
    def test_ref_assisted_assembly(self):
        ''' Run refine_assembly on a copy of the 'ebov-makona.fasta'
            reference with the 'G5012.3.testreads.bam' reads and check
            that the refined sequence is long and mostly unambiguous.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs: work on a temp copy so the index is built next to
        # the copy rather than next to the checked-in test input
        orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
        refGenome = util.file.mkstempfname('.ref.fasta')
        shutil.copyfile(orig_ref, refGenome)
        novoalign.index_fasta(refGenome)
        inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [refGenome, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502"]
        args = assembly.parser_refine_assembly().parse_args(args)
        args.func_main(args)
        self.assertTrue(os.path.isfile(outFasta))
        # assertGreater reports the actual size when the check fails
        self.assertGreater(os.path.getsize(outFasta), 1000)

        # check assembly quality: exactly one record, longer than 17000
        # bases, with more than 95% unambiguous bases
        with open(outFasta, 'rt') as inf:
            seq = Bio.SeqIO.read(inf, 'fasta')
            self.assertGreater(len(seq), 17000)
            self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95)
Пример #3
0
    def test_empty_input_bam_assembly(self):
        ''' refine_assembly should succeed when given a real reference
            but the 'empty.bam' reads fixture, producing a zero-byte
            output FASTA.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs: index a temp copy of the reference
        orig_ref = os.path.join(util.file.get_test_input_path(),
                                'ebov-makona.fasta')
        inFasta = util.file.mkstempfname('.ref.fasta')
        shutil.copyfile(orig_ref, inFasta)
        novoalign.index_fasta(inFasta)

        inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')

        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [
            inFasta, inBam, outFasta, "--chr_names", 'G5012.3',
            "--min_coverage", '3', "--novo_params",
            "-r Random -l 30 -g 40 -x 20 -t 502"
        ]
        args = assembly.parser_refine_assembly(
            argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        # the expected output is an empty fasta file
        self.assertTrue(os.path.isfile(outFasta))
        # assertEqual reports the actual size when the check fails
        self.assertEqual(os.path.getsize(outFasta), 0)
Пример #4
0
    def test_ref_assisted_assembly(self):
        ''' Run refine_assembly on a copy of the 'ebov-makona.fasta'
            reference with the 'G5012.3.testreads.bam' reads and check
            that the refined sequence is long and mostly unambiguous.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs: work on a temp copy so the index is built next to
        # the copy rather than next to the checked-in test input
        orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
        refGenome = util.file.mkstempfname('.ref.fasta')
        shutil.copyfile(orig_ref, refGenome)
        novoalign.index_fasta(refGenome)
        inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [refGenome, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502"]
        args = assembly.parser_refine_assembly().parse_args(args)
        args.func_main(args)
        self.assertTrue(os.path.isfile(outFasta))
        # assertGreater reports the actual size when the check fails
        self.assertGreater(os.path.getsize(outFasta), 1000)

        # check assembly quality: exactly one record, longer than 17000
        # bases, with more than 95% unambiguous bases
        with open(outFasta, 'rt') as inf:
            seq = Bio.SeqIO.read(inf, 'fasta')
            self.assertGreater(len(seq), 17000)
            self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95)
Пример #5
0
    def test_empty_input_fasta_assembly(self):
        ''' refine_assembly should succeed when the reference FASTA is an
            empty file even though the 'G5012.3.testreads.bam' reads are
            supplied, producing a zero-byte output FASTA.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # make the input fasta empty
        inFasta = util.file.mkstempfname('.input.fasta')
        util.file.touch(inFasta)
        novoalign.index_fasta(inFasta)

        inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')

        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [inFasta, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502 -c {}".format(_CPUS)]
        args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        # the expected output is an empty fasta file
        self.assertTrue(os.path.isfile(outFasta))
        # assertEqual reports the actual size when the check fails
        self.assertEqual(os.path.getsize(outFasta), 0)
Пример #6
0
    def test_empty_input_bam_assembly(self):
        ''' refine_assembly should succeed when given a real reference
            but the 'empty.bam' reads fixture, producing a zero-byte
            output FASTA.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs: index a temp copy of the reference
        orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
        inFasta = util.file.mkstempfname('.ref.fasta')
        shutil.copyfile(orig_ref, inFasta)
        novoalign.index_fasta(inFasta)

        inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')

        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly through its command-line parser
        args = [inFasta, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502"]
        args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        # the expected output is an empty fasta file
        self.assertTrue(os.path.isfile(outFasta))
        # assertEqual reports the actual size when the check fails
        self.assertEqual(os.path.getsize(outFasta), 0)
Пример #7
0
def refine_assembly(inFasta, inBam, outFasta,
        outVcf=None, outBam=None, novo_params='', min_coverage=2,
        chr_names=None, keep_all_reads=False, JVMmemory=None):
    ''' This is a refinement step where we take a crude assembly, align
        all reads back to it, and modify the assembly to the majority
        allele at each position based on read pileups.
        This step considers both SNPs as well as indels called by GATK
        and will correct the consensus based on GATK calls.
        Reads are aligned with Novoalign, then PCR duplicates are removed
        with Picard (in order to debias the allele counts in the pileups),
        and realigned with GATK's IndelRealigner (in order to call indels).
        Output FASTA file is indexed for Picard, Samtools, and Novoalign.

        inFasta:        assembly to refine; reads are aligned against it.
        inBam:          reads to align.
        outFasta:       destination of the refined assembly.
        outVcf:         if set, the GATK calls are copied here (along with
                        the '.tbi' index when the name ends in '.gz').
        outBam:         if set, the realigned BAM is copied here.
        novo_params:    whitespace-separated extra options for Novoalign.
        min_coverage:   forwarded to vcf_to_fasta as --min_coverage.
        chr_names:      optional list of names forwarded to vcf_to_fasta
                        as --name; None or empty means no --name option.
        keep_all_reads: when true, align with min_qual=0 and do not ask
                        Picard to remove duplicates.
        JVMmemory:      forwarded to the Java-based tool wrappers.

        Returns 0.
    '''
    # Normalise here rather than using a mutable list as the default
    # value, which would be shared between calls.
    if chr_names is None:
        chr_names = []

    # Get tools
    picard_index = tools.picard.CreateSequenceDictionaryTool()
    picard_mkdup = tools.picard.MarkDuplicatesTool()
    samtools = tools.samtools.SamtoolsTool()
    novoalign = tools.novoalign.NovoalignTool()
    gatk = tools.gatk.GATKTool()
    
    # Create deambiguated genome for GATK
    deambigFasta = util.file.mkstempfname('.deambig.fasta')
    deambig_fasta(inFasta, deambigFasta)
    picard_index.execute(deambigFasta, overwrite=True)
    samtools.faidx(deambigFasta, overwrite=True)
    
    # Novoalign reads to self
    novoBam = util.file.mkstempfname('.novoalign.bam')
    min_qual = 0 if keep_all_reads else 1
    novoalign.execute(inBam, inFasta, novoBam,
        options=novo_params.split(), min_qual=min_qual, JVMmemory=JVMmemory)
    rmdupBam = util.file.mkstempfname('.rmdup.bam')
    opts = ['CREATE_INDEX=true']
    if not keep_all_reads:
        opts.append('REMOVE_DUPLICATES=true')
    picard_mkdup.execute([novoBam], rmdupBam,
        picardOptions=opts, JVMmemory=JVMmemory)
    # Intermediate BAMs are deleted as soon as the next stage has consumed them
    os.unlink(novoBam)
    realignBam = util.file.mkstempfname('.realign.bam')
    gatk.local_realign(rmdupBam, deambigFasta, realignBam, JVMmemory=JVMmemory)
    os.unlink(rmdupBam)
    if outBam:
        shutil.copyfile(realignBam, outBam)
    
    # Modify original assembly with VCF calls from GATK
    tmpVcf = util.file.mkstempfname('.vcf.gz')
    tmpFasta = util.file.mkstempfname('.fasta')
    gatk.ug(realignBam, deambigFasta, tmpVcf, JVMmemory=JVMmemory)
    os.unlink(realignBam)
    os.unlink(deambigFasta)
    name_opts = []
    if chr_names:
        name_opts = ['--name'] + chr_names
    main_vcf_to_fasta(parser_vcf_to_fasta().parse_args([
        tmpVcf, tmpFasta, '--trim_ends', '--min_coverage', str(min_coverage),
        ] + name_opts))
    if outVcf:
        shutil.copyfile(tmpVcf, outVcf)
        if outVcf.endswith('.gz'):
            # keep the index next to the compressed VCF copy
            shutil.copyfile(tmpVcf+'.tbi', outVcf+'.tbi')
    os.unlink(tmpVcf)
    shutil.copyfile(tmpFasta, outFasta)
    os.unlink(tmpFasta)
    
    # Index final output FASTA for Picard/GATK, Samtools, and Novoalign
    picard_index.execute(outFasta, overwrite=True)
    samtools.faidx(outFasta, overwrite=True)
    novoalign.index_fasta(outFasta)
    return 0
Пример #8
0
def refine_assembly(inFasta,
                    inBam,
                    outFasta,
                    outVcf=None,
                    outBam=None,
                    novo_params='',
                    min_coverage=2,
                    chr_names=None,
                    keep_all_reads=False,
                    JVMmemory=None,
                    threads=1):
    ''' Polish a draft assembly using the reads it was built from.

        The reads in inBam are aligned to inFasta with Novoalign, passed
        through Picard MarkDuplicates (duplicates are removed unless
        keep_all_reads is set), and realigned with GATK against a
        deambiguated copy of the assembly. GATK calls are then written
        to a VCF and converted back into a FASTA via vcf_to_fasta using
        --trim_ends and the given min_coverage (plus --name when
        chr_names is non-empty). The result is copied to outFasta and
        indexed for Picard, Samtools, and Novoalign.

        outBam and outVcf, when given, receive copies of the realigned
        BAM and of the VCF (with its '.tbi' when outVcf ends in '.gz').
        novo_params is split on whitespace and handed to Novoalign;
        JVMmemory is forwarded to the tool wrappers and threads to the
        two GATK steps. Returns 0.
    '''
    contig_names = chr_names or []

    # Tool wrappers
    seq_dict_tool = tools.picard.CreateSequenceDictionaryTool()
    dedup_tool = tools.picard.MarkDuplicatesTool()
    faidx_tool = tools.samtools.SamtoolsTool()
    aligner = tools.novoalign.NovoalignTool()
    gatk_tool = tools.gatk.GATKTool()

    # GATK works from a deambiguated, indexed copy of the assembly
    gatk_ref = util.file.mkstempfname('.deambig.fasta')
    deambig_fasta(inFasta, gatk_ref)
    seq_dict_tool.execute(gatk_ref, overwrite=True)
    faidx_tool.faidx(gatk_ref, overwrite=True)

    # Align the reads back onto the assembly itself
    aligned_bam = util.file.mkstempfname('.novoalign.bam')
    if keep_all_reads:
        mapq_floor = 0
    else:
        mapq_floor = 1
    aligner.execute(inBam, inFasta, aligned_bam,
                    options=novo_params.split(),
                    min_qual=mapq_floor,
                    JVMmemory=JVMmemory)

    # Duplicate marking (and removal, unless every read is to be kept)
    dedup_bam = util.file.mkstempfname('.rmdup.bam')
    if keep_all_reads:
        picard_opts = ['CREATE_INDEX=true']
    else:
        picard_opts = ['CREATE_INDEX=true', 'REMOVE_DUPLICATES=true']
    dedup_tool.execute([aligned_bam], dedup_bam,
                       picardOptions=picard_opts, JVMmemory=JVMmemory)
    os.unlink(aligned_bam)

    # Local realignment
    realigned_bam = util.file.mkstempfname('.realign.bam')
    gatk_tool.local_realign(dedup_bam, gatk_ref, realigned_bam,
                            JVMmemory=JVMmemory, threads=threads)
    os.unlink(dedup_bam)
    if outBam:
        shutil.copyfile(realigned_bam, outBam)

    # Call variants, then rewrite the assembly from those calls
    calls_vcf = util.file.mkstempfname('.vcf.gz')
    refined_fasta = util.file.mkstempfname('.fasta')
    gatk_tool.ug(realigned_bam, gatk_ref, calls_vcf,
                 JVMmemory=JVMmemory, threads=threads)
    os.unlink(realigned_bam)
    os.unlink(gatk_ref)

    rename_args = (['--name'] + contig_names) if contig_names else []
    vcf_to_fasta_argv = [
        calls_vcf, refined_fasta,
        '--trim_ends',
        '--min_coverage', str(min_coverage),
    ] + rename_args
    main_vcf_to_fasta(parser_vcf_to_fasta().parse_args(vcf_to_fasta_argv))

    if outVcf:
        shutil.copyfile(calls_vcf, outVcf)
        if outVcf.endswith('.gz'):
            shutil.copyfile(calls_vcf + '.tbi', outVcf + '.tbi')
    os.unlink(calls_vcf)
    shutil.copyfile(refined_fasta, outFasta)
    os.unlink(refined_fasta)

    # Build the Picard/GATK, Samtools, and Novoalign indexes for the result
    seq_dict_tool.execute(outFasta, overwrite=True)
    faidx_tool.faidx(outFasta, overwrite=True)
    aligner.index_fasta(outFasta)
    return 0