示例#1
0
文件: mpileup.py 项目: ncod3/indelvcf
    def run(self):
        """Call variants per region with bcftools mpileup | call | filter.

        For every region in ``glv.conf.region_bed_list`` the expected output
        path is appended to ``glv.outlist.outfile['mpileup']``; this happens
        even when ``utl.progress_check`` reports that the phase is skipped.
        When the phase is not skipped, the three bcftools stages are chained
        into one shell pipeline, executed through ``utl.try_exec`` and the
        resulting ``.vcf.gz`` is indexed through ``utl.tabix``.
        """
        mod_name = 'mpileup'
        out_dir = utl.mk_outdir(mod_name)

        # The registered paths are what later phases read as their input.
        glv.outlist.outfile[mod_name] = list()

        start = time.time()

        for region in glv.conf.region_bed_list:

            vcf_gz = "{}/{}_{}.1.vcf.gz".format(out_dir, mod_name, region)

            glv.outlist.outfile[mod_name].append(vcf_gz)
            log.debug("{}".format(vcf_gz))

            # Registration above is unconditional; only the work is skipped.
            if utl.progress_check(mod_name) == False:
                log.info("progress={} so skip {}.".format(
                    glv.conf.progress, mod_name))
                continue

            log.info("go on {}".format(mod_name))

            # Stage 1: pileup over all BAMs, restricted to this region.
            pileup_stage = 'bcftools mpileup {} -O u -r {} -f {} {}'.format(
                glv.conf.mpl_mpileup_param,
                region,
                glv.conf.ref_fasta,
                " ".join(glv.conf.bam_list))

            # Stage 2: variant calling on the piped stream.
            call_stage = 'bcftools call {} -O u'.format(
                glv.conf.mpl_call_param)

            # Stage 3: filtering, written compressed to the output file.
            filter_stage = 'bcftools filter {} -O z --threads {} -o {}'.format(
                glv.conf.mpl_filter_param,
                glv.conf.thread,
                vcf_gz)

            pipeline = " | ".join([pileup_stage, call_stage, filter_stage])

            utl.save_to_tmpfile(vcf_gz)
            utl.try_exec(pipeline)
            utl.tabix(vcf_gz)

        log.info("mpileup finished {}".format(
            utl.elapsed_time(time.time(), start)))
示例#2
0
    def _copy_ini_file(self):
        """Back up the ini file into the output directory.

        Copies ``self.ini_file_path`` to ``<self.out_dir>/<ini basename>``
        with ``cp`` run through ``utl.try_exec``. ``utl.save_to_tmpfile`` is
        called on the destination path before the copy.
        """
        import shlex

        ini_base = os.path.basename(self.ini_file_path)
        out_dir_ini_file = "{}/{}".format(self.out_dir, ini_base)
        utl.save_to_tmpfile(out_dir_ini_file)

        # Quote both paths so whitespace or shell metacharacters in them are
        # not split or interpreted. Paths made only of safe characters are
        # left unchanged by shlex.quote, so the command is the same as before
        # for them. NOTE(review): assumes utl.try_exec runs the string
        # through a shell, as the piped commands elsewhere require.
        cmd = "cp {} {}".format(
            shlex.quote(str(self.ini_file_path)),
            shlex.quote(str(out_dir_ini_file)))
        utl.try_exec(cmd)
示例#3
0
    def run(self):
        """Concatenate the snpfilter and indelfilter VCFs into a single file.

        The merged file is named
        ``concat.<basename of glv.conf.out_dir>.SNP_INDEL.<heterozygosity>.vcf.gz``
        and is registered as the only entry of
        ``glv.outlist.outfile['concat']``. With heterozygosity 'hetero' a
        single ``bcftools concat`` writes the file; otherwise the concat
        output is piped through ``bcftools view`` with the no-hetero view
        parameters. The result is indexed through ``utl.tabix``.
        """
        mod_name = 'concat'
        out_dir = utl.mk_outdir(mod_name)
        heterozygosity = glv.conf.heterozygosity

        run_name = os.path.basename(glv.conf.out_dir)

        merged_vcf = "{}/{}.{}.SNP_INDEL.{}.vcf.gz".format(
            out_dir, mod_name, run_name, heterozygosity)

        glv.outlist.outfile[mod_name] = list()
        glv.outlist.outfile[mod_name].append(merged_vcf)
        log.debug("{}".format(merged_vcf))

        # Inputs: every snpfilter output followed by every indelfilter output.
        source_vcfs = (glv.outlist.outfile['snpfilter']
                       + glv.outlist.outfile['indelfilter'])
        all_vcf = " ".join(source_vcfs)
        log.debug("{}".format(all_vcf))

        utl.save_to_tmpfile(merged_vcf)

        start = time.time()

        if heterozygosity == 'hetero':
            # Single step: concat writes the compressed result directly.
            command = 'bcftools concat {} -O z {} --threads {} -o {}'.format(
                glv.conf.concat_hetero_param,
                all_vcf,
                glv.conf.thread,
                merged_vcf)

        else:
            # Two steps: concat streams VCF text into a bcftools view that
            # writes the compressed result.
            concat_stage = 'bcftools concat {} -O v {} --threads {}'.format(
                glv.conf.concat_nh_param,
                all_vcf,
                glv.conf.thread)

            view_stage = 'bcftools view {} -O z -o {}'.format(
                glv.conf.concat_nh_view_param,
                merged_vcf)

            command = "{} | {}".format(concat_stage, view_stage)

        utl.try_exec(command)
        utl.tabix(merged_vcf)

        log.info("concat finished {}".format(
            utl.elapsed_time(time.time(), start)))
示例#4
0
    def _make_bwaidx(self):
        """Build the bwa index for ``glv.conf.ref_fasta`` when it is missing.

        The existence of ``<ref_fasta>.bwt`` decides whether indexing is
        needed. Indexing runs ``bwa index`` from inside ``glv.conf.ref_dir``
        with the FASTA base name as index prefix. The process working
        directory is always set back to ``glv.conf.cwd`` on the way out,
        including when indexing raises.
        """
        bwaidx = "{}{}".format(glv.conf.ref_fasta, '.bwt')
        bwaidx_title = os.path.basename(glv.conf.ref_fasta)

        try:
            if os.path.isfile(bwaidx):
                log.debug("{} exist.".format(bwaidx))

            else:
                # The prefix is a bare file name, so bwa writes the index
                # files relative to the current directory.
                os.chdir(glv.conf.ref_dir)
                cmd1 = "bwa index -p {} {}".format(bwaidx_title,
                                                   glv.conf.ref_fasta)
                utl.try_exec(cmd1)

            log.info("pwd {}".format(os.getcwd()))

        finally:
            # Restore the working directory even if indexing failed, so a
            # caller that handles the error is not left inside ref_dir.
            os.chdir(glv.conf.cwd)
示例#5
0
    def run(self):
        """Clean, annotate and normalise the per-region svaba INDEL VCFs.

        Inputs are the paths in ``glv.outlist.outfile['svaba']`` paired, in
        order, with ``glv.conf.region_bed_list``. For each pair:

        1. the svaba VCF is rewritten into this module's output directory:
           header lines are copied, and on data lines the first nine columns
           are kept while later columns are kept only if they contain '/';
           the rewritten file is bgzipped and indexed;
        2. ``bcftools view | bcftools annotate`` writes ``*.annote.vcf.gz``;
        3. ``bcftools norm`` against the reference writes ``*.norm.vcf.gz``
           and the annotate intermediate (plus its .tbi) is removed;
        4. only when heterozygosity is 'homo', a second ``bcftools view``
           writes ``*.homo.vcf.gz`` and the norm intermediate (plus its
           .tbi) is removed.

        The final file of each region (homo or norm) is appended to
        ``glv.outlist.outfile['indelfilter']`` even when
        ``utl.progress_check`` reports that the phase is skipped.
        """
        mod_name = 'indelfilter'
        out_dir = utl.mk_outdir(mod_name)
        heterozygosity = glv.conf.heterozygosity

        # NOTE(review): assumes the non-'hetero' configuration value is
        # spelled 'homo' -- confirm against the config parser.
        homo_only = heterozygosity == 'homo'

        # continue to next phase
        glv.outlist.outfile[mod_name] = list()

        start = time.time()

        # for each region
        for (input_file, region) in zip(glv.outlist.outfile['svaba'],
                                        glv.conf.region_bed_list):

            # cleaned copy of the svaba output (plain and bgzipped names)
            input_base_gz = os.path.basename(input_file)
            input_base = re.sub(r"\.gz$", "", input_base_gz)
            out_file0 = "{}/{}".format(out_dir, input_base)
            out_file0_gz = "{}/{}".format(out_dir, input_base_gz)

            out_file1 = "{}/{}_{}.{}{}".format(out_dir, mod_name, region,
                                               'annote', '.vcf.gz')
            out_file2 = "{}/{}_{}.{}{}".format(out_dir, mod_name, region,
                                               'norm', '.vcf.gz')
            out_file3 = "{}/{}_{}.{}{}".format(out_dir, mod_name, region,
                                               'homo', '.vcf.gz')

            # Register the file that will remain after this region is done.
            if homo_only:
                glv.outlist.outfile[mod_name].append(out_file3)
            else:
                glv.outlist.outfile[mod_name].append(out_file2)

            if utl.progress_check(mod_name) == False:
                log.info("progress={} so skip {}.".format(
                    glv.conf.progress, mod_name))
                continue

            log.info("go on {}".format(mod_name))

            # Workaround for a svaba output problem: keep the nine leading
            # columns and, of the remaining columns, only those whose value
            # contains '/'.
            with open(out_file0, mode='w') as f_out, \
                    gzip.open(input_file, "rt") as f_in:
                for raw_line in f_in:
                    r_line = raw_line.strip()
                    if r_line.startswith('#'):
                        f_out.write("{}\n".format(r_line))
                        continue

                    cols = r_line.split('\t')
                    kept = cols[:9] + [c for c in cols[9:] if '/' in c]
                    f_out.write("{}\n".format('\t'.join(kept)))

            utl.save_to_tmpfile(out_file0_gz)

            # bgzip turns out_file0 into out_file0_gz
            cmd1 = "bgzip -@ {} {}".format(glv.conf.thread, out_file0)

            utl.try_exec(cmd1)
            utl.tabix(out_file0_gz)

            view1 = '{} {} {} -O v -r {} {}'
            v1_cmd = view1.format('bcftools', 'view',
                                  glv.conf.indf_view1_param, region,
                                  out_file0_gz)

            pipe_annotate = '{} {} {} -O z --threads {} -o {}'
            # use threads only -O z|b
            an_cmd = pipe_annotate.format('bcftools', 'annotate',
                                          glv.conf.indf_annotate_param,
                                          glv.conf.thread, out_file1)

            cmd1 = '{} | {}'.format(v1_cmd, an_cmd)
            utl.try_exec(cmd1)
            utl.tabix(out_file1)

            norm = '{} {} {} -O z --threads {} -f {} -o {} {}'
            # use threads only -O z|b
            cmd2 = norm.format('bcftools', 'norm', glv.conf.indf_norm_param,
                               glv.conf.thread, glv.conf.ref_fasta, out_file2,
                               out_file1)

            utl.try_exec(cmd2)
            utl.tabix(out_file2)

            # The annotate output is only an intermediate of norm.
            tabix1 = "{}.tbi".format(out_file1)
            os.remove(out_file1)
            os.remove(tabix1)
            log.info("remove {} {}".format(out_file1, tabix1))

            #-------------------------
            # The second view runs only for 'homo'; otherwise the norm
            # output is the final file for this region.
            if not homo_only:
                continue

            view2 = '{} {} {} -O z --threads {} -r {} -o {} {}'
            cmd3 = view2.format('bcftools', 'view', glv.conf.indf_view2_param,
                                glv.conf.thread, region, out_file3, out_file2)

            utl.try_exec(cmd3)
            utl.tabix(out_file3)

            # The norm output is only an intermediate in 'homo' mode.
            tabix2 = "{}.tbi".format(out_file2)
            os.remove(out_file2)
            os.remove(tabix2)

            log.info("remove {} {}".format(out_file2, tabix2))

        log.info("indelfilter finished {}".format(
            utl.elapsed_time(time.time(), start)))
示例#6
0
    def prepare_ref(self):
        """Stage the user's reference FASTA in ``glv.conf.ref_dir`` and index it.

        Steps:

        1. Resolve ``glv.conf.ref`` into ``glv.conf.ref_fasta_user``: a path
           starting with '/' is used as is, anything else is joined onto
           ``glv.conf.cwd``. The process exits with status 1 when the result
           is not an existing file.
        2. Symlink the user's file into ref_dir as
           ``<basename>.org_slink`` (or ``<basename>.org_slink.gz`` when the
           extension is ``.gz``), unless that link already exists.
        3. Provide ``glv.conf.ref_fasta`` in ref_dir: a symlink named like
           the user's file for non-``.gz`` input, or the ``bgzip -cd``
           decompressed content (name without ``.gz``) for ``.gz`` input.
           Existing files are reused.
        4. Create ``<ref_fasta>.fai`` with ``samtools faidx`` when missing
           and store its path in ``glv.conf.ref_fasta_fai``.
        5. Build the bwa index through ``self._make_bwaidx()``.

        Returns:
            self, so the call can be chained.
        """
        # user's fasta: convert relative path to absolute path based on cwd
        if glv.conf.ref.startswith('/'):
            # originally absolute path
            glv.conf.ref_fasta_user = glv.conf.ref
        else:
            # cwd + relative path
            glv.conf.ref_fasta_user = "{}/{}".format(glv.conf.cwd,
                                                     glv.conf.ref)

        log.info("glv.conf.ref_fasta_user {}".format(glv.conf.ref_fasta_user))

        # ref_fasta_user: existence confirmation
        if os.path.isfile(glv.conf.ref_fasta_user):
            log.info("{} found.".format(glv.conf.ref_fasta_user))
        else:
            log.info("{} not found. exit.".format(glv.conf.ref_fasta_user))
            sys.exit(1)

        # basename, path without extension, extension
        basename_user = os.path.basename(glv.conf.ref_fasta_user)
        without_ext, ext = os.path.splitext(glv.conf.ref_fasta_user)
        basename_without_ext = os.path.basename(without_ext)
        is_gz = ext == '.gz'

        # Names inside ref_dir: the link to the user's original file and the
        # uncompressed FASTA that the rest of the pipeline uses.
        if is_gz:
            glv.conf.ref_fasta_slink_system = "{}/{}{}".format(
                glv.conf.ref_dir, basename_user, '.org_slink.gz')

            glv.conf.ref_fasta = "{}/{}".format(glv.conf.ref_dir,
                                                basename_without_ext)
        else:
            glv.conf.ref_fasta_slink_system = "{}/{}{}".format(
                glv.conf.ref_dir, basename_user, '.org_slink')

            glv.conf.ref_fasta = "{}/{}".format(glv.conf.ref_dir,
                                                basename_user)

        # symlink to the user's original file
        if os.path.isfile(glv.conf.ref_fasta_slink_system):
            log.info("{} exist.".format(glv.conf.ref_fasta_slink_system))
        else:
            log.info("os.symlink {} {}.".format(
                glv.conf.ref_fasta_user, glv.conf.ref_fasta_slink_system))

            os.symlink(glv.conf.ref_fasta_user,
                       glv.conf.ref_fasta_slink_system)

        log.info("ext ({}).".format(ext))

        if is_gz:
            # compressed input: decompress it into ref_dir as ref_fasta
            cmd1 = 'bgzip -cd -@ {} {} > {}'.format(
                glv.conf.thread, glv.conf.ref_fasta_slink_system,
                glv.conf.ref_fasta)

            if os.path.isfile(glv.conf.ref_fasta):
                log.debug("{} exist.".format(glv.conf.ref_fasta))

            else:
                log.debug("{} not exist. do cmd={}".format(
                    glv.conf.ref_fasta, cmd1))

                utl.try_exec(cmd1)

        else:
            # uncompressed input: ref_fasta is just a symlink to it
            if os.path.isfile(glv.conf.ref_fasta):
                log.info("symlink exist {}".format(glv.conf.ref_fasta))
            else:
                os.symlink(glv.conf.ref_fasta_user, glv.conf.ref_fasta)
                log.info("symlink {} {}".format(glv.conf.ref_fasta_user,
                                                glv.conf.ref_fasta))

        # make fai file
        cmd2 = 'samtools faidx {}'.format(glv.conf.ref_fasta)

        glv.conf.ref_fasta_fai = "{}{}".format(glv.conf.ref_fasta, '.fai')

        if os.path.isfile(glv.conf.ref_fasta_fai):
            log.debug("{} exist.".format(glv.conf.ref_fasta_fai))

        else:
            log.debug("{} not exist. do {}".format(glv.conf.ref_fasta_fai,
                                                   cmd2))
            utl.try_exec(cmd2)

        # build the bwa index for ref_fasta if it is missing
        self._make_bwaidx()

        return self
示例#7
0
    def run(self):
        """Run ``svaba run`` for each region and bgzip/index the VCFs found.

        For every region in ``glv.conf.region_bed_list`` the expected INDEL
        output ``<out_dir>/indel_<region>.svaba.indel.vcf.gz`` is appended to
        ``glv.outlist.outfile['svaba']``; this happens even when
        ``utl.progress_check`` reports that the phase is skipped. When the
        phase is not skipped, svaba is executed with all BAMs of
        ``glv.conf.bam_list`` and every path returned by
        ``utl.check_for_files('<out_dir>/*.vcf')`` is bgzipped and indexed.

        The process works inside the module output directory while running
        and always changes back to ``glv.conf.cwd`` afterwards, including
        when a step raises.
        """
        mod_name = 'svaba'
        out_dir = utl.mk_outdir(mod_name)

        # continue to next phase
        glv.outlist.outfile[mod_name] = list()

        start = time.time()
        os.chdir(out_dir)

        try:
            for region in glv.conf.region_bed_list:

                title = "{}/indel_{}".format(out_dir, region)

                glv.outlist.outfile[mod_name].append(
                    "{}.{}{}".format(title, 'svaba.indel.vcf', '.gz'))

                # normalize option
                # NOTE(review): this method used to build "-n <bam> ..." from
                # glv.conf.svaba_normalize_bams_list and then unconditionally
                # reset it to '', so normal BAMs were never passed to svaba.
                # That behaviour is kept. Before enabling -n, confirm the
                # output file names svaba produces in that mode (presumably
                # they differ from 'svaba.indel.vcf' -- verify).
                norm = ''
                log.debug("{}".format(norm))

                if utl.progress_check(mod_name) == False:
                    log.info("progress={} so skip {}.".format(
                        glv.conf.progress,
                        mod_name))
                    continue

                log.info("go on {}".format(mod_name))

                svaba = '{} {} -t {} -G {} -k {} {} -p {} {} -a {}'
                cmd1 = svaba.format(
                    'svaba',
                    'run',
                    " -t ".join(glv.conf.bam_list),
                    glv.conf.ref_fasta,
                    region,
                    norm,
                    glv.conf.thread,
                    glv.conf.svb_svaba_param,
                    title)

                utl.try_exec(cmd1)

                # Compress and index every plain *.vcf reported for out_dir.
                target_vcfs = utl.check_for_files("{}/*.vcf".format(out_dir))
                log.debug("{}".format(target_vcfs))

                for t_vcf in target_vcfs:

                    cmd2 = "bgzip -@ {} {}".format(
                        glv.conf.thread,
                        t_vcf)

                    utl.try_exec(cmd2)
                    utl.tabix("{}{}".format(t_vcf, '.gz'))

        finally:
            # Leave out_dir even if a command failed part-way through.
            os.chdir(glv.conf.cwd)

        log.info("svaba finished {}".format(
            utl.elapsed_time(time.time(), start)))
示例#8
0
    def run(self):
        """Annotate the per-region mpileup VCFs and optionally filter them.

        Inputs are the paths in ``glv.outlist.outfile['mpileup']`` paired, in
        order, with ``glv.conf.region_bed_list``. For each pair:

        1. ``bcftools view | bcftools annotate`` writes ``*.annote.vcf.gz``,
           which is then indexed;
        2. only when heterozygosity is 'homo', a second ``bcftools view``
           writes ``*.homo.vcf.gz`` and the annotate intermediate (plus its
           .tbi) is removed.

        The final file of each region (homo or annote) is appended to
        ``glv.outlist.outfile['snpfilter']`` even when
        ``utl.progress_check`` reports that the phase is skipped.
        """
        mod_name = 'snpfilter'
        out_dir = utl.mk_outdir(mod_name)
        heterozygosity = glv.conf.heterozygosity

        # NOTE(review): assumes the non-'hetero' configuration value is
        # spelled 'homo' -- confirm against the config parser.
        homo_only = heterozygosity == 'homo'

        # continue to next phase
        glv.outlist.outfile[mod_name] = list()

        start = time.time()

        # for each region
        for (input_file, region) in zip(glv.outlist.outfile['mpileup'],
                                        glv.conf.region_bed_list):

            out_file1 = "{}/{}_{}.{}{}".format(out_dir, mod_name, region,
                                               'annote', '.vcf.gz')
            out_file2 = "{}/{}_{}.{}{}".format(out_dir, mod_name, region,
                                               'homo', '.vcf.gz')

            # Register the file that will remain after this region is done.
            if homo_only:
                glv.outlist.outfile[mod_name].append(out_file2)
            else:
                glv.outlist.outfile[mod_name].append(out_file1)

            if utl.progress_check(mod_name) == False:
                log.info("progress={} so skip {}.".format(
                    glv.conf.progress, mod_name))
                continue

            log.info("go on {}".format(mod_name))

            view1 = '{} {} {} -O v -r {} {}'
            v1_cmd = view1.format('bcftools', 'view',
                                  glv.conf.snpf_view1_param, region,
                                  input_file)

            pipe_annotate = '{} {} {} -O z --threads {} -o {}'
            an_cmd = pipe_annotate.format('bcftools', 'annotate',
                                          glv.conf.snpf_annotate_param,
                                          glv.conf.thread, out_file1)

            cmd1 = '{} | {}'.format(v1_cmd, an_cmd)

            utl.try_exec(cmd1)
            utl.tabix(out_file1)

            #-------------------------
            # The second view runs only for 'homo'; otherwise the annotate
            # output is the final file for this region.
            if not homo_only:
                continue

            view2 = '{} {} {} -O z --threads {} -r {} -o {} {}'
            cmd2 = view2.format('bcftools', 'view', glv.conf.snpf_view2_param,
                                glv.conf.thread, region, out_file2, out_file1)

            utl.save_to_tmpfile(out_file2)

            utl.try_exec(cmd2)
            utl.tabix(out_file2)

            # The annotate output is only an intermediate in 'homo' mode.
            tabix1 = "{}.tbi".format(out_file1)
            os.remove(out_file1)
            os.remove(tabix1)
            log.info("remove {} {}".format(out_file1, tabix1))

        log.info("snpfilter finished {}".format(
            utl.elapsed_time(time.time(), start)))