Exemplo n.º 1
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, reg):
        """Write the variants that separate two groups to a text file.

        Every record of ``vcf_ittr`` is screened in order:

        1. dropped when ``_skip_same_GT_between_top2sample`` returns a
           positive value,
        2. dropped when ``_skip_different_GT_in_own_group`` returns a
           positive value,
        3. handed to ``AlleleSelect.select_diff_allele``.

        Each line produced in step 3 whose variant type is accepted by
        ``utl.is_my_pick_mode`` is appended to
        ``distin_dict['variant']['out_path']``, below the header line
        ``distin_dict['variant']['hdr_text']``.

        Args:
            vcf_ittr: iterable of VCF records.
            distin_dict: settings of one group comparison.  Keys ``0`` and
                ``1`` hold the two group names; ``'pick_mode'`` and
                ``'variant'`` are read as well.
            reg: not used by this method; kept so existing callers work.
        """

        pick_mode = distin_dict['pick_mode']
        # Keys 0 and 1 of the dict hold the group names (strings).
        gr_list = [distin_dict[0], distin_dict[1]]
        log.info("gr_list {}.".format(gr_list))

        # At first, we check the difference of genotype between the two
        # samples that are listed first in each group
        top_smpl_list = [
            glv.conf.g_members_dict[gr_list[0]][0],
            glv.conf.g_members_dict[gr_list[1]][0]
        ]
        log.info("top_smpl_list {}.".format(top_smpl_list))

        # ================================================================
        start = time.time()
        # write out to file
        out_txt_file = distin_dict['variant']['out_path']
        # NOTE(review): presumably moves a previous output out of the way
        # before we append below -- confirm in utl.save_to_tmpfile.
        utl.save_to_tmpfile(out_txt_file)

        # Open question from the original author: can this loop be
        # parallelized, and is a flush needed after the last f.write?
        with open(out_txt_file, mode='a') as f:

            # write header
            f.write("{}\n".format(distin_dict['variant']['hdr_text']))

            # access to vcf using iterator
            for record in vcf_ittr:

                # 1. Skip same GT between top two sample
                if self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0:
                    continue

                # 2. Check GT in your own group
                if self._skip_different_GT_in_own_group(
                        record, top_smpl_list, gr_list) > 0:
                    continue

                # 3. Select different allele combination among 2x2 allele
                asel = AlleleSelect()
                asel.select_diff_allele(record, top_smpl_list, gr_list)

                # 4. Save variant information as text file; pick_mode is
                #    loop-invariant, so the value bound above is reused.
                for var_type, line in zip(asel.var_types, asel.lines):
                    if utl.is_my_pick_mode(var_type, pick_mode) == True:
                        f.write("{}\n".format(line))

        log.info("variant {} {}".format(utl.elapsed_time(time.time(), start),
                                        distin_dict['variant']['base_nam']))
Exemplo n.º 2
0
    def construct_primer(self):
        """Run the primer step for every entry of glv.outlist.distin_files.

        Returns immediately when ``utl.progress_check('primer')`` is False.
        Otherwise, for each entry, the marker table is read, every marker
        row is passed to ``_loop_primer3_check_blast`` together with the
        open primer output file (through joblib threads when
        ``glv.conf.parallel`` is True, one by one otherwise), and the
        finished file is sorted with ``utl.sort_file``.
        """
        stage = 'primer'

        # progress check
        if utl.progress_check(stage) == False:
            log.info("progress={} so skip primer.".format(glv.conf.progress))
            return
        log.info("Start processing {}".format(stage))

        # one pass per distinguish group
        for distin_dict in glv.outlist.distin_files:

            # marker table written by the previous step
            markers = pd.read_csv(distin_dict['marker']['out_path'],
                                  sep='\t',
                                  header=0,
                                  index_col=None)

            primer_path = distin_dict['primer']['out_path']
            utl.save_to_tmpfile(primer_path)

            with open(primer_path, mode='a') as out_f:
                # no header is written here

                began = time.time()

                if glv.conf.parallel == True:
                    log.info(
                        "do Parallel cpu {}, parallele {} blast {}".format(
                            glv.conf.thread, glv.conf.parallel_blast_cnt,
                            glv.conf.blast_num_threads))

                    pool = Parallel(n_jobs=glv.conf.parallel_blast_cnt,
                                    backend="threading")
                    pool([
                        delayed(self._loop_primer3_check_blast)(
                            distin_dict, marker_row, out_f)
                        for marker_row in markers.itertuples()
                    ])

                else:
                    log.info("do Serial cpu {} / serial {} blast {}".format(
                        glv.conf.thread, 1, glv.conf.blast_num_threads))

                    for marker_row in markers.itertuples():
                        self._loop_primer3_check_blast(
                            distin_dict, marker_row, out_f)

            # sort only after the output file has been closed
            utl.sort_file(stage, distin_dict, primer_path, 'chrom', 'pos',
                          'try_cnt', 'number')

            elapsed = utl.elapsed_time(time.time(), began)
            log.info("primer {} {}".format(
                elapsed, distin_dict['primer']['base_nam']))
Exemplo n.º 3
0
def main():
    """Log the start time, run one VPrimer pass and log the elapsed time."""

    start_msg = 'program started at {}'.format(glv.now_datetime_str)
    log.info(start_msg)

    # build the application object and run it
    VPrimer().run()

    elapsed = utl.elapsed_time(time.time(), glv.now_epochtime)
    log.info("program finished {}\n".format(elapsed))
Exemplo n.º 4
0
    def run(self):
        """Prepare, then either dump genotypes and exit or run all steps.

        When ``glv.conf.show_genotype`` is anything other than "no", the
        genotypes are printed, the elapsed time is logged and the process
        exits with status 1.  Otherwise the variant, marker, primer and
        format steps run in that order.
        """

        self.prepare()

        # genotype-only mode ends the program here
        genotype_only = glv.conf.show_genotype != "no"
        if genotype_only:
            self.variant.print_allele()
            elapsed = utl.elapsed_time(time.time(), glv.now_epochtime)
            log.info("program finished {}\n".format(elapsed))
            sys.exit(1)

        # (attribute of self, method to call), in execution order; each
        # attribute is looked up only when its turn comes
        steps = (
            ('variant', 'pick_variant'),
            ('marker', 'design_marker'),
            ('primer', 'construct_primer'),
            ('formtxt', 'format_text'),
        )
        for component, action in steps:
            getattr(getattr(self, component), action)()
Exemplo n.º 5
0
    def _read_fasta_first(self):
        """Read every reference sequence with samtools and pickle the result.

        The chromosome names come from column 0 of the index file
        ``glv.conf.ref_fasta_fai``, whose rows look like::

            chr01   43270923    7           60  61
            chr02   35937250    43992120    60  61

        and are stored in ``glv.conf.ref_fasta_chrom_list``.  For each name
        ``samtools faidx <fasta> <chrom>`` is run; the FASTA header line is
        dropped and the remaining lines are joined into
        ``glv.ref.refseq[chrom]``.  Finally ``glv.ref.refseq`` is dumped to
        ``glv.conf.ref_fasta_pickle``.

        A chromosome for which samtools exits non-zero or returns no
        sequence is reported with ``log.error`` and left out of
        ``glv.ref.refseq``.

        Raises:
            OSError: if the samtools executable cannot be started.
        """

        # get chrom list from fai text
        df_fai = pd.read_csv(
            glv.conf.ref_fasta_fai, sep='\t',
            header=None, index_col=None)

        # ref_fasta_chrom_list from fai column 0 (chrom name)
        glv.conf.ref_fasta_chrom_list = df_fai.loc[:, 0].to_list()

        log.info("fai {}, chrom cnt={}".format(
            glv.conf.ref_fasta_fai, len(glv.conf.ref_fasta_chrom_list)))

        # for each chrom name
        log.info("read refseq by samtools faidx from fasta {}".format(
            glv.conf.ref_fasta))
        start = time.time()

        for chrom in glv.conf.ref_fasta_chrom_list:

            # Build argv directly: splitting a formatted command string on
            # spaces breaks as soon as the fasta path contains a space.
            # str() because pandas may have parsed numeric names as int.
            cmd_list = [
                "samtools", "faidx", str(glv.conf.ref_fasta), str(chrom)]

            seq_lines = []

            # The context manager closes both pipes and waits for the
            # child, so no process or file descriptor is left behind.
            with sbp.Popen(
                    cmd_list, stdout=sbp.PIPE, stderr=sbp.PIPE) as proc:

                # stdout yields bytes (b''), one FASTA line at a time
                for byte_line in proc.stdout:
                    seq_line = byte_line.decode().strip()
                    # everything except the '>' header is sequence
                    if not seq_line.startswith('>'):
                        seq_lines.append(seq_line)

                # stderr is read after stdout is exhausted; samtools is
                # expected to write only short diagnostics there
                err_text = proc.stderr.read().decode(
                    errors='replace').strip()

            if proc.returncode != 0:
                log.error("samtools faidx failed for {} (exit {}): {}".format(
                    chrom, proc.returncode, err_text))
                continue

            if not seq_lines:
                log.error("samtools faidx returned no sequence for {}".format(
                    chrom))
                continue

            # Store under the chromosome that was actually requested, so a
            # failed call can never shift a sequence to another name.
            glv.ref.refseq[chrom] = ''.join(seq_lines)

        log.info("read refseq done {}".format(
            utl.elapsed_time(time.time(), start)))

        # pickle
        with open(glv.conf.ref_fasta_pickle, 'wb') as f:
            pickle.dump(glv.ref.refseq, f)

        log.info('dumped glv.ref.refseq->{}'.format(
            glv.conf.ref_fasta_pickle))
Exemplo n.º 6
0
    def _read_fasta_first(self):
        '''Read every reference sequence with samtools and pickle the result.

        The three values returned by ``_get_fai_info`` are stored in
        ``glv.conf.ref_fasta_chrom_dict_list``,
        ``glv.conf.ref_fasta_chrom_list`` and
        ``glv.conf.ref_fasta_chrom_region_list``.  For each chromosome name
        ``samtools faidx <fasta> <chrom>`` is run; the FASTA header line is
        dropped and the remaining lines are joined into
        ``glv.ref.refseq[chrom]``.  Finally ``glv.ref.refseq`` is dumped to
        ``glv.conf.ref_fasta_pickle``.

        A chromosome for which samtools exits non-zero or returns no
        sequence is reported with ``log.error`` and left out of
        ``glv.ref.refseq``.

        Raises:
            OSError: if the samtools executable cannot be started.
        '''

        # read fai and set dictionary
        glv.conf.ref_fasta_chrom_dict_list, \
        glv.conf.ref_fasta_chrom_list, \
        glv.conf.ref_fasta_chrom_region_list = \
            self._get_fai_info()

        # for each chrom name
        log.info("read refseq by samtools faidx from fasta {}".format(
            glv.conf.ref_fasta_path))
        start = time.time()

        for chrom in glv.conf.ref_fasta_chrom_list:

            # Build argv directly: splitting a formatted command string on
            # spaces breaks as soon as the fasta path contains a space.
            # str() because pandas may have parsed numeric names as int.
            cmd_list = [
                "samtools", "faidx", str(glv.conf.ref_fasta_path),
                str(chrom)]

            seq_lines = []

            # The context manager closes both pipes and waits for the
            # child, so no process or file descriptor is left behind.
            with sbp.Popen(cmd_list, stdout=sbp.PIPE,
                           stderr=sbp.PIPE) as proc:

                # stdout yields bytes (b''), one FASTA line at a time
                for byte_line in proc.stdout:
                    seq_line = byte_line.decode().strip()
                    # everything except the '>' header is sequence
                    if not seq_line.startswith('>'):
                        seq_lines.append(seq_line)

                # stderr is read after stdout is exhausted; samtools is
                # expected to write only short diagnostics there
                err_text = proc.stderr.read().decode(
                    errors='replace').strip()

            if proc.returncode != 0:
                log.error("samtools faidx failed for {} (exit {}): {}".format(
                    chrom, proc.returncode, err_text))
                continue

            if not seq_lines:
                log.error("samtools faidx returned no sequence for {}".format(
                    chrom))
                continue

            # Store under the chromosome that was actually requested, so a
            # failed call can never shift a sequence to another name.
            glv.ref.refseq[chrom] = ''.join(seq_lines)

        log.info("read refseq done {}\n".format(
            utl.elapsed_time(time.time(), start)))

        # pickle
        with open(glv.conf.ref_fasta_pickle, 'wb') as f:
            pickle.dump(glv.ref.refseq, f)

        log.info('dumped glv.ref.refseq->{}'.format(glv.conf.ref_fasta_pickle))
Exemplo n.º 7
0
    def _get_fai_info(self):
        '''Read the fasta index and describe every reference chromosome.

        ``glv.conf.ref_fasta_fai`` is read as a headerless tab-separated
        table whose rows look like::

            chr01   43270923    7           60  61
            chr02   35937250    43992120    60  61

        Column 0 is used as the chromosome name and column 1 as its
        length.  The resulting table is logged and, unless
        ``glv.conf.ref_fasta_chrom_txt`` already exists, written to that
        file.

        When ``glv.conf.show_fasta`` is True the method logs the elapsed
        time and exits the process with status 1 instead of returning.

        Returns:
            A 3-tuple of lists, one entry per fai row, in fai order:
            dicts with keys 'chrom', 'start' (always 1), 'end' and
            'length'; the chromosome names; and region strings of the
            form "name:1-length".
        '''

        # get chrom list from fai text
        df_fai = pd.read_csv(glv.conf.ref_fasta_fai,
                             sep='\t',
                             header=None,
                             index_col=None)

        ref_fasta_chrom_dict_list = list()
        ref_fasta_chrom_list = list()
        ref_fasta_chrom_region_list = list()

        for row in df_fai.itertuples():
            # itertuples() puts the index at position 0, so fai column 0
            # is row[1] and fai column 1 is row[2]
            chrom_name, chrom_len = row[1], row[2]

            ref_fasta_chrom_dict_list.append({
                'chrom': chrom_name,
                'start': 1,
                'end': chrom_len,
                'length': chrom_len
            })
            ref_fasta_chrom_list.append(chrom_name)
            ref_fasta_chrom_region_list.append(
                "{}:{}-{}".format(chrom_name, 1, chrom_len))

        log.info(
            "ref_fasta_chrom_dict_list={}.".format(ref_fasta_chrom_dict_list))

        log.info("ref_fasta_chrom_list={}.".format(ref_fasta_chrom_list))

        log.info("fai {}, chrom cnt={}".format(glv.conf.ref_fasta_fai,
                                               len(ref_fasta_chrom_list)))

        # one table row per chromosome dict
        pd_json = pd.json_normalize(ref_fasta_chrom_dict_list)
        log.info("\n{}\n".format(pd_json))

        # exist or not, ref_fasta_chrom_txt
        if os.path.isfile(glv.conf.ref_fasta_chrom_txt):
            log.info("found. {}".format(glv.conf.ref_fasta_chrom_txt))
        else:
            # to_string() renders every row; formatting the DataFrame
            # itself goes through its repr, which pandas abbreviates with
            # "..." once the frame exceeds the display row limit.
            with open(glv.conf.ref_fasta_chrom_txt, mode='w') as f:
                f.write("{}\n".format(pd_json.to_string()))
            log.info("save. {}".format(glv.conf.ref_fasta_chrom_txt))

        if glv.conf.show_fasta == True:
            log.info("only show_fasta mode, exit.")
            log.info("program finished {}\n".format(
                utl.elapsed_time(time.time(), glv.now_epochtime)))
            sys.exit(1)

        return ref_fasta_chrom_dict_list, \
            ref_fasta_chrom_list, \
            ref_fasta_chrom_region_list
Exemplo n.º 8
0
    def construct_primer(self):
        """Run the primer step for every entry of glv.outlist.distin_files.

        ``utl.decide_action_stop("primer")`` decides what happens first:
        "stop" logs a message and exits the process with status 1,
        "gothrough" logs a message and returns, anything else continues.

        For each entry the marker table is read, every marker row is
        passed to ``_loop_primer3_check_blast`` together with the open
        primer output file (through joblib threads when
        ``glv.conf.parallel`` is True, one by one otherwise), and the
        finished file is sorted with ``utl.sort_file``.
        """

        proc_name = "primer"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        # stop, action, gothrough
        ret_status = utl.decide_action_stop(proc_name)

        if ret_status == "stop":
            log.info("".join([
                "STOP. ",
                "Current process '{}' ".format(proc_name),
                "has exceeded the User-specified stop point ",
                "'{}', ".format(glv.conf.stop),
                "so stop program. exit.",
            ]))
            sys.exit(1)

        if ret_status == "gothrough":
            log.info("".join([
                "SKIP '{}' proc, ".format(proc_name),
                "glv.conf.progress = {}, ".format(glv.conf.progress),
                "glv.conf.stop = {}, ".format(glv.conf.stop),
                "so skip program.",
            ]))
            return

        # one pass per distinguish group, counted from 1
        for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

            # logging current target
            utl.print_distin_info("primer", distin_dict, proc_cnt)

            # marker table written by the previous step
            markers = pd.read_csv(distin_dict['marker']['out_path'],
                                  sep='\t',
                                  header=0,
                                  index_col=None)

            primer_path = distin_dict['primer']['out_path']
            utl.save_to_tmpfile(primer_path)

            with open(primer_path, mode='a') as out_f:
                # no header is written here

                began = time.time()

                if glv.conf.parallel == True:
                    log.info("do Parallel cpu {}, parallel {} blast {}".format(
                        glv.conf.thread, glv.conf.parallel_blast_cnt,
                        glv.conf.blast_num_threads))

                    pool = Parallel(n_jobs=glv.conf.parallel_blast_cnt,
                                    backend="threading")
                    pool([
                        delayed(self._loop_primer3_check_blast)(
                            distin_dict, marker_row, out_f)
                        for marker_row in markers.itertuples()
                    ])

                else:
                    log.info("do Serial cpu {} / serial {} blast {}".format(
                        glv.conf.thread, 1, glv.conf.blast_num_threads))

                    for marker_row in markers.itertuples():
                        self._loop_primer3_check_blast(
                            distin_dict, marker_row, out_f)

            # sort only after the output file has been closed
            utl.sort_file('primer', distin_dict, primer_path, 'chrom', 'pos',
                          'try_cnt', 'number')

            elapsed = utl.elapsed_time(time.time(), began)
            log.info("primer {} > {}.txt\n".format(
                elapsed, distin_dict['primer']['base_nam']))
Exemplo n.º 9
0
    def choice_variables(self):
        ''' Decide the value of every configuration variable.

        Values are fetched one key at a time with ``self._value_choice``
        (presumably choosing between command-line parameter, ini file and
        default -- confirm in ``_value_choice``) and stored as attributes
        of this object, together with the paths derived from them.

        The method also has side effects beyond setting attributes: it
        creates the "refs" directory, calls ``prepare_vcf()`` and
        ``save_vcf_sample_name_txt()``, and overwrites
        ``self.conf_dict['group_members']['default']``.

        The process exits with status 1 when
          * out_dir, vcf or ref resolves to an empty path,
          * show_genotype is not in ``glv.show_genotype_list``,
          * use_joblib_threading is neither 'yes' nor 'no',
          * show_samples is True while show_fasta is not True
            (show-only mode, not an error).

        Raises:
            ValueError: if indel_size or product_size is not of the form
                "<int>-<int>".
        '''

        # print param and ini variables
        self._print_param_ini()

        # for debug
        self.analyse_caps = self._value_choice('analyse_caps')

        # out_dir ---------------------------------------------
        self.user_out_dir = self._value_choice('out_dir')
        self.out_dir_path = utl.full_path(self.user_out_dir)

        # vcf -------------------------------------------------
        self.user_vcf_file = self._value_choice('vcf')
        self.user_vcf_file_path = utl.full_path(self.user_vcf_file)

        # ref -------------------------------------------------
        self.user_ref_fasta = self._value_choice('ref')
        self.user_ref_fasta_path = utl.full_path(self.user_ref_fasta)

        # thread ----------------------------------------------
        self.thread = self._value_choice('thread')


        # the three paths above are mandatory; stop if any is empty
        if self.out_dir_path == "" or \
            self.user_vcf_file_path == "" or \
            self.user_ref_fasta_path == "":
            err_mes = "out_dir={} and vcf={} and ref={} ".format(
                self.out_dir_path, self.user_vcf_file_path,
                self.user_ref_fasta_path)
            err_mes += "are all required. exit."
            log.error(err_mes)
            sys.exit(1)

        log.info("thread={}".format(self.thread))

        # out_dir ---------------------------------------------
        # NOTE(review): out_dir_path was already computed above from the
        # same user_out_dir; this repeats that assignment.
        self.out_dir_path = utl.full_path(self.user_out_dir)
        # "logs" and "bak" sub-directories of the output directory
        self.log_dir_path = "{}/{}".format(self.out_dir_path, "logs")
        self.out_bak_dir_path = "{}/{}".format(self.out_dir_path, "bak")

        #pprint.pprint(self.conf_dict)
        # INI show_genotype; an empty value falls back to "gt"
        self.show_genotype = self._value_choice('show_genotype')
        if self.show_genotype == "":
            self.show_genotype = "gt"

        # only the values listed in glv.show_genotype_list are accepted
        if not self.show_genotype in glv.show_genotype_list:
            err_mes = "show_genotype is selected from one of "
            err_mes += ", ".join(glv.show_genotype_list)
            log.error("{}. exit.".format(err_mes))
            log.error("show_genotype={}".format(self.show_genotype))
            sys.exit(1)

        # ini_file --------------------------------------------
        # INI: the user's ini_version next to the built-in default
        self.ini_version_user = self._value_choice('ini_version')
        self.ini_version_system = self.conf_dict['ini_version']['default']

        # the ini file path is read from the 'param' entry of conf_dict
        self.user_ini_file = self.conf_dict['ini_file']['param']
        self.ini_file_path = utl.full_path(self.user_ini_file)

        # ref_dir ---------------------------------------------
        self.ref_dir_path = utl.full_path("refs")
        # make ref_dir
        utl.makedirs(self.ref_dir_path)

        # out_curr_setting ------------------------------------
        self.curr_setting_file = "current_setting_ini.txt"
        self.curr_setting_file_path = "{}/{}".format(self.out_dir_path,
                                                     self.curr_setting_file)

        # thread ----------------------------------------------
        self.use_joblib_threading = self._value_choice('use_joblib_threading')

        # only the lowercase strings 'yes' and 'no' pass this check
        if not self.use_joblib_threading in ['yes', 'no']:
            err_mes = "use_joblib_threading Choose from Yes or No."
            log.error("{} exit.".format(err_mes))
            log.error("use_joblib_threading={}".format(
                self.use_joblib_threading))
            sys.exit(1)

        # thread adjust: _thread_adjusting() returns these four values
        self.parallel, \
        self.parallel_full_thread, \
        self.parallel_blast_cnt, \
        self.blast_num_threads \
            = self._thread_adjusting()

        # vcf -------------------------------------------------
        # path "<ref_dir>/slink_<vcf basename>"
        # NOTE(review): presumably a symlink to the user's vcf -- confirm
        # in prepare_vcf()
        basename_user_vcf = os.path.basename(self.user_vcf_file_path)
        self.vcf_file_slink_system = "{}/{}{}".format(self.ref_dir_path,
                                                      'slink_',
                                                      basename_user_vcf)

        # gtonly.gz: path "<ref_dir>/<vcf basename>_GTonly.vcf.gz"
        self.vcf_file_path = "{}/{}{}".format(self.ref_dir_path,
                                              basename_user_vcf,
                                              "_GTonly.vcf.gz")

        # read
        self.prepare_vcf()

        # sample_nickname -------------------------------------
        basename_vcf_file = os.path.basename(self.vcf_file_path)
        self.vcf_sample_name_file = "{}/sample_name_{}.txt".format(
            self.ref_dir_path, basename_vcf_file)
        self.save_vcf_sample_name_txt()

        # make_vcf_sample_variable() returns these seven values in order
        self.vcf_sample_nickname_list, \
        self.vcf_sample_basename_list, \
        self.vcf_sample_fullname_list, \
        self.vcf_sample_nickname_dict, \
        self.vcf_sample_basename_dict, \
        self.vcf_sample_fullname_dict, \
        self.group_members_vcf_str \
            = self.make_vcf_sample_variable()

        # illegal (original author's remark): the default of
        # 'group_members' in conf_dict is replaced by the string built
        # from the vcf samples
        self.conf_dict['group_members']['default'] = \
            self.group_members_vcf_str

        # show_fasta-------------------------------------------
        self.show_fasta = self._value_choice('show_fasta')

        # show_samples-----------------------------------------
        self.show_samples = self._value_choice('show_samples')

        # Because it stops at show_fasta: when show_fasta is True the exit
        # is left to the show_fasta handling, so only exit here otherwise
        if self.show_fasta != True and self.show_samples == True:
            log.info("only show_samples mode, exit.")
            log.info("program finished {}\n".format(
                utl.elapsed_time(time.time(), glv.now_epochtime)))
            sys.exit(1)

        # pick_mode
        self.pick_mode = self._value_choice('pick_mode')

        # indel len: "<min>-<max>" is split into two ints
        self.indel_size = self._value_choice('indel_size')
        self.min_indel_len, self.max_indel_len = \
            [int(i) for i in self.indel_size.split('-')]

        # product size: "<min>-<max>" is split into two ints
        self.product_size = self._value_choice('product_size')
        self.min_product_size, self.max_product_size = \
            [int(i) for i in self.product_size.split('-')]

        # ref -------------------------------------------------
        # It will be set in main later
        # glv.ref = glv.ref.prepare_ref()
        # until then these are empty placeholders
        self.ref_fasta_slink_system = ""
        self.ref_fasta_path = ""
        self.ref_fasta_chrom_list = []
        self.ref_fasta_fai = ""
        self.ref_fasta_chrom_txt = ""
        self.ref_fasta_pickle = ""

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # enzyme
        self.enzyme_files_user_str = self._value_choice('enzyme_file')
        #self.enzyme_files_user_list = list()
        #self.enzyme_files_list = list()
        self.enzyme_str = self._value_choice('enzyme')
        #self.enzyme_name_list = list()

        # start stop
        self.progress = self._value_choice('progress')
        self.stop = self._value_choice('stop')

        # primer3
        self.fragment_pad_len = self._value_choice('fragment_pad_len')
        self.p3_params_file = self._value_choice('p3_params')

        # blast
        self.blast_distance = self._value_choice('blast_distance')

        # not set now
        self.blast_word_size = 0
        self.blastdb_title = ""
        self.blastdb = ""

        # region group member string ---------------------------------
        # select string by priority, next make dict or list variables

        self.regions_str = self.set_regions_str()
        self.group_members_str = self.set_group_members_str()
        self.distinguish_groups_str = self.set_distinguish_groups_str()
Exemplo n.º 10
0
    def design_marker(self):
        """Run the marker step for every entry of glv.outlist.distin_files.

        ``glv.conf.enzyme_name_list`` is copied to
        ``self.enzyme_name_list`` first.  Then
        ``utl.decide_action_stop("marker")`` decides what happens: "stop"
        logs a message and exits the process with status 1, "gothrough"
        logs a message and returns, anything else continues.

        For each entry the variant table is read, every variant row is
        passed to ``_loop_evaluate_for_marker`` together with the open
        marker output file, and the finished file is sorted with
        ``utl.sort_file``.  Rows are always processed one by one (see the
        note inside the loop).
        """

        self.enzyme_name_list = glv.conf.enzyme_name_list

        proc_name = "marker"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        # stop, action, gothrough
        ret_status = utl.decide_action_stop(proc_name)

        if ret_status == "stop":
            msg = "STOP. "
            msg += "Current process \'{}\' ".format(proc_name)
            msg += "has exceeded the User-specified stop point "
            msg += "\'{}', ".format(glv.conf.stop)
            msg += "so stop program. exit."
            log.info(msg)
            sys.exit(1)

        elif ret_status == "gothrough":
            msg = "SKIP \'{}\' proc, ".format(proc_name)
            msg += "glv.conf.progress = {}, ".format(glv.conf.progress)
            msg += "glv.conf.stop = {}, ".format(glv.conf.stop)
            msg += "so skip program."
            log.info(msg)
            return

        # Design a fragment sequence for primer3
        for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

            # logging current target
            utl.print_distin_info("marker", distin_dict, proc_cnt)

            # read variant file
            variant_file = distin_dict['variant']['out_path']
            log.info("variant_file {}".format(variant_file))

            df_distin = pd.read_csv(
                variant_file, sep='\t', header=0, index_col=None)

            # file name to write out result to text
            out_txt_file = distin_dict['marker']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            start = time.time()
            with open(out_txt_file, mode='a') as f:

                # Deliberately serial, regardless of glv.conf.parallel.
                # Original author's note: the enzyme check in
                # eval_variant.py (EvalVariant._check_effect_of_enzyme)
                # uses Biopython's Restriction.RestrictionBatch /
                # Restriction.Analysis, and RestrictionBatch sometimes
                # returned slightly inaccurate results when executed in
                # parallel.  See
                # http://biopython.org/DIST/docs/cookbook/Restriction.html
                # (biopython <= 1.76 for IUPACAmbiguousDNA()).
                # The former joblib branch (threading backend,
                # n_jobs=glv.conf.parallel_full_thread) sat behind
                # "if False:" and could never run, so it was removed.
                log.info("do Serial cpu 1")

                # each variant
                for variant_df_row in df_distin.itertuples():

                    # Determine if the variant can be used as a marker.
                    # For those that can be marked, prepare the
                    # information for primer3.
                    self._loop_evaluate_for_marker(
                        distin_dict, variant_df_row, f)

            utl.sort_file(
                'marker', distin_dict, out_txt_file,
                'chrom', 'pos', 'marker_info', 'string')

            log.info("marker {} > {}.txt\n".format(
                utl.elapsed_time(time.time(), start),
                distin_dict['marker']['base_nam']))
Exemplo n.º 11
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, proc_cnt):
        """Write the variants that separate two groups to a text file.

        Every record of ``vcf_ittr`` is screened in order:

        1. dropped when ``_skip_same_GT_between_top2sample`` returns a
           positive value,
        2. dropped when ``_skip_different_GT_in_own_group`` returns a
           positive value,
        3. handed to ``AlleleSelect.select_diff_allele``.

        Each line produced in step 3 whose variant type is accepted by
        ``utl.is_my_pick_mode`` is appended to
        ``distin_dict['variant']['out_path']`` followed by a tab and the
        result of ``_get_allele_line`` for that record.  The first line of
        the file is ``distin_dict['variant']['hdr_text']`` extended by the
        tab-separated sample nicknames.

        Args:
            vcf_ittr: iterable of VCF records.
            distin_dict: settings of one group comparison.  Keys ``0`` and
                ``1`` hold the two group names; ``'pick_mode'``,
                ``'indel_size'`` ("<min>-<max>") and ``'variant'`` are
                read as well.
            proc_cnt: counter passed on to ``utl.print_distin_info``.

        Raises:
            ValueError: if ``distin_dict['indel_size']`` is not of the
                form "<int>-<int>".
        """

        # basic informations
        gr_list = [distin_dict[0], distin_dict[1]]

        pick_mode = distin_dict['pick_mode']
        min_indel_len, max_indel_len = \
            [int(i) for i in distin_dict['indel_size'].split('-')]

        # At first, we check the difference of genotype between the two
        # samples that are listed first in each group
        top_smpl_list = [
            glv.conf.group_members_dict[gr_list[0]][0],
            glv.conf.group_members_dict[gr_list[1]][0]
        ]

        # logging current target
        utl.print_distin_info("variant", distin_dict, proc_cnt)

        start = time.time()

        # File name to export variant
        out_txt_file = distin_dict['variant']['out_path']

        # NOTE(review): presumably moves a previous output out of the way
        # before we append below -- confirm in utl.save_to_tmpfile.
        utl.save_to_tmpfile(out_txt_file)

        #------------------------------------------------------
        # To add an allele_int column for all sample
        # Members of the specified group come first
        # gr0:s1 g0:s2 g0:s3 g1:s4 g1:s5 g1:s6 s7 s8 s9 s10

        sample_nickname_ordered_list, \
        sample_fullname_ordered_list = \
            utl.get_ordered_sample_list(gr_list)

        sample_added_header = "{}\t{}".format(
            distin_dict['variant']['hdr_text'],
            "\t".join(sample_nickname_ordered_list))

        # Can I parallelize here?
        with open(out_txt_file, mode='a') as f:

            # write sample added header
            f.write("{}\n".format(sample_added_header))

            # access to vcf using iterator
            for record in vcf_ittr:

                # 1. Skip same GT between top two sample
                if self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0:
                    continue

                # 2. Check GT in your own group
                if self._skip_different_GT_in_own_group(
                        record, top_smpl_list, gr_list) > 0:
                    continue

                # 3. Select different allele combination among 2x2 allele
                asel = AlleleSelect(min_indel_len, max_indel_len)
                asel.select_diff_allele(record, top_smpl_list, gr_list)

                # The allele columns depend only on the record, so they
                # are built lazily, on the first line actually written.
                allele_int_line = ""

                # 4. Save variant information as text file; pick_mode is
                #    loop-invariant, so the value bound above is reused.
                for var_type, line in zip(asel.var_types, asel.lines):
                    if utl.is_my_pick_mode(var_type, pick_mode) == True:

                        # make allele_int line
                        if allele_int_line == "":
                            allele_int_line = \
                                self._get_allele_line(
                                    record, sample_fullname_ordered_list)

                        # add allele line
                        f.write("{}\t{}\n".format(line, allele_int_line))

        log.info("variant {} > {}.txt\n".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['variant']['base_nam']))
Exemplo n.º 12
0
    def print_allele(self):
        ''' Write a genotype table for every configured region.

        When show_genotype is specified, the genotype of the specified
        regions and members are output to a file.
            main
            variant.py print_allele
            allele_select.py cls allele_int

        For each name in glv.conf.region_name_list one tab-separated file
        "<out_dir_path>/005_genotype~<region_name>~<show_genotype>.txt"
        is written.  Each row holds CHROM, POS, the REF length, the
        comma-joined ALT lengths, the absolute length difference between
        REF and the first ALT, REF, the comma-joined ALT alleles and then
        one converted genotype per member of
        glv.conf.group_members_dict['all'].

        Returns:
            None.  Results are written to disk and progress is logged.
        '''

        proc_name = "genotype"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        members = glv.conf.group_members_dict['all']

        # header
        header = ["CHROM", "POS", "Rlen", "Alen", "diff", "REF", "ALT"]
        header += members

        # Create a list of fullname for the specified members.
        # It does not depend on the region, so build it only once.
        sample_fullname_list = [
            utl.get_fullname(nickname) for nickname in members]

        # if group priority
        #sample_fullname_list = \
        #    utl.get_sample_list_from_groupname(
        #        group_list, "fullname")

        total_cnt = len(glv.conf.region_name_list)

        # reader
        reader = vcfpy.Reader.from_path(glv.conf.vcf_file_path)
        try:
            # Save to file for each region
            for proc_cnt, region_name in enumerate(
                    glv.conf.region_name_list, 1):

                region = glv.conf.regions_dict[region_name]['reg']

                # out file name
                outf_pref = "005_genotype"
                basename = "{}~{}~{}".format(outf_pref, region_name,
                                             glv.conf.show_genotype)
                out_file_path = "{}/{}.txt".format(
                    glv.conf.out_dir_path, basename)

                # backup
                utl.save_to_tmpfile(out_file_path)

                log.info("")
                log.info("{} / {}, {}({}) > {}".format(proc_cnt, total_cnt,
                                                       region_name, region,
                                                       out_file_path))

                start = time.time()
                with open(out_file_path, mode='w') as f:

                    f.write("{}\n".format('\t'.join(map(str, header))))

                    for record in reader.fetch(region):

                        alt_list = [alt.value for alt in record.ALT]

                        # variant length and diff
                        len_ref = len(record.REF)
                        lens_alt_list = [len(alt) for alt in alt_list]

                        # A record without any ALT allele has nothing to
                        # compare REF against; report 0 instead of failing
                        # with IndexError on the empty list.
                        if lens_alt_list:
                            diff_len = abs(len_ref - lens_alt_list[0])
                        else:
                            diff_len = 0

                        # Main informations
                        line = [
                            record.CHROM,
                            record.POS,
                            len_ref,
                            ",".join(map(str, lens_alt_list)),
                            diff_len,
                            record.REF,
                            ",".join(alt_list),
                        ]

                        # One genotype column per member, in header order.
                        # NOTE(review): assumes every call carries two
                        # alleles (diploid) -- confirm against the input.
                        for fn in sample_fullname_list:
                            gt_alleles = record.call_for_sample[fn].gt_alleles
                            line.append(
                                AlleleSelect.allele_convert(
                                    "{}/{}".format(gt_alleles[0],
                                                   gt_alleles[1]),
                                    glv.conf.show_genotype))

                        f.write("{}\n".format('\t'.join(map(str, line))))

                # out_file_path already ends with ".txt"
                log.info("genotype {} > {}\n".format(
                    utl.elapsed_time(time.time(), start), out_file_path))
        finally:
            # Release the VCF (and its index) even if a region fails.
            reader.close()
Exemplo n.º 13
0
    def design_marker(self):
        """Evaluate each variant as a marker candidate and write the results.

        For every entry of glv.outlist.distin_files the variant table
        (distin_dict['variant']['out_path'], tab separated with a header
        row) is loaded, each row is handed to
        self._loop_evaluate_for_marker together with the open output file
        (distin_dict['marker']['out_path']), and the output is finally
        passed to utl.sort_file.

        The rows are processed through joblib threads when
        glv.conf.parallel is set, otherwise one by one.

        Returns:
            None.  Returns early when utl.progress_check('marker')
            reports that this step must be skipped.
        """

        # progress check
        # (kept as an equality test: the return type of
        #  utl.progress_check is not visible from here)
        if utl.progress_check('marker') == False:
            log.info("progress={} so skip marker.".format(glv.conf.progress))
            return
        log.info("Start processing {}".format('marker'))

        # Build the fragments for primer3
        # for each distinguish_groups
        for distin_dict in glv.outlist.distin_files:

            # read variant file
            variant_file = distin_dict['variant']['out_path']
            log.info("variant_file {}".format(variant_file))

            df_distin = pd.read_csv(variant_file,
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            # Bio.Restriction.Restriction_Dictionary
            self.enzyme.read_enzyme_file()

            # file name to write out result to text
            out_txt_file = distin_dict['marker']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            start = time.time()
            with open(out_txt_file, mode='a') as f:

                # NOTE(review): the header line is not written here (the
                # original write of distin_dict['marker']['hdr_text'] is
                # disabled); presumably utl.sort_file takes care of it --
                # TODO confirm.

                if glv.conf.parallel == True:
                    log.info("do Parallel cpu {} parallel {}".format(
                        glv.conf.thread, glv.conf.parallele_full_thread))

                    # Every worker thread receives the same file handle f.
                    Parallel(
                        n_jobs=glv.conf.parallele_full_thread,
                        backend="threading")(
                        delayed(self._loop_evaluate_for_marker)(
                            distin_dict, variant_df_row, f)
                        for variant_df_row in df_distin.itertuples()
                    )

                else:
                    log.info("do Serial cpu 1")

                    # each variant
                    for variant_df_row in df_distin.itertuples():
                        # Decide whether the variant can be used as a
                        # marker.  For those that can, prepare the
                        # information for primer3.
                        self._loop_evaluate_for_marker(distin_dict,
                                                       variant_df_row, f)

            # Sort only after the file has been closed (and flushed).
            utl.sort_file('marker', distin_dict, out_txt_file, 'chrom', 'pos',
                          'marker_info', 'string')

            log.info("marker {} {}".format(
                utl.elapsed_time(time.time(), start),
                distin_dict['marker']['base_nam']))