示例#1
0
    def save_vcf_sample_name_txt(self):
        '''Create the VCF sample-name table, or back up an existing one.

        When ``self.vcf_sample_name_file`` already exists it is only handed
        to ``utl.save_to_tmpfile`` (the user may have edited it).
        Otherwise a header row followed by the rows returned by
        ``self._pick_vcf_sample_list(self.vcf_file_path)`` is written to it.
        '''

        name_file = self.vcf_sample_name_file

        # Existing file: keep its content, only take a backup copy.
        if os.path.isfile(name_file):
            log.info("found.\n{}".format(name_file))
            utl.save_to_tmpfile(name_file, True, True)
            return

        # No file yet: build it from the sample names found in the VCF.
        log.info("not found {}.".format(name_file))

        rows = [
            "#{}\t{}\t{}\t{}\t{}".format('no', 'group', 'nickname',
                                         'basename', 'fullname')
        ]
        rows.extend(self._pick_vcf_sample_list(self.vcf_file_path))

        # backup (same call as the original, made before the file is written)
        utl.save_to_tmpfile(name_file)

        # one row per line, terminated by a final newline
        with open(name_file, mode='w') as out_f:
            out_f.write("\n".join(rows) + "\n")

        log.info("save.\n{}".format(name_file))
示例#2
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, reg):
        """Write the variants that separate two groups to a text file.

        Args:
            vcf_ittr: iterable yielding VCF records.
            distin_dict: description of the current comparison. Keys 0 and
                1 hold the two group names, 'pick_mode' the requested
                variant type, and 'variant' a dict with 'out_path',
                'hdr_text' and 'base_nam'.
            reg: not used inside this method.
        """

        pick_mode = distin_dict['pick_mode']

        # The integer keys 0 and 1 hold the group names (strings).
        gr_list = [distin_dict[idx] for idx in (0, 1)]
        log.info("gr_list {}.".format(gr_list))

        # The genotype difference is checked first between the two samples
        # listed at the beginning of each group.
        top_smpl_list = [
            glv.conf.g_members_dict[group][0] for group in gr_list
        ]
        log.info("top_smpl_list {}.".format(top_smpl_list))

        started_at = time.time()

        # output file
        out_txt_file = distin_dict['variant']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        # Open questions left by the original author: can this loop be
        # parallelized, and is a final flush needed after the writes?
        with open(out_txt_file, mode='a') as out_f:

            # header line
            out_f.write("{}\n".format(distin_dict['variant']['hdr_text']))

            for record in vcf_ittr:

                # 1. skip when the two top samples share the same GT
                # 2. skip when the GT differs inside a group
                if (self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0
                        or self._skip_different_GT_in_own_group(
                            record, top_smpl_list, gr_list) > 0):
                    continue

                # 3. select the differing allele combination (2x2 alleles)
                selector = AlleleSelect()
                selector.select_diff_allele(record, top_smpl_list, gr_list)

                # 4. keep only the lines matching the requested pick_mode
                for var_type, line in zip(selector.var_types,
                                          selector.lines):
                    if utl.is_my_pick_mode(var_type, pick_mode) == True:
                        out_f.write("{}\n".format(line))

        log.info("variant {} {}".format(
            utl.elapsed_time(time.time(), started_at),
            distin_dict['variant']['base_nam']))
示例#3
0
    def construct_primer(self):
        """Run the 'primer' step for every entry of glv.outlist.distin_files.

        For each entry the marker file is read with pandas,
        ``self._loop_primer3_check_blast`` is called once per row (through
        ``Parallel``/``delayed`` with the threading backend when
        ``glv.conf.parallel`` is True, otherwise in a plain loop), and the
        output file is finally sorted with ``utl.sort_file``.
        """

        # progress check
        if utl.progress_check('primer') == False:
            log.info("progress={} so skip primer.".format(glv.conf.progress))
            return
        log.info("Start processing {}".format('primer'))

        for distin_dict in glv.outlist.distin_files:

            # markers designed by the previous step
            df_distin = pd.read_csv(distin_dict['marker']['out_path'],
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            out_txt_file = distin_dict['primer']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            # NOTE(review): the same file object is handed to every worker;
            # how writes are serialized depends on
            # _loop_primer3_check_blast -- confirm.
            with open(out_txt_file, mode='a') as out_f:

                start = time.time()

                if glv.conf.parallel == True:
                    log.info(
                        "do Parallel cpu {}, parallele {} blast {}".format(
                            glv.conf.thread, glv.conf.parallel_blast_cnt,
                            glv.conf.blast_num_threads))

                    runner = Parallel(n_jobs=glv.conf.parallel_blast_cnt,
                                      backend="threading")
                    runner([
                        delayed(self._loop_primer3_check_blast)(
                            distin_dict, marker_row, out_f)
                        for marker_row in df_distin.itertuples()
                    ])

                else:
                    log.info("do Serial cpu {} / serial {} blast {}".format(
                        glv.conf.thread, 1, glv.conf.blast_num_threads))

                    for marker_row in df_distin.itertuples():
                        self._loop_primer3_check_blast(
                            distin_dict, marker_row, out_f)

            utl.sort_file('primer', distin_dict, out_txt_file, 'chrom', 'pos',
                          'try_cnt', 'number')

            elapsed = utl.elapsed_time(time.time(), start)
            log.info("primer {} {}".format(
                elapsed, distin_dict['primer']['base_nam']))
示例#4
0
    def logging_start(self, mod_name, out_dir, log_dir):
        """Point the file handler at ``<log_dir>/vprimer_log.txt`` and open the log.

        Args:
            mod_name: passed unchanged to ``LogConf.open_log``.
            out_dir: not used inside this method.
            log_dir: directory in which the log file is placed.

        Returns:
            Whatever ``LogConf.open_log(mod_name)`` returns.
        """

        log_file_name = "{}/{}".format(log_dir, 'vprimer_log.txt')

        file_handler_conf = self.config['handlers']['fileHandler']
        file_handler_conf['filename'] = log_file_name

        # Must run before logging starts (hence the False flag, as in the
        # original call).
        utl.save_to_tmpfile(log_file_name, False)

        return LogConf.open_log(mod_name)
示例#5
0
    def _copy_ini_file(self):
        """Copy the ini file into the output directory.

        The destination is ``<self.out_dir>/<basename of self.ini_file_path>``.
        It is first passed to ``utl.save_to_tmpfile`` (back up), then the
        copy is done by running ``cp`` through ``utl.try_exec``.
        """
        import shlex

        ini_base = os.path.basename(self.ini_file_path)
        out_dir_ini_file = "{}/{}".format(self.out_dir, ini_base)

        # back up
        utl.save_to_tmpfile(out_dir_ini_file)

        # Quote both paths so that spaces or shell metacharacters in a
        # directory or file name cannot split or alter the command.
        # shlex.quote leaves ordinary paths unchanged, so the command text
        # is the same as before for them.
        cmd = "cp {} {}".format(
            shlex.quote(str(self.ini_file_path)),
            shlex.quote(out_dir_ini_file))
        utl.try_exec(cmd)
示例#6
0
    def _set_primer3_header_dict(self):
        '''Build the dictionary of primer3 parameters.

        If ``self.p3_params_file_path`` exists it is passed to
        ``utl.save_to_tmpfile`` (it may have been edited by the user);
        otherwise it is created from ``self.p3key`` as ``KEY=VALUE`` lines.
        The file is then read back: blank lines and lines starting with
        '#' are ignored, and PRIMER_PRODUCT_SIZE_RANGE / PRIMER_NUM_RETURN
        entries are skipped because they are always set by this method.

        Returns:
            dict mapping parameter names to string values.

        Raises:
            ValueError: if a parameter line contains no '='.
        '''

        primer3_header_dict = dict()

        if os.path.isfile(self.p3_params_file_path):
            log.info("found {}.".format(self.p3_params_file_path))
            # This file may have been edited by the user, so copy it
            utl.save_to_tmpfile(self.p3_params_file_path, True, True)

        else:
            log.info("not found {}.".format(self.p3_params_file_path))
            with open(self.p3_params_file_path, mode='w') as f:
                f.write("{}={}\n".format('#PARAM', 'VALUE'))

                for key, value in self.p3key.items():
                    f.write("{}={}\n".format(key, value))

        # 1.1) open and read parameters
        with open(self.p3_params_file_path, mode='r') as f:
            for r_liner in f:
                r_line = r_liner.strip()  # cr, ws

                if r_line.startswith('#') or r_line == '':
                    continue

                r_line = utl.strip_hash_comment(r_line)

                # Split on the first '=' only, so that a value which itself
                # contains '=' no longer raises "too many values to unpack".
                # A line without any '=' still raises ValueError.
                vname, value = r_line.split('=', 1)

                # these two are fixed below, never taken from the file
                if vname in ('PRIMER_PRODUCT_SIZE_RANGE',
                             'PRIMER_NUM_RETURN'):
                    continue

                primer3_header_dict[vname] = value

        # constant value for primer3

        # PRIMER_FIRST_BASE_INDEX=1
        primer3_header_dict['PRIMER_FIRST_BASE_INDEX'] = str(1)
        # PRIMER_PRODUCT_SIZE_RANGE=???-???
        primer3_header_dict['PRIMER_PRODUCT_SIZE_RANGE'] = \
            "{}-{}".format(self.min_product_size, self.max_product_size)
        # PRIMER_NUM_RETURN=1
        primer3_header_dict['PRIMER_NUM_RETURN'] = str(1)

        return primer3_header_dict
示例#7
0
    def construct_primer(self):
        """Run the 'primer' step for every entry of glv.outlist.distin_files.

        ``utl.decide_action_stop('primer')`` decides whether to exit the
        program ("stop"), to return without doing anything ("gothrough"),
        or to process. When processing, each marker file is read with
        pandas, ``self._loop_primer3_check_blast`` is called once per row
        (through ``Parallel``/``delayed`` with the threading backend when
        ``glv.conf.parallel`` is True) and the output is sorted with
        ``utl.sort_file``.
        """

        proc_name = "primer"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        # stop, action, gothrough
        ret_status = utl.decide_action_stop(proc_name)

        if ret_status == "stop":
            log.info("".join([
                "STOP. ",
                "Current process '{}' ".format(proc_name),
                "has exceeded the User-specified stop point ",
                "'{}', ".format(glv.conf.stop),
                "so stop program. exit.",
            ]))
            sys.exit(1)

        if ret_status == "gothrough":
            log.info("".join([
                "SKIP '{}' proc, ".format(proc_name),
                "glv.conf.progress = {}, ".format(glv.conf.progress),
                "glv.conf.stop = {}, ".format(glv.conf.stop),
                "so skip program.",
            ]))
            return

        for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

            # logging current target
            utl.print_distin_info("primer", distin_dict, proc_cnt)

            # markers designed by the previous step
            df_distin = pd.read_csv(distin_dict['marker']['out_path'],
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            out_txt_file = distin_dict['primer']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            # NOTE(review): the same file object is handed to every worker;
            # how writes are serialized depends on
            # _loop_primer3_check_blast -- confirm.
            with open(out_txt_file, mode='a') as out_f:

                start = time.time()

                if glv.conf.parallel == True:
                    log.info("do Parallel cpu {}, parallel {} blast {}".format(
                        glv.conf.thread, glv.conf.parallel_blast_cnt,
                        glv.conf.blast_num_threads))

                    runner = Parallel(n_jobs=glv.conf.parallel_blast_cnt,
                                      backend="threading")
                    runner([
                        delayed(self._loop_primer3_check_blast)(
                            distin_dict, marker_row, out_f)
                        for marker_row in df_distin.itertuples()
                    ])

                else:
                    log.info("do Serial cpu {} / serial {} blast {}".format(
                        glv.conf.thread, 1, glv.conf.blast_num_threads))

                    for marker_row in df_distin.itertuples():
                        self._loop_primer3_check_blast(
                            distin_dict, marker_row, out_f)

            utl.sort_file('primer', distin_dict, out_txt_file, 'chrom', 'pos',
                          'try_cnt', 'number')

            elapsed = utl.elapsed_time(time.time(), start)
            log.info("primer {} > {}.txt\n".format(
                elapsed, distin_dict['primer']['base_nam']))
示例#8
0
    def design_marker(self):
        """Run the 'marker' step for every entry of glv.outlist.distin_files.

        ``utl.decide_action_stop('marker')`` decides whether to exit the
        program ("stop"), to return without doing anything ("gothrough"),
        or to process. When processing, each variant file is read with
        pandas, ``self._loop_evaluate_for_marker`` is called once per row
        and the output file is sorted with ``utl.sort_file``.
        """

        self.enzyme_name_list = glv.conf.enzyme_name_list

        proc_name = "marker"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        # stop, action, gothrough
        ret_status = utl.decide_action_stop(proc_name)

        if ret_status == "stop":
            msg = "STOP. "
            msg += "Current process \'{}\' ".format(proc_name)
            msg += "has exceeded the User-specified stop point "
            msg += "\'{}', ".format(glv.conf.stop)
            msg += "so stop program. exit."
            log.info(msg)
            sys.exit(1)

        elif ret_status == "gothrough":
            msg = "SKIP \'{}\' proc, ".format(proc_name)
            msg += "glv.conf.progress = {}, ".format(glv.conf.progress)
            msg += "glv.conf.stop = {}, ".format(glv.conf.stop)
            msg += "so skip program."
            log.info(msg)
            return

        # Design a fragment sequence for primer3
        for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

            # logging current target
            utl.print_distin_info("marker", distin_dict, proc_cnt)

            # read variant file
            variant_file = distin_dict['variant']['out_path']
            log.info("variant_file {}".format(variant_file))

            df_distin = pd.read_csv(
                variant_file, sep='\t', header=0, index_col=None)

            # file name to write out result to text
            out_txt_file = distin_dict['marker']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            start = time.time()
            with open(out_txt_file, mode='a') as f:

                # NOTE: evaluation is intentionally serial.
                # EvalVariant._check_effect_of_enzyme (eval_variant.py)
                # relies on Bio.Restriction
                # (http://biopython.org/DIST/docs/cookbook/Restriction.html,
                # biopython <= 1.76 for IUPACAmbiguousDNA()):
                #
                #     multi_site_seq = Seq(seq_target, IUPACAmbiguousDNA())
                #     rb = Restriction.RestrictionBatch(enzyme_name_list)
                #     Analong = Restriction.Analysis(rb, multi_site_seq)
                #     caps_ResTyp_dict = Analong.with_sites()
                #
                # This RestrictionBatch method sometimes returned slightly
                # inaccurate results when executed in parallel, so the
                # previously disabled (`if False:`) Parallel branch was
                # removed and glv.conf.parallel is not consulted here.
                log.info("do Serial cpu 1")

                # each variant
                for variant_df_row in df_distin.itertuples():

                    # Determine if the variant can be used as a marker.
                    # For those that can be marked, prepare the
                    # information for primer3.
                    self._loop_evaluate_for_marker(
                        distin_dict, variant_df_row, f)

            utl.sort_file(
                'marker', distin_dict, out_txt_file,
                'chrom', 'pos', 'marker_info', 'string')

            log.info("marker {} > {}.txt\n".format(
                utl.elapsed_time(time.time(), start),
                distin_dict['marker']['base_nam']))
示例#9
0
    def out_current_settings(self):
        ''' Write the chosen settings to a file (ini format) and log them.

        The file starts with the section name, the date stamp and the whole
        command line (as comments), followed by one ``name = value`` line
        for every entry of ``self.conf_dict`` that has a 'chosen' value.
        '''

        # A "#" separator line is inserted after each of these variables.
        block_end_names = (
            "ref", "stop", "product_size", "enzyme",
            "group_members", "blast_distance", "use_joblib_threading",
        )

        current_setting_ini = [
            # [vprimer]
            "{}".format(glv.ini_section),
            # date
            "\n# {}".format(glv.now_datetime_form),
            # whole command line
            "\n# {}".format(' '.join(sys.argv)),
            "\n#",
        ]

        for vname, entry in self.conf_dict.items():

            if 'chosen' not in entry:
                continue

            current_setting_ini.append(
                "{} = {}".format(vname, entry['chosen']))

            if vname in block_end_names:
                current_setting_ini.append("\n#")

        setting_path = self.curr_setting_file_path

        # An existing settings file is backed up before it is overwritten.
        if os.path.isfile(setting_path):
            log.info("found {}".format(setting_path))
            utl.save_to_tmpfile(setting_path)
        else:
            log.info("not found {}".format(setting_path))

        # write to the current-settings file, adjusting while exporting
        with open(setting_path, mode='w') as out_f:
            converted = self._convert_setting_ini(current_setting_ini)
            out_f.write("{}\n".format(converted))

        log.info("save {}".format(setting_path))

        # dump the in-memory configuration to the log
        log.info("self.conf_dict=\n{}".format(
            pprint.pformat(self.conf_dict)))
        log.info("self.regions_dict=\n{}".format(
            pprint.pformat(self.regions_dict)))
        log.info("self.group_members_dict=\n{}".format(
            pprint.pformat(self.group_members_dict)))
        log.info("self.distinguish_groups_list=\n{}".format(
            pprint.pformat(self.distinguish_groups_list)))
示例#10
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, proc_cnt):
        """Write the variants that separate two groups, with per-sample alleles.

        Args:
            vcf_ittr: iterable yielding VCF records.
            distin_dict: description of the current comparison. Keys 0 and
                1 hold the two group names; 'region', 'pick_mode',
                'indel_size' ("min-max") and 'variant' ('out_path',
                'hdr_text', 'base_nam') are also read.
            proc_cnt: counter passed on to ``utl.print_distin_info``.
        """

        # basic information
        gr_list = [distin_dict[0], distin_dict[1]]

        reg = distin_dict['region']
        # Not used below; the lookup is kept so that a region missing from
        # regions_dict still fails here.
        reg_dict = glv.conf.regions_dict[reg]
        pick_mode = distin_dict['pick_mode']
        min_indel_len, max_indel_len = \
            [int(bound) for bound in distin_dict['indel_size'].split('-')]

        # The genotype difference is checked first between the two samples
        # listed at the beginning of each group.
        members = glv.conf.group_members_dict
        top_smpl_list = [members[gr_list[0]][0], members[gr_list[1]][0]]

        # logging current target
        utl.print_distin_info("variant", distin_dict, proc_cnt)

        started_at = time.time()

        # file the variants are exported to
        out_txt_file = distin_dict['variant']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        # An allele_int column is added for every sample; members of the
        # specified groups come first:
        # gr0:s1 g0:s2 g0:s3 g1:s4 g1:s5 g1:s6 s7 s8 s9 s10
        nicknames, fullnames = utl.get_ordered_sample_list(gr_list)

        sample_added_header = "{}\t{}".format(
            distin_dict['variant']['hdr_text'], "\t".join(nicknames))

        # Open question left by the original author: can this be
        # parallelized?
        with open(out_txt_file, mode='a') as out_f:

            out_f.write("{}\n".format(sample_added_header))

            for record in vcf_ittr:

                # 1. skip when the two top samples share the same GT
                # 2. skip when the GT differs inside a group
                if (self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0
                        or self._skip_different_GT_in_own_group(
                            record, top_smpl_list, gr_list) > 0):
                    continue

                # 3. select the differing allele combination (2x2 alleles)
                selector = AlleleSelect(min_indel_len, max_indel_len)
                selector.select_diff_allele(record, top_smpl_list, gr_list)

                # Built lazily: only once a line of this record is kept.
                allele_int_line = ""

                # 4. keep only the lines matching the requested pick_mode
                for var_type, line in zip(selector.var_types,
                                          selector.lines):
                    if not utl.is_my_pick_mode(var_type, pick_mode) == True:
                        continue

                    if allele_int_line == "":
                        allele_int_line = self._get_allele_line(
                            record, fullnames)

                    out_f.write("{}\t{}\n".format(line, allele_int_line))

        log.info("variant {} > {}.txt\n".format(
            utl.elapsed_time(time.time(), started_at),
            distin_dict['variant']['base_nam']))
示例#11
0
    def print_allele(self):
        ''' When show_genotype is specified, the genotype of the specified
        regions and members are output to a file.
            main
            variant.py print_allele
            allele_select.py cls allele_int

        One tab-separated file is written per entry of
        ``glv.conf.region_name_list``:
        ``<out_dir_path>/005_genotype~<region_name>~<show_genotype>.txt``.
        Each row holds CHROM, POS, the REF length, the ALT lengths, the
        length difference between REF and the first ALT, REF, ALT and one
        converted genotype per member of group 'all'.
        '''

        proc_name = "genotype"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        member_nicknames = glv.conf.group_members_dict['all']

        # header
        header = ["CHROM", "POS", "Rlen", "Alen", "diff", "REF", "ALT"]
        header += member_nicknames

        # Fullnames of the specified members. They do not depend on the
        # region, so they are resolved once instead of once per region.
        sample_fullname_list = [
            utl.get_fullname(nickname) for nickname in member_nicknames
        ]

        total_cnt = len(glv.conf.region_name_list)

        # reader; closed in the finally clause even if a region fails
        reader = vcfpy.Reader.from_path(glv.conf.vcf_file_path)
        try:
            # Save to file for each region
            for proc_cnt, region_name in enumerate(
                    glv.conf.region_name_list, 1):

                region = glv.conf.regions_dict[region_name]['reg']

                # out file name
                outf_pref = "005_genotype"
                basename = "{}~{}~{}".format(outf_pref, region_name,
                                             glv.conf.show_genotype)
                out_file_path = "{}/{}.txt".format(
                    glv.conf.out_dir_path, basename)

                # backup
                utl.save_to_tmpfile(out_file_path)

                log.info("")
                log.info("{} / {}, {}({}) > {}".format(
                    proc_cnt, total_cnt, region_name, region,
                    out_file_path))

                start = time.time()
                with open(out_file_path, mode='w') as f:

                    f.write("{}\n".format('\t'.join(map(str, header))))

                    for record in reader.fetch(region):

                        alt_list = [alt.value for alt in record.ALT]

                        # variant length and diff
                        len_ref = len(record.REF)
                        lens_alt_list = [len(alt) for alt in alt_list]

                        # A record without any ALT allele has nothing to
                        # compare REF with: report 0 instead of failing
                        # with IndexError.
                        if lens_alt_list:
                            diff_len = abs(len_ref - lens_alt_list[0])
                        else:
                            diff_len = 0

                        line = [
                            record.CHROM,
                            record.POS,
                            len_ref,
                            ",".join(map(str, lens_alt_list)),
                            diff_len,
                            record.REF,
                            ",".join(alt_list),
                        ]

                        # one converted "a/b" genotype per member
                        for fn in sample_fullname_list:
                            gt_alleles = record.call_for_sample[fn].gt_alleles
                            line.append(
                                AlleleSelect.allele_convert(
                                    "{}/{}".format(gt_alleles[0],
                                                   gt_alleles[1]),
                                    glv.conf.show_genotype))

                        f.write("{}\n".format('\t'.join(map(str, line))))

                # basename (not out_file_path) is logged: the path already
                # ends in ".txt", which the format string appends again.
                log.info("genotype {} > {}.txt\n".format(
                    utl.elapsed_time(time.time(), start), basename))
        finally:
            reader.close()
示例#12
0
    def format_text(self):
        """Write the 'formfail' and 'formsafe' files for every comparison.

        The primer file of each entry of ``glv.outlist.distin_files`` is
        split by its 'complete' column (0 -> 'formfail', 1 -> 'formsafe').
        Every row is prepared with ``self._prepare_from_primer_file`` and
        ``self._format_product``, then ``self.line`` is written out.
        """

        # progress check: skip only when neither step is requested
        if (utl.progress_check('formsafe') == False
                and utl.progress_check('formfail') == False):

            log.info("progress={} so skip form.".format(glv.conf.progress))
            return
        log.info("Start processing {}".format('formsafe'))

        # value of the 'complete' column paired with the step it feeds
        targets = ((0, 'formfail'), (1, 'formsafe'))

        for distin_dict in glv.outlist.distin_files:

            # primers designed by the previous step
            df_distin = pd.read_csv(distin_dict['primer']['out_path'],
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            for complete, proc in targets:
                log.info("{} {}".format(complete, proc))

                df_distin_complete = \
                    df_distin[df_distin['complete'] == complete]

                # collect the chrom/pos pairs that occur more than once
                df_chrom_pos = df_distin_complete.loc[:, ['chrom', 'pos']]
                repeated = df_chrom_pos[df_chrom_pos.duplicated()]

                duplicate_pos_dict = dict()
                for _, chrom, pos in repeated.itertuples():
                    duplicate_pos_dict.setdefault(
                        chrom, dict()).setdefault(pos, pos)

                # file name to write out result to text
                out_txt_file = distin_dict[proc]['out_path']
                log.info("out_txt_file={}.".format(out_txt_file))

                utl.save_to_tmpfile(out_txt_file)

                with open(out_txt_file, mode='a') as out_f:

                    # The 'formsafe' header text is used for both files.
                    out_f.write(
                        "{}\n".format(distin_dict['formsafe']['hdr_text']))

                    for primer_df_row in df_distin_complete.itertuples():

                        self._prepare_from_primer_file(
                            primer_df_row, distin_dict)
                        self._format_product(duplicate_pos_dict)

                        # write out
                        out_f.write("{}\n".format(self.line))
示例#13
0
文件: formtxt.py 项目: ncod3/vprimer
    def format_text(self):
        '''Write the 'formfail' and 'formsafe' files for every comparison.

        For each entry of ``glv.outlist.distin_files`` the variant file is
        read to remember, per chrom/pos, the allele columns; the primer
        file is then split by its 'complete' column (0 -> 'formfail',
        1 -> 'formsafe'). 'formsafe' rows are extended with the remembered
        allele columns. ``utl.decide_action_stop`` may skip either step.
        '''

        # value of the 'complete' column paired with the step it feeds
        targets = ((0, 'formfail'), (1, 'formsafe'))

        for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

            primer_file = distin_dict['primer']['out_path']
            variant_file = distin_dict['variant']['out_path']

            # Read the variant file and store the allele int information
            # in a dictionary keyed by chrom, then pos.
            df_variant = pd.read_csv(variant_file,
                                     sep='\t',
                                     header=0,
                                     index_col=None)

            # Number of columns named in the variant header, plus one:
            # the slice below starts at this position of the row tuple.
            alint_start = \
                len(distin_dict['variant']['hdr_text'].split("\t")) + 1

            variant_alint_dict = dict()
            for variant_df_row in df_variant.itertuples():
                chrom_name, pos = variant_df_row[1], variant_df_row[2]

                alint_list = [variant_df_row[6]]
                alint_list += list(variant_df_row[alint_start:])

                variant_alint_dict.setdefault(
                    chrom_name, dict())[pos] = alint_list

            df_distin = pd.read_csv(primer_file,
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            for complete, proc in targets:

                # stop, action, gothrough
                proc_name = proc
                ret_status = utl.decide_action_stop(proc_name)

                if ret_status == "stop":
                    # Only this step is skipped; the program goes on.
                    log.info("".join([
                        "STOP. ",
                        "Current process '{}' ".format(proc_name),
                        "has exceeded the User-specified stop point ",
                        "'{}', ".format(glv.conf.stop),
                        "so stop program. exit.",
                    ]))
                    continue

                if ret_status == "gothrough":
                    log.info("".join([
                        "SKIP '{}' proc, ".format(proc_name),
                        "glv.conf.progress = {}, ".format(glv.conf.progress),
                        "glv.conf.stop = {}, ".format(glv.conf.stop),
                        "so skip program.",
                    ]))
                    continue

                log.info("-------------------------------")
                log.info("Start processing {} complete={}\n".format(
                    proc_name, complete))

                # logging current target
                sub_proc = "{}_{}".format(proc, complete)
                utl.print_distin_info(sub_proc, distin_dict, proc_cnt, True)

                df_distin_complete = \
                    df_distin[df_distin['complete'] == complete]

                # collect the chrom/pos pairs that occur more than once
                df_chrom_pos = df_distin_complete.loc[:, ['chrom', 'pos']]
                repeated = df_chrom_pos[df_chrom_pos.duplicated()]

                duplicate_pos_dict = dict()
                for _, chrom, dup_pos in repeated.itertuples():
                    duplicate_pos_dict.setdefault(
                        chrom, dict()).setdefault(dup_pos, dup_pos)

                # file name to write out result to text
                out_txt_file = distin_dict[proc]['out_path']
                log.info("out_txt_file={}.".format(out_txt_file))

                utl.save_to_tmpfile(out_txt_file)

                is_safe = (proc == "formsafe")

                with open(out_txt_file, mode='a') as out_f:

                    # The 'formsafe' header text is the base for both
                    # files; only 'formsafe' gets the allele columns.
                    header = distin_dict['formsafe']['hdr_text']
                    if is_safe:
                        nicknames, _ = utl.get_ordered_sample_list(
                            [distin_dict[0], distin_dict[1]])
                        alint_header = ["vseq_ano_str"]
                        alint_header += nicknames
                        header = "{}\t{}".format(
                            header, "\t".join(alint_header))

                    out_f.write("{}\n".format(header))

                    for primer_df_row in df_distin_complete.itertuples():

                        chrom_name = primer_df_row[2]
                        pos = primer_df_row[3]

                        self._prepare_from_primer_file(
                            primer_df_row, distin_dict)
                        self._format_product(duplicate_pos_dict)

                        out_line = self.line
                        if is_safe:
                            # append the allele columns of this variant
                            alleles = variant_alint_dict[chrom_name][pos]
                            out_line = "{}\t{}".format(
                                out_line, "\t".join(map(str, alleles)))

                        # write out
                        out_f.write("{}\n".format(out_line))
示例#14
0
    def design_marker(self):
        """Evaluate every variant as a marker candidate, per group pair.

        For each entry of ``glv.outlist.distin_files``:

        1. read the tab-separated variant table found at
           ``distin_dict['variant']['out_path']``;
        2. hand every row to ``self._loop_evaluate_for_marker`` together
           with the opened ``distin_dict['marker']['out_path']`` file,
           either through joblib threads (``glv.conf.parallel``) or in a
           plain serial loop;
        3. sort the resulting marker file with ``utl.sort_file``.

        The whole step returns early when ``utl.progress_check('marker')``
        is falsy.
        """

        # progress check
        if not utl.progress_check('marker'):
            log.info("progress={} so skip marker.".format(glv.conf.progress))
            return
        log.info("Start processing {}".format('marker'))

        # Create the fragments for primer3,
        # for each distinguish_groups
        for distin_dict in glv.outlist.distin_files:

            # read variant file
            variant_file = distin_dict['variant']['out_path']
            log.info("variant_file {}".format(variant_file))

            df_distin = pd.read_csv(variant_file,
                                    sep='\t',
                                    header=0,
                                    index_col=None)

            # Bio.Restriction.Restriction_Dictionary
            # NOTE(review): called once per distin_dict; looks
            # loop-invariant, but kept here so nothing is read when
            # distin_files is empty -- confirm before hoisting.
            self.enzyme.read_enzyme_file()

            # file name to write out result to text
            out_txt_file = distin_dict['marker']['out_path']
            utl.save_to_tmpfile(out_txt_file)

            start = time.time()
            with open(out_txt_file, mode='a') as f:

                # No header line is written here.
                # NOTE(review): presumably utl.sort_file() below takes
                # care of the header -- confirm.

                if glv.conf.parallel:
                    log.info("do Parallel cpu {} parallel {}".format(
                        glv.conf.thread, glv.conf.parallele_full_thread))

                    # Every worker receives the same file object f.
                    # NOTE(review): assumes the writes done by
                    # _loop_evaluate_for_marker may interleave in any
                    # order (the file is sorted afterwards) -- confirm.
                    Parallel(
                        n_jobs=glv.conf.parallele_full_thread,
                        backend="threading")(
                        [
                            delayed(self._loop_evaluate_for_marker)(
                                distin_dict, variant_df_row, f)
                            for variant_df_row in df_distin.itertuples()
                        ]
                    )

                else:
                    log.info("do Serial cpu 1")

                    # each variant
                    for variant_df_row in df_distin.itertuples():
                        # Decide whether the variant can be used as a
                        # marker; for those that can, prepare the
                        # information needed by primer3.
                        self._loop_evaluate_for_marker(distin_dict,
                                                       variant_df_row, f)

            utl.sort_file('marker', distin_dict, out_txt_file, 'chrom', 'pos',
                          'marker_info', 'string')

            log.info("marker {} {}".format(
                utl.elapsed_time(time.time(), start),
                distin_dict['marker']['base_nam']))