def _iterate_vcf(self, vcf_ittr, distin_dict, reg):
    """Iterate VCF records for one distinguish-group pair and write
    the distinguishing variants to the pair's variant text file.

    Args:
        vcf_ittr: iterator over VCF records (project reader fetch).
        distin_dict: per-pair settings; keys 0 and 1 hold the two
            group names, 'pick_mode' the filter mode, 'variant' the
            output-file info.
        reg: region identifier (unused in this revision, kept for the
            caller's signature).
    """
    # Keys 0 and 1 of the dict hold the two group-name strings.
    gr_list = [distin_dict[0], distin_dict[1]]
    log.info("gr_list {}.".format(gr_list))

    # At first, we check difference of genotype between two sample
    # that described at the beginning of each group
    top_smpl_list = [
        glv.conf.g_members_dict[gr_list[0]][0],
        glv.conf.g_members_dict[gr_list[1]][0]]
    log.info("top_smpl_list {}.".format(top_smpl_list))

    # ================================================================
    start = time.time()

    # write out to file (back up any existing output first)
    out_txt_file = distin_dict['variant']['out_path']
    utl.save_to_tmpfile(out_txt_file)

    with open(out_txt_file, mode='a') as f:
        # write header
        f.write("{}\n".format(distin_dict['variant']['hdr_text']))

        # access to vcf using iterater
        for record in vcf_ittr:

            # 1. Skip same GT between top two sample
            if self._skip_same_GT_between_top2sample(
                record, top_smpl_list) > 0:
                continue

            # 2. Check GT in your own group
            if self._skip_different_GT_in_own_group(
                record, top_smpl_list, gr_list) > 0:
                continue

            # 3. Select different allele combination among 2x2 allele
            asel = AlleleSelect()
            asel.select_diff_allele(record, top_smpl_list, gr_list)

            # 4. Save variant information as text file, keeping only
            # lines whose variant type matches the requested pick_mode
            for var_type, line in zip(asel.var_types, asel.lines):
                if utl.is_my_pick_mode(
                    var_type, distin_dict['pick_mode']) == True:
                    f.write("{}\n".format(line))

    log.info("variant {} {}".format(
        utl.elapsed_time(time.time(), start),
        distin_dict['variant']['base_nam']))
def construct_primer(self):
    """For every distinguish-group marker file, run primer3 design and
    BLAST checks per marker row and append results to the primer file.
    """
    # progress check
    if utl.progress_check('primer') == False:
        log.info("progress={} so skip primer.".format(glv.conf.progress))
        return
    log.info("Start processing {}".format('primer'))

    # for each distinguish_groups
    for distin_dict in glv.outlist.distin_files:

        marker_file = distin_dict['marker']['out_path']
        df_distin = pd.read_csv(
            marker_file, sep='\t', header=0, index_col=None)

        # back up any existing output before appending
        out_txt_file = distin_dict['primer']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        with open(out_txt_file, mode='a') as f:
            # write header
            #f.write("{}\n".format(distin_dict['primer']['hdr_text']))
            start = time.time()

            if glv.conf.parallel == True:
                # typo fixed: "parallele" -> "parallel" (matches the
                # wording used by the newer revision of this method)
                log.info(
                    "do Parallel cpu {}, parallel {} blast {}".format(
                        glv.conf.thread,
                        glv.conf.parallel_blast_cnt,
                        glv.conf.blast_num_threads))

                # threading backend: workers share the open handle f
                Parallel(
                    n_jobs=glv.conf.parallel_blast_cnt,
                    backend="threading")(
                    [
                        delayed(self._loop_primer3_check_blast)
                            (distin_dict, marker_df_row, f)
                        for marker_df_row in df_distin.itertuples()
                    ])
            else:
                log.info("do Serial cpu {} / serial {} blast {}".format(
                    glv.conf.thread, 1, glv.conf.blast_num_threads))

                for marker_df_row in df_distin.itertuples():
                    self._loop_primer3_check_blast(
                        distin_dict, marker_df_row, f)

        utl.sort_file(
            'primer', distin_dict, out_txt_file,
            'chrom', 'pos', 'try_cnt', 'number')

        log.info("primer {} {}".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['primer']['base_nam']))
def main():
    """Program entry point: log start time, run the VPrimer pipeline,
    then log total elapsed time."""
    log.info('program started at {}'.format(glv.now_datetime_str))

    # build and execute the pipeline in one go
    VPrimer().run()

    elapsed = utl.elapsed_time(time.time(), glv.now_epochtime)
    log.info("program finished {}\n".format(elapsed))
def run(self): self.prepare() #self.variant.print_allele_int() #self.variant.print_all_allele_int() #glv.conf.get_vcf_pos_info() if glv.conf.show_genotype != "no": self.variant.print_allele() log.info("program finished {}\n".format( utl.elapsed_time(time.time(), glv.now_epochtime))) sys.exit(1) # variant self.variant.pick_variant() # marker self.marker.design_marker() # primer self.primer.construct_primer() # format self.formtxt.format_text()
def _read_fasta_first(self):
    """Read the reference fasta chromosome-by-chromosome with
    `samtools faidx`, cache each sequence string in glv.ref.refseq,
    and pickle the finished dictionary to glv.conf.ref_fasta_pickle.
    """
    # glv.conf.ref_fasta_fai format example (name, length, offset, ...):
    #chr01   43270923        7       60      61
    #chr02   35937250        43992120        60      61
    #...
    #chr12   27531856        351475649       60      61

    # get chrom list from fai text
    df_fai = pd.read_csv(
        glv.conf.ref_fasta_fai,
        sep='\t', header=None, index_col=None)

    # ref_fasta_chrom_list from fai column 0 (chrom name)
    glv.conf.ref_fasta_chrom_list = df_fai.loc[:, 0].to_list()

    log.info("fai {}, chrom cnt={}".format(
        glv.conf.ref_fasta_fai,
        len(glv.conf.ref_fasta_chrom_list)))

    # for each chrom name
    log.info("read refseq by samtools faidx from fasta {}".format(
        glv.conf.ref_fasta))
    start = time.time()

    chrom_seq_list = []
    last_chrom = ''

    for chrom in glv.conf.ref_fasta_chrom_list:

        # get sequence from samtools command
        cmd1 = "samtools faidx {} {}".format(glv.conf.ref_fasta, chrom)
        cmd_list = cmd1.split(' ')
        # log.info("{}".format(cmd_list))

        # using command output by pipe, get sequence into python
        proc = sbp.Popen(cmd_list, stdout=sbp.PIPE, stderr=sbp.PIPE)

        # got bytes (b'')
        for byte_line in proc.stdout:

            # bytes to str, strip \n
            b_line = byte_line.decode().strip()

            # a fasta header marks the start of the next chromosome:
            # flush the accumulated lines under the previous name
            if b_line.startswith('>'):
                # not the first time
                if len(chrom_seq_list) != 0:
                    # dictionary
                    glv.ref.refseq[last_chrom] = ''.join(chrom_seq_list)
                    chrom_seq_list = []
                continue
            else:
                # append to list
                chrom_seq_list.append(b_line)

        last_chrom = chrom

    # flush the final chromosome, which has no following header
    if len(chrom_seq_list) != 0:
        glv.ref.refseq[last_chrom] = ''.join(chrom_seq_list)
        # debug print removed (the later revision keeps it commented out)
        #print("{},last_len={}".format(
        #    chrom, len(glv.ref.refseq[last_chrom])))

    log.info("read refseq done {}".format(
        utl.elapsed_time(time.time(), start)))

    # pickle the whole refseq dict for fast reload on later runs
    with open(glv.conf.ref_fasta_pickle, 'wb') as f:
        pickle.dump(glv.ref.refseq, f)
        log.info('dumped glv.ref.refseq->{}'.format(
            glv.conf.ref_fasta_pickle))
def _read_fasta_first(self): ''' ''' # read fai and set dictionary glv.conf.ref_fasta_chrom_dict_list, \ glv.conf.ref_fasta_chrom_list, \ glv.conf.ref_fasta_chrom_region_list = \ self._get_fai_info() # for each chrom name log.info("read refseq by samtools faidx from fasta {}".format( glv.conf.ref_fasta_path)) start = time.time() chrom_seq_list = [] last_chrom = '' for chrom in glv.conf.ref_fasta_chrom_list: # get sequence from samtools command cmd1 = "samtools faidx {} {}".format(glv.conf.ref_fasta_path, chrom) cmd_list = cmd1.split(' ') # log.info("{}".format(cmd_list)) # using command output by pipe, get sequence into python proc = sbp.Popen(cmd_list, stdout=sbp.PIPE, stderr=sbp.PIPE) # got bytes (b'') for byte_line in proc.stdout: # bytes to str, strip \n b_line = byte_line.decode().strip() #print("{}={}(top)".format(chrom, len(chrom_seq_list))) # fasta header if b_line.startswith('>'): # not the first time if len(chrom_seq_list) != 0: # dictionary glv.ref.refseq[last_chrom] = ''.join(chrom_seq_list) chrom_seq_list = [] continue else: # append to list chrom_seq_list.append(b_line) #print("{}={}(append)".format(chrom, len(chrom_seq_list))) last_chrom = chrom if len(chrom_seq_list) != 0: glv.ref.refseq[last_chrom] = ''.join(chrom_seq_list) #print("{},last_len={}".format( # chrom, len(glv.ref.refseq[last_chrom]))) log.info("read refseq done {}\n".format( utl.elapsed_time(time.time(), start))) # pickle with open(glv.conf.ref_fasta_pickle, 'wb') as f: pickle.dump(glv.ref.refseq, f) log.info('dumped glv.ref.refseq->{}'.format(glv.conf.ref_fasta_pickle))
def _get_fai_info(self): ''' ''' # glv.conf.ref_fasta_fai #chr01 43270923 7 60 61 #chr02 35937250 43992120 60 61 #chr03 36413819 80528332 60 61 #chr04 35502694 117549055 60 61 #chr05 29958434 153643468 60 61 #chr06 31248787 184101217 60 61 #chr07 29697621 215870825 60 61 #chr08 28443022 246063414 60 61 #chr09 23012720 274980494 60 61 #chr10 23207287 298376767 60 61 #chr11 29021106 321970850 60 61 #chr12 27531856 351475649 60 61 # get chrom list from fai text df_fai = pd.read_csv(glv.conf.ref_fasta_fai, sep='\t', header=None, index_col=None) ref_fasta_chrom_dict_list = list() ref_fasta_chrom_list = list() ref_fasta_chrom_region_list = list() for row in df_fai.itertuples(): chrom_dict = dict() chrom_dict = { 'chrom': row[1], 'start': 1, 'end': row[2], 'length': row[2] } ref_fasta_chrom_dict_list.append(chrom_dict) ref_fasta_chrom_list.append(row[1]) region = "{}:{}-{}".format(row[1], 1, row[2]) ref_fasta_chrom_region_list.append(region) log.info( "ref_fasta_chrom_dict_list={}.".format(ref_fasta_chrom_dict_list)) log.info("ref_fasta_chrom_list={}.".format(ref_fasta_chrom_list)) log.info("fai {}, chrom cnt={}".format(glv.conf.ref_fasta_fai, len(ref_fasta_chrom_list))) pd_json = pd.json_normalize(ref_fasta_chrom_dict_list) log.info("\n{}\n".format(pd_json)) # exist or not, vcf_sample_name_file if os.path.isfile(glv.conf.ref_fasta_chrom_txt): log.info("found. {}".format(glv.conf.ref_fasta_chrom_txt)) else: # write to vcf_sample_name_file with open(glv.conf.ref_fasta_chrom_txt, mode='w') as f: f.write("{}\n".format(pd_json)) log.info("save. {}".format(glv.conf.ref_fasta_chrom_txt)) ''' Creating dataframe by converting dict to list of items ''' if glv.conf.show_fasta == True: log.info("only show_fasta mode, exit.") log.info("program finished {}\n".format( utl.elapsed_time(time.time(), glv.now_epochtime))) sys.exit(1) return ref_fasta_chrom_dict_list, \ ref_fasta_chrom_list, \ ref_fasta_chrom_region_list
def construct_primer(self):
    """Run primer3 design plus BLAST checks for every marker row of
    every distinguish-group pair and append primer candidates to each
    pair's primer file. Honors the user-specified progress/stop points.
    """
    proc_name = "primer"
    log.info("-------------------------------")
    log.info("Start processing {}\n".format(proc_name))

    # stop, action, gothrough
    ret_status = utl.decide_action_stop(proc_name)
    if ret_status == "stop":
        msg = "STOP. "
        msg += "Current process \'{}\' ".format(proc_name)
        msg += "has exceeded the User-specified stop point "
        msg += "\'{}', ".format(glv.conf.stop)
        msg += "so stop program. exit."
        log.info(msg)
        sys.exit(1)
    elif ret_status == "gothrough":
        msg = "SKIP \'{}\' proc, ".format(proc_name)
        msg += "glv.conf.progress = {}, ".format(glv.conf.progress)
        msg += "glv.conf.stop = {}, ".format(glv.conf.stop)
        msg += "so skip program."
        log.info(msg)
        return

    # for each distinguish_groups
    for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

        # logging current target
        utl.print_distin_info("primer", distin_dict, proc_cnt)

        marker_file = distin_dict['marker']['out_path']
        df_distin = pd.read_csv(
            marker_file, sep='\t', header=0, index_col=None)

        # back up an existing output file before appending
        out_txt_file = distin_dict['primer']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        with open(out_txt_file, mode='a') as f:
            # write header
            #f.write("{}\n".format(distin_dict['primer']['hdr_text']))
            start = time.time()

            if glv.conf.parallel == True:
                log.info("do Parallel cpu {}, parallel {} blast {}".format(
                    glv.conf.thread,
                    glv.conf.parallel_blast_cnt,
                    glv.conf.blast_num_threads))

                # threading backend: all workers share the open handle f
                Parallel(
                    n_jobs=glv.conf.parallel_blast_cnt,
                    backend="threading")(
                    [
                        delayed(self._loop_primer3_check_blast) \
                            (distin_dict, marker_df_row, f) \
                        for marker_df_row in df_distin.itertuples()
                    ])
            else:
                log.info("do Serial cpu {} / serial {} blast {}".format(
                    glv.conf.thread, 1, glv.conf.blast_num_threads))

                for marker_df_row in df_distin.itertuples():
                    self._loop_primer3_check_blast(
                        distin_dict, marker_df_row, f)

        # sort the finished file by genomic position and try counters
        utl.sort_file(
            'primer', distin_dict, out_txt_file,
            'chrom', 'pos', 'try_cnt', 'number')

        log.info("primer {} > {}.txt\n".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['primer']['base_nam']))
def choice_variables(self):
    '''
    decide variable values

    Resolve each configuration value (command-line param vs ini file)
    via _value_choice, validate the required ones, and derive every
    dependent path/setting on self. Exits the program on missing or
    invalid required values, and in show_samples-only mode.
    '''

    # print param and ini variables
    self._print_param_ini()

    # for debug
    self.analyse_caps = self._value_choice('analyse_caps')

    # out_dir ---------------------------------------------
    self.user_out_dir = self._value_choice('out_dir')
    self.out_dir_path = utl.full_path(self.user_out_dir)

    # vcf -------------------------------------------------
    self.user_vcf_file = self._value_choice('vcf')
    self.user_vcf_file_path = utl.full_path(self.user_vcf_file)

    # ref -------------------------------------------------
    self.user_ref_fasta = self._value_choice('ref')
    self.user_ref_fasta_path = utl.full_path(self.user_ref_fasta)

    # thread ----------------------------------------------
    self.thread = self._value_choice('thread')

    # out_dir, vcf and ref are all mandatory
    if self.out_dir_path == "" or \
        self.user_vcf_file_path == "" or \
        self.user_ref_fasta_path == "":

        err_mes = "out_dir={} and vcf={} and ref={} ".format(
            self.out_dir_path,
            self.user_vcf_file_path,
            self.user_ref_fasta_path)
        err_mes += "are all required. exit."
        log.error(err_mes)
        sys.exit(1)

    log.info("thread={}".format(self.thread))

    # out_dir ---------------------------------------------
    self.out_dir_path = utl.full_path(self.user_out_dir)
    self.log_dir_path = "{}/{}".format(self.out_dir_path, "logs")
    self.out_bak_dir_path = "{}/{}".format(self.out_dir_path, "bak")
    #pprint.pprint(self.conf_dict)

    # INI show_genotype (empty means default "gt"; must be known mode)
    self.show_genotype = self._value_choice('show_genotype')
    if self.show_genotype == "":
        self.show_genotype = "gt"

    if not self.show_genotype in glv.show_genotype_list:
        err_mes = "show_genotype is selected from one of "
        err_mes += ", ".join(glv.show_genotype_list)
        log.error("{}. exit.".format(err_mes))
        log.error("show_genotype={}".format(self.show_genotype))
        sys.exit(1)

    # ini_file --------------------------------------------
    # INI
    self.ini_version_user = self._value_choice('ini_version')
    self.ini_version_system = self.conf_dict['ini_version']['default']
    self.user_ini_file = self.conf_dict['ini_file']['param']
    self.ini_file_path = utl.full_path(self.user_ini_file)

    # ref_dir ---------------------------------------------
    self.ref_dir_path = utl.full_path("refs")
    # make ref_dir
    utl.makedirs(self.ref_dir_path)

    # out_curr_setting ------------------------------------
    self.curr_setting_file = "current_setting_ini.txt"
    self.curr_setting_file_path = "{}/{}".format(
        self.out_dir_path, self.curr_setting_file)

    # thread ----------------------------------------------
    self.use_joblib_threading = self._value_choice('use_joblib_threading')
    if not self.use_joblib_threading in ['yes', 'no']:
        err_mes = "use_joblib_threading Choose from Yes or No."
        log.error("{} exit.".format(err_mes))
        log.error("use_joblib_threading={}".format(
            self.use_joblib_threading))
        sys.exit(1)

    # thread adjust: derive joblib/blast worker counts
    self.parallel, \
        self.parallel_full_thread, \
        self.parallel_blast_cnt, \
        self.blast_num_threads \
        = self._thread_adjusting()

    # vcf -------------------------------------------------
    basename_user_vcf = os.path.basename(self.user_vcf_file_path)
    self.vcf_file_slink_system = "{}/{}{}".format(
        self.ref_dir_path, 'slink_', basename_user_vcf)
    # gtonly.gz
    self.vcf_file_path = "{}/{}{}".format(
        self.ref_dir_path, basename_user_vcf, "_GTonly.vcf.gz")
    # read
    self.prepare_vcf()

    # sample_nickname -------------------------------------
    basename_vcf_file = os.path.basename(self.vcf_file_path)
    self.vcf_sample_name_file = "{}/sample_name_{}.txt".format(
        self.ref_dir_path, basename_vcf_file)
    self.save_vcf_sample_name_txt()

    self.vcf_sample_nickname_list, \
        self.vcf_sample_basename_list, \
        self.vcf_sample_fullname_list, \
        self.vcf_sample_nickname_dict, \
        self.vcf_sample_basename_dict, \
        self.vcf_sample_fullname_dict, \
        self.group_members_vcf_str \
        = self.make_vcf_sample_variable()

    # illegal
    self.conf_dict['group_members']['default'] = \
        self.group_members_vcf_str

    # show_fasta-------------------------------------------
    self.show_fasta = self._value_choice('show_fasta')

    # show_samples-----------------------------------------
    self.show_samples = self._value_choice('show_samples')

    # Because it stops at show_fasta
    if self.show_fasta != True and self.show_samples == True:
        log.info("only show_samples mode, exit.")
        log.info("program finished {}\n".format(
            utl.elapsed_time(time.time(), glv.now_epochtime)))
        sys.exit(1)

    # pick_mode
    self.pick_mode = self._value_choice('pick_mode')

    # indel len: "min-max" string split into two ints
    self.indel_size = self._value_choice('indel_size')
    self.min_indel_len, self.max_indel_len = \
        [int(i) for i in self.indel_size.split('-')]

    # product size: same "min-max" format
    self.product_size = self._value_choice('product_size')
    self.min_product_size, self.max_product_size = \
        [int(i) for i in self.product_size.split('-')]

    # ref -------------------------------------------------
    # It will be set in main later
    # glv.ref = glv.ref.prepare_ref()
    self.ref_fasta_slink_system = ""
    self.ref_fasta_path = ""
    self.ref_fasta_chrom_list = []
    self.ref_fasta_fai = ""
    self.ref_fasta_chrom_txt = ""
    self.ref_fasta_pickle = ""
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # enzyme
    self.enzyme_files_user_str = self._value_choice('enzyme_file')
    #self.enzyme_files_user_list = list()
    #self.enzyme_files_list = list()
    self.enzyme_str = self._value_choice('enzyme')
    #self.enzyme_name_list = list()

    # start stop
    self.progress = self._value_choice('progress')
    self.stop = self._value_choice('stop')

    # primer3
    self.fragment_pad_len = self._value_choice('fragment_pad_len')
    self.p3_params_file = self._value_choice('p3_params')

    # blast
    self.blast_distance = self._value_choice('blast_distance')
    # not set now
    self.blast_word_size = 0
    self.blastdb_title = ""
    self.blastdb = ""

    # region group member string ---------------------------------
    # select string by priority, next make dict or list variables
    self.regions_str = self.set_regions_str()
    self.group_members_str = self.set_group_members_str()
    self.distinguish_groups_str = self.set_distinguish_groups_str()
def design_marker(self):
    """Evaluate each picked variant as a potential marker for every
    distinguish-group pair and write primer3-ready marker information
    to each pair's marker file. Honors the progress/stop settings.
    """
    self.enzyme_name_list = glv.conf.enzyme_name_list

    proc_name = "marker"
    log.info("-------------------------------")
    log.info("Start processing {}\n".format(proc_name))

    # stop, action, gothrough
    ret_status = utl.decide_action_stop(proc_name)
    if ret_status == "stop":
        msg = "STOP. "
        msg += "Current process \'{}\' ".format(proc_name)
        msg += "has exceeded the User-specified stop point "
        msg += "\'{}', ".format(glv.conf.stop)
        msg += "so stop program. exit."
        log.info(msg)
        sys.exit(1)
    elif ret_status == "gothrough":
        msg = "SKIP \'{}\' proc, ".format(proc_name)
        msg += "glv.conf.progress = {}, ".format(glv.conf.progress)
        msg += "glv.conf.stop = {}, ".format(glv.conf.stop)
        msg += "so skip program."
        log.info(msg)
        return

    # Design a fragment sequence for primer3
    for proc_cnt, distin_dict in enumerate(glv.outlist.distin_files, 1):

        # logging current target
        utl.print_distin_info("marker", distin_dict, proc_cnt)

        # read variant file
        variant_file = distin_dict['variant']['out_path']
        log.info("variant_file {}".format(variant_file))
        df_distin = pd.read_csv(
            variant_file, sep='\t', header=0, index_col=None)

        # file name to write out result to text
        out_txt_file = distin_dict['marker']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        start = time.time()

        with open(out_txt_file, mode='a') as f:
            '''
            eval_variant.py
            class EvalVariant(object):
            def _check_effect_of_enzyme(
                self, seq_target, enzyme_name_list):

            http://biopython.org/DIST/docs/cookbook/Restriction.html
            biopython <= 1.76 for IUPACAmbiguousDNA()

            multi_site_seq = Seq(seq_target, IUPACAmbiguousDNA())
            rb = Restriction.RestrictionBatch(enzyme_name_list)
            Analong = Restriction.Analysis(rb, multi_site_seq)
            caps_ResTyp_dict = Analong.with_sites()

            This RestrictionBatch method sometimes returned slightly
            inaccurate results when executed in parallel.
            Therefore, parallel is not used now.
            '''
            # parallel branch deliberately disabled (see docstring above)
            #if glv.conf.parallel == True:
            if False:
                log.info("do Parallel cpu {} parallel {}".format(
                    glv.conf.thread, glv.conf.parallel_full_thread))

                Parallel(
                    n_jobs=glv.conf.parallel_full_thread,
                    backend="threading")(
                    [
                        delayed(self._loop_evaluate_for_marker)
                        (distin_dict, variant_df_row, f) \
                        for variant_df_row in df_distin.itertuples()
                    ])
            else:
                log.info("do Serial cpu 1")

                # each variant
                for variant_df_row in df_distin.itertuples():
                    # Determine if the variant can be used as a marker.
                    # For those that can be marked, prepare the
                    # information for primer3.
                    self._loop_evaluate_for_marker(
                        distin_dict, variant_df_row, f)

        utl.sort_file(
            'marker', distin_dict, out_txt_file,
            'chrom', 'pos', 'marker_info', 'string')

        log.info("marker {} > {}.txt\n".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['marker']['base_nam']))
def _iterate_vcf(self, vcf_ittr, distin_dict, proc_cnt):
    """Iterate VCF records for one distinguish-group pair, pick the
    distinguishing variants, and write them — together with per-sample
    allele_int columns — to the pair's variant text file.

    Args:
        vcf_ittr: iterator over VCF records of the target region.
        distin_dict: per-pair settings (group names at keys 0 and 1,
            'region', 'pick_mode', 'indel_size', 'variant' file info).
        proc_cnt: 1-based index of this pair, used for progress logging.
    """
    # basic informations
    gr_list = [distin_dict[0], distin_dict[1]]
    reg = distin_dict['region']
    reg_dict = glv.conf.regions_dict[reg]  # NOTE(review): unused below
    pick_mode = distin_dict['pick_mode']   # NOTE(review): unused below
    indel_size = distin_dict['indel_size']
    # "min-max" string -> two ints
    min_indel_len, max_indel_len = \
        [int(i) for i in indel_size.split('-')]

    # At first, we check difference of genotype between two sample
    # that described at the beginning of each group
    top_smpl_list = [
        glv.conf.group_members_dict[gr_list[0]][0],
        glv.conf.group_members_dict[gr_list[1]][0]]

    # logging current target
    utl.print_distin_info("variant", distin_dict, proc_cnt)

    start = time.time()

    # File name to export variant
    out_txt_file = distin_dict['variant']['out_path']
    utl.save_to_tmpfile(out_txt_file)

    #------------------------------------------------------
    # To add an allele_int column for all sample
    # Members of the specified group come first
    # gr0:s1 g0:s2 g0:s3 g1:s4 g1:s5 g1:s6 s7 s8 s9 s10
    sample_nickname_ordered_list, \
        sample_fullname_ordered_list = \
        utl.get_ordered_sample_list(gr_list)

    sample_added_header = "{}\t{}".format(
        distin_dict['variant']['hdr_text'],
        "\t".join(sample_nickname_ordered_list))

    # Can I parallelize here?
    with open(out_txt_file, mode='a') as f:

        # write sample added header
        f.write("{}\n".format(sample_added_header))

        # access to vcf using iterater
        for record in vcf_ittr:

            # 1. Skip same GT between top two sample
            if self._skip_same_GT_between_top2sample(
                record, top_smpl_list) > 0:
                continue

            # 2. Check GT in your own group
            if self._skip_different_GT_in_own_group(
                record, top_smpl_list, gr_list) > 0:
                continue

            # 3. Select different allele combination among 2x2 allele
            asel = AlleleSelect(min_indel_len, max_indel_len)
            asel.select_diff_allele(record, top_smpl_list, gr_list)

            # from record, construct allele_int of the member
            # who is paying attention
            allele_int_line = ""

            # 4. Save variant information as text file
            for var_type, line in zip(asel.var_types, asel.lines):
                if utl.is_my_pick_mode(
                    var_type, distin_dict['pick_mode']) == True:

                    # make allele_int line lazily, once per record
                    if allele_int_line == "":
                        #self._get_ai_line(
                        allele_int_line = \
                            self._get_allele_line(
                                record, sample_fullname_ordered_list)

                    # add allele line
                    f.write("{}\t{}\n".format(line, allele_int_line))

    log.info("variant {} > {}.txt\n".format(
        utl.elapsed_time(time.time(), start),
        distin_dict['variant']['base_nam']))
def print_allele(self):
    '''
    When show_genotype is specified, the genotype of the specified
    regions and members are output to a file.

        main
            variant.py print_allele
                allele_select.py cls allele_int
    '''
    proc_name = "genotype"
    log.info("-------------------------------")
    log.info("Start processing {}\n".format(proc_name))

    # header
    header = list()
    header += ["CHROM", "POS", "Rlen", "Alen", "diff", "REF", "ALT"]
    header += glv.conf.group_members_dict['all']

    # reader
    reader = vcfpy.Reader.from_path(glv.conf.vcf_file_path)

    total_cnt = len(glv.conf.region_name_list)

    # Save to file for each region
    for proc_cnt, region_name in enumerate(glv.conf.region_name_list, 1):

        region = glv.conf.regions_dict[region_name]['reg']

        # Create a list of fullname for the specified members
        sample_fullname_list = [
            utl.get_fullname(nickname)
            for nickname in glv.conf.group_members_dict['all']]

        # out file name
        outf_pref = "005_genotype"
        basename = "{}~{}~{}".format(
            outf_pref, region_name, glv.conf.show_genotype)
        out_file_path = "{}/{}.txt".format(
            glv.conf.out_dir_path, basename)

        # back up an existing file before overwriting
        utl.save_to_tmpfile(out_file_path)

        log.info("")
        log.info("{} / {}, {}({}) > {}".format(
            proc_cnt, total_cnt, region_name, region, out_file_path))
        start = time.time()

        with open(out_file_path, mode='w') as f:
            f.write("{}\n".format('\t'.join(map(str, header))))

            vcf_ittr = reader.fetch(region)
            for record in vcf_ittr:

                # Main informations
                line = [record.CHROM, record.POS]
                alt_list = [alt.value for alt in record.ALT]

                # variant length and diff
                len_ref = len(record.REF)
                lens_alt_list = [len(alt) for alt in alt_list]
                # NOTE(review): diff considers only the first ALT allele
                diff_len = abs(len_ref - lens_alt_list[0])
                lens_alt = ",".join(map(str, lens_alt_list))

                line += [len_ref]
                line += [lens_alt]
                line += [diff_len]
                line += [record.REF]
                line += [",".join(alt_list)]

                # genotype column for every member sample
                line += [
                    AlleleSelect.allele_convert(
                        "{}/{}".format(
                            record.call_for_sample[fn].gt_alleles[0],
                            record.call_for_sample[fn].gt_alleles[1]),
                        glv.conf.show_genotype)
                    for fn in sample_fullname_list]

                f.write("{}\n".format('\t'.join(map(str, line))))

        # bug fix: out_file_path already ends in ".txt", so do not
        # append a second ".txt" in the log message
        log.info("genotype {} > {}\n".format(
            utl.elapsed_time(time.time(), start), out_file_path))
def design_marker(self):
    """Evaluate each variant as a potential marker and prepare
    primer3-ready fragment information for the usable ones, writing
    the result to each distinguish-group pair's marker file.
    """
    # progress check
    if utl.progress_check('marker') == False:
        # bug fix: message previously said "skip variant" in the marker step
        log.info("progress={} so skip marker.".format(glv.conf.progress))
        return
    log.info("Start processing {}".format('marker'))

    # Build fragment sequences for primer3.
    # for each distinguish_groups
    for distin_dict in glv.outlist.distin_files:

        # read variant file
        variant_file = distin_dict['variant']['out_path']
        log.info("variant_file {}".format(variant_file))
        df_distin = pd.read_csv(
            variant_file, sep='\t', header=0, index_col=None)

        # Bio.Restriction.Restriction_Dictionary
        self.enzyme.read_enzyme_file()

        # file name to write out result to text
        out_txt_file = distin_dict['marker']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        start = time.time()

        with open(out_txt_file, mode='a') as f:
            # write header
            #f.write("{}\n".format(distin_dict['marker']['hdr_text']))

            if glv.conf.parallel == True:
                # bug fix: attribute was misspelled "parallele_full_thread";
                # conf defines parallel_full_thread (see choice_variables)
                log.info("do Parallel cpu {} parallel {}".format(
                    glv.conf.thread, glv.conf.parallel_full_thread))

                Parallel(
                    n_jobs=glv.conf.parallel_full_thread,
                    backend="threading")(
                    [
                        delayed(self._loop_evaluate_for_marker)
                            (distin_dict, variant_df_row, f)
                        for variant_df_row in df_distin.itertuples()
                    ])
            else:
                log.info("do Serial cpu 1")

                # each variant: decide whether it can serve as a marker;
                # for usable ones, prepare the information for primer3
                for variant_df_row in df_distin.itertuples():
                    self._loop_evaluate_for_marker(
                        distin_dict, variant_df_row, f)

        utl.sort_file(
            'marker', distin_dict, out_txt_file,
            'chrom', 'pos', 'marker_info', 'string')

        log.info("marker {} {}".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['marker']['base_nam']))