def MC_sample_matrix_stdlone(sim,out_db= 'out_db.txt',min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt',
                             outemp= 'ind_assignments{}.txt',chrom_idx= 0, prop_gen_used= 1, sim_dir= 'mutation_counter/data/sims/',
                             segregating= False, scale_genSize= False, outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False,
                             sample_sim= 0, collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False, sim_del= 'C', tag_sim= '_ss',
                             genome_size= 1,haps_extract= False, return_private= True):
    '''
    Stand-alone mutation-count extraction for a single simulation `sim`.

    Reads the simulation VCF (VCF_read_filter), finds each population's
    private alleles (parse_PA), counts k-mer mutation contexts restricted to
    those private sites (count_popKmers), and appends one tab-separated row
    per population to the database file `out_db` (created with a header line
    when absent).  When ind_assignment_scatter_v1 returns sub-sampled
    population tags, a row is also written for every tagged subset.

    Always returns '' — results are persisted to `out_db`, not returned.

    NOTE(review): a function with this same name is redefined later in this
    file; that later definition shadows this one at import time.
    '''
    ### prepare sim-specific db
    ### mutation labels: enumerate the k-mer mutation types for the header
    mutations= get_mutations(bases= bases,ksize= ksize)
    kmers, kmer_idx= kmer_comp_index(mutations)
    mut_lib= kmer_mut_index(mutations)  # NOTE(review): unused below

    if collapsed:
        # one label per strand-collapsed k-mer class
        labels= [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
    else:
        labels= ['_'.join(x) for x in mutations]

    # write: create the db file with a header line if it does not exist yet
    out_db= out_db.format(sim)
    db_dir= out_db.split('/')
    db_name= db_dir[-1]
    db_dir= '/'.join(db_dir[:-1])+'/'
    db_neigh= os.listdir(db_dir)

    header= ['SIM','POP','N'] + labels
    header= '\t'.join(header) + '\n'

    if db_name not in db_neigh:
        with open(out_db,'w') as fp:
            fp.write(header)
    ###
    ###
    ti= time.time()

    ## chromosome label parsed from the simulation name
    chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

    pop_dict, inds= get_pop_dict(sim,dir_sim= sim_dir,indfile= indfile,haps_extract= haps_extract, return_inds= True)

    if haps_extract:
        # haplotypes: two rows per individual, so duplicate the id list
        inds= list(inds) + list(inds)

    inds= np.array(inds)
    print(len(inds))
    print(inds[:10])

    total_inds= sum([len(x) for x in pop_dict.values()])

    if exclude:
        files= read_exclude()
    else:
        files= {}  # NOTE(review): `files` is never used afterwards

    data_kmer= {}
    tags= []          # NOTE(review): tags / sim_extend / chroms stay empty
    sim_extend= []
    chroms= []

    ### read vcf
    t0= time.time()

    Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
                  collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
                  indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

    # per-site index of the maximal entry in the mutation matrix
    mut_idx= np.argmax(mut_matrix,axis= 0)

    tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile, inds= inds, pop_dict= pop_dict,pop_sub= True)

    t1= time.time()
    read_time= t1- t0  # NOTE(review): unused below

    # bail out when the VCF is empty or has fewer rows than expected
    if not len(Window) or Window.shape[0] < total_inds:
        return ''

    ## counts for no tag sim:
    s0= time.time()

    # private alleles per population within the requested frequency range
    PA_dict= parse_PA(Window, pop_dict,frequency_range= frequency_range)

    t2= time.time()
    print('time elapsed ref: {} m'.format((t2 - t1)/60))

    # restrict each population's genotype window to its private sites
    new_wind= {}
    for pop in pop_dict.keys():
        klist= list(pop_dict[pop])
        private= list(PA_dict[pop])
        pop_wind= Window[klist,:]
        pop_wind= pop_wind[:,private]
        ##
        new_wind[pop]= {
            'array': pop_wind,
            'mut': [mut_idx[x] for x in private]
        }
    ##
    #
    # reference (un-tagged) counts: one db row per population
    for pop in pop_dict.keys():
        dict_pop= {pop: list(range(len(pop_dict[pop])))}
        pop_summary, dummy= count_popKmers(new_wind[pop]['array'], mut_matrix, new_wind[pop]['mut'], dict_pop,
                                        row=row,col=col)

        nline= [sim,pop,len(pop_dict[pop])]
        ncounts= pop_summary['counts'][pop]
        ncounts= ncounts.reshape(1,np.prod(ncounts.shape))[0]  # flatten the count grid to one row
        nline= nline + list(ncounts)
        nline= [str(x) for x in nline]

        with open(out_db,'a') as fp:
            fp.write('\t'.join(nline) + '\n')

    t3= time.time()
    print('time elapsed ref PA: {} m'.format((t3 - t2)/60))

    if len(tag_list):
        ###
        print('len tag list: {}'.format(len(tag_list)))
        ###
        # tagged population subsets: recount on the source population's
        # private-site window
        for idx in range(len(tag_list)):
            tag= tag_list[idx]
            pop_dict= tag_dict[tag]
            pop_ori= list(pop_dict.keys())[0]
            if tag_sim in pop_ori:
                # slice off the first len(tag_sim) characters to recover the
                # source population name (assumes tag_sim is a prefix — TODO confirm)
                pop_ori= pop_ori[len(tag_sim):].split('.')[0]

            pop_summary, dummy= count_popKmers(new_wind[pop_ori]['array'], mut_matrix, new_wind[pop_ori]['mut'], pop_dict,
                                            row=row,col=col,segregating= True,single= True)

            for pop in pop_summary['counts'].keys():
                nline= [sim,pop,len(pop_dict[pop])]
                ncounts= pop_summary['counts'][pop]
                ncounts= ncounts.reshape(1,np.prod(ncounts.shape))[0]
                nline= nline + list(ncounts)
                nline= [str(x) for x in nline]

                with open(out_db,'a') as fp:
                    fp.write('\t'.join(nline) + '\n')

    return ''
def MC_sample_matrix_dict(pop_names, pop_lengths,min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt',
                          outemp= 'ind_assignments{}.txt',chrom_idx= 0, count_dir= './count/',
                          dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/',
                          muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, genome_size= 1,
                          outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2,
                          freq_extract= False, sim_del= 'C', distances= 'PCA', Lsteps= 1,scale_genSize= False,prop_gen_used= 1,return_private= True):
    '''
    launch mutation counter pipeline on manipulated population assignments.
    Use matrix multiplication to extract counts.
    - v1 relies on count_popKmers() function to count mutations per pop.
      allows freq. filter and single mutation count.

    For each sampled simulation, counts are extracted over a series of
    window lengths (Lsteps values) for every tagged population assignment,
    keyed into data_kmer as '<sim><tag>.<length-tag>'.

    Returns (data_kmer, data_freqs).

    NOTE(review): this function references several names that are defined
    nowhere in its scope and will raise NameError as written:
      - `haps_extract` (used in the VCF_read_filter and
        ind_assignment_scatter_v1 calls, but not a parameter here),
      - `refseq` and `subset_summary` (used to build Window_lengths and the
        per-length cut points),
      - `time_mut` and `read_time` (used in the print_summ block).
    The `pop_names` and `pop_lengths` parameters are also never used.
    These need an upstream fix (missing parameters or module globals) before
    this function can run; left untouched here.
    '''
    ti= time.time()

    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    tags= []
    sim_extend= []
    chroms= []

    data_kmer= {}
    data_freqs= {}

    #sim_sample= np.random.choice(sims,8,replace= False)
    if sample_sim == 0:
        sample_sim= len(sims)
    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome label parsed from the simulation name
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        if exclude:
            files= read_exclude()
        else:
            files= {}  # NOTE(review): `files` is never used afterwards

        ### read vcf
        t0= time.time()
        # NOTE(review): `haps_extract` is undefined in this function's scope.
        Window, mut_matrix, scale= VCF_read_filter( sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
                      collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp, indfile= indfile,
                      diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy )

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                              min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum([len(x) for x in pop_dict.values()])

        # skip simulations with an empty or truncated genotype window
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        s0= time.time()

        t1= time.time()
        count_time= t1- t0

        if len(tag_list):
            ###
            sim_extend.extend([sim]*len(tag_list))
            chroms.extend([chrom]*len(tag_list))
            ###
            # NOTE(review): `refseq` is undefined here.
            Window_lengths= np.linspace(1,len(refseq),Lsteps,dtype= int)
            ###
            for idx in range(len(tag_list)):
                seq_idx= 0
                present_state= 0

                for snp_n in Window_lengths:
                    if snp_n < 10:
                        # tiny target: use the full window
                        lrange= list(range(Window.shape[1]))
                        tag_l= 'full'
                    else:
                        # advance seq_idx until the cumulative position
                        # reaches snp_n
                        # NOTE(review): `subset_summary` is undefined here.
                        while present_state < snp_n:
                            if seq_idx >= (subset_summary.shape[0]-1):
                                present_state= len(refseq)
                                seq_idx= subset_summary.shape[0]-1
                            else:
                                present_state= subset_summary['POS'][seq_idx]
                                seq_idx += 1

                        lrange= list(range(seq_idx))
                        tag_l= str(snp_n * scale)

                    # window-length suffix appended to the assignment tag
                    tag_here= tag_list[idx] + '.' + tag_l
                    tags.append(tag_here)
                    ##
                    tag= tag_list[idx]
                    #
                    new_sim= sim + tag_here
                    pop_dict= tag_dict[tag]
                    #########
                    #########
                    pop_summary, PA_dict= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                                    frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                                    return_private= return_private)
                    data_kmer[new_sim]= pop_summary

                    if return_private:
                        # recount restricted to the private alleles found above
                        pop_summary, dummy= count_popKmers(Window[:,lrange], mut_matrix[:,lrange], pop_dict, single= single, prop_gen_used= prop_gen_used,
                                                        frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                                        PA= PA_dict)
                        data_kmer[new_sim]= pop_summary

                    if freq_extract:
                        pop_freqs= pop_dict_SFS(Window,pop_dict)
                        data_freqs[new_sim]= pop_freqs

                    if distances:
                        data_kmer[new_sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

        if print_summ:
            # NOTE(review): `time_mut` and `read_time` are undefined here.
            print('mut_matrix time: {} s'.format(time_mut / 60))
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti
    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
def MC_sample_matrix_v1(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt',
                        outemp= 'ind_assignments{}.txt',chrom_idx= 0, prop_gen_used= 1, count_dir= './count/',
                        dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/',
                        muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, scale_genSize= False,
                        outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2,
                        freq_extract= False, sim_del= 'C', genome_size= 1,haps_extract= False, return_private= True):
    '''
    launch mutation counter pipeline on population assignments.
    Use matrix multiplication to extract counts.
    - v1 relies on count_popKmers() function to count mutations per pop.
      allows freq. filter and single mutation count.

    Iterates over a random sample of the simulations found in `sim_dir`
    (all of them when sample_sim == 0), counts k-mer mutation contexts per
    population, and repeats the count for every tagged population
    assignment returned by ind_assignment_scatter_v1.

    Returns
    -------
    data_kmer : dict
        '<sim>' and '<sim><tag>' -> count summary from count_popKmers.
    data_freqs : dict
        Same keys -> per-population SFS (only when freq_extract is True).
    '''
    ti= time.time()

    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    # bookkeeping of every processed (sim, tag, chrom) triple
    tags= []
    sim_extend= []
    chroms= []

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)
    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome label parsed from the simulation name
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        t0= time.time()
        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
                      collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
                      indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                              min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum([len(x) for x in pop_dict.values()])

        t1= time.time()
        read_time= t1- t0

        # skip simulations with an empty or truncated genotype window
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for the un-tagged simulation:
        s0= time.time()

        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                        frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                        return_private= return_private)
        data_kmer[sim]= pop_summary

        if return_private:
            # recount restricted to the private alleles found above
            pop_summary, dummy= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                            frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                            PA= PA_dict)
            data_kmer[sim]= pop_summary

        if freq_extract:
            pop_freqs= pop_dict_SFS(Window,pop_dict)
            data_freqs[sim]= pop_freqs

        t1= time.time()
        count_time= t1- t0

        if len(tag_list):
            ###
            sim_extend.append(sim)
            tags.append('')
            chroms.append(chrom)
            # FIX: extend the bookkeeping lists exactly once per simulation.
            # The original ran these extends inside the idx loop below,
            # duplicating every entry len(tag_list) times per simulation.
            # tags[idx] reads below are unaffected (the first extension
            # already covers every index the loop touches).
            sim_extend.extend([sim]*len(tag_list))
            tags.extend(tag_list)
            chroms.extend([chrom]*len(tag_list))
            ###
            for idx in range(len(tag_list)):
                ##
                tag= tag_list[idx]
                new_sim= sim + tag
                pop_dict= tag_dict[tag]

                # NOTE(review): the first iteration counts on the un-tagged
                # summary's 'array'; later iterations reuse the previous
                # iteration's result since pop_summary is rebound each pass
                # — confirm this chaining is intended rather than Window.
                pop_summary, dummy= count_popKmers(pop_summary['array'], mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                                frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                                PA= PA_dict,counted= True)
                data_kmer[new_sim]= pop_summary

                if freq_extract:
                    pop_freqs= pop_dict_SFS(Window,pop_dict)
                    data_freqs[new_sim]= pop_freqs

        if print_summ:
            # FIX: dropped the 'mut_matrix time' line — it referenced an
            # undefined name (time_mut) and always raised NameError.
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf= time.time()
    time_elapsed= tf - ti
    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
def MC_sample_matrix_simple(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt',
                            outemp= 'ind_assignments{}.txt', count_dir= './count/',
                            dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/',
                            muted_dir= 'mutation_counter/data/mutation_count/', segregating= False,
                            outlog= 'indy.log', row= 24,col= 4, single= False, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2,
                            freq_extract= False, distances= 'PCA',prop_gen_used= 1, scale_genSize= False, return_private= True,haps_extract= True,
                            chrom_idx= 0, sim_del= 'C'):
    '''
    launch mutation counter pipeline on population assignments (no tag
    handling).  Use matrix multiplication to extract counts.
    - relies on count_popKmers() function to count mutations per pop.
      allows freq. filter and single mutation count.

    For each sampled simulation (all of them when sample_sim == 0) the
    per-population k-mer counts are stored in data_kmer[sim]; pairwise
    PCA distances are attached under 'pairDist' when `distances` is truthy,
    and per-population SFS go to data_freqs[sim] when freq_extract is True.

    Parameters (new, backward-compatible)
    -------------------------------------
    chrom_idx : int
        Index of the '.'-separated field of the simulation name that holds
        the chromosome label (was hard-coded to 0).
    sim_del : str
        Delimiter inside that field preceding the chromosome label (was
        hard-coded to 'C').  Defaults reproduce the previous behaviour and
        match the sibling MC_sample_matrix_* functions.

    Returns
    -------
    (data_kmer, data_freqs) : tuple of dict
    '''
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)
    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome label parsed from the simulation name
        # (previously sim.split('.')[0].split('C')[-1] — now parameterized)
        chrom= sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        Window, mut_matrix, scale= VCF_read_filter(sim, sim_dir= sim_dir,chrom= chrom,haps_extract= haps_extract, scale_genSize= scale_genSize,
                      collapsed= collapsed,min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,
                      indfile= indfile,diffs= diffs,bases= bases, ksize= ksize, ploidy= ploidy)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                              min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum([len(x) for x in pop_dict.values()])

        # skip simulations with an empty or truncated genotype window
        if not len(Window) or Window.shape[0] < total_inds:
            continue

        ## counts for the simulation:
        pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                        frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                        return_private= return_private)
        data_kmer[sim]= pop_summary

        if return_private:
            # recount restricted to the private alleles found above
            pop_summary, dummy= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                                            frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                                            PA= PA_dict)
            data_kmer[sim]= pop_summary

        if freq_extract:
            pop_freqs= pop_dict_SFS(Window,pop_dict)
            data_freqs[sim]= pop_freqs

        if distances:
            data_kmer[sim]['pairDist']= pop_distances_PCA(Window,pop_dict)

    # NOTE: dead locals of the original (ti, tags, sim_extend, chroms,
    # chromosomes, chromosome_groups, t0, s0) were removed — none were read.
    return data_kmer, data_freqs
def MC_sample_matrix_stdlone(sim, out_db='out_db.txt', min_size=80, samp=[5, 20, 10], stepup="increment", diffs=False,
                             frequency_range=[0, 1], indfile='ind_assignments.txt', outemp='ind_assignments{}.txt',
                             chrom_idx=0, prop_gen_used=1, sim_dir='mutation_counter/data/sims/', segregating=False,
                             scale_genSize=False, outlog='indy.log', row=24, col=4, single=False, exclude=False,
                             print_summ=False, sample_sim=0, collapsed=True, bases='ACGT', ksize=3, ploidy=2,
                             freq_extract=False, sim_del='C', tag_sim='_ss', genome_size=1, haps_extract=False,
                             return_private=True):
    '''
    Stand-alone per-individual mutation-count dump for a single simulation.

    Reads the simulation VCF, counts k-mer mutation contexts per population
    (count_popKmers, optionally recounted on private alleles), then writes
    one row per individual to `out_db.format(sim)`: simulation name,
    population, individual id, followed by the individual's flattened count
    vector.  The file is rewritten ('w' mode) on every call.

    Always returns '' — results are persisted to the db file.

    NOTE(review): this definition shadows the earlier function of the same
    name in this file; only this one is visible after import.
    NOTE(review): the header labels the third column 'N' while the rows
    store the individual id there — confirm the intended schema.
    '''
    ## chromosome label parsed from the simulation name
    chrom = sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

    pop_dict, inds = get_pop_dict(sim, dir_sim=sim_dir, indfile=indfile, haps_extract=haps_extract, return_inds=True)

    if haps_extract:
        # haplotypes: two rows per individual, duplicate the id list to match
        inds = list(inds) + list(inds)

    inds = np.array(inds)
    print(len(inds))
    print(inds[:10])

    total_inds = sum([len(x) for x in pop_dict.values()])

    if exclude:
        files = read_exclude()  # loaded for its side effects/filters; unused below
    else:
        files = {}

    data_kmer = {}

    ### read vcf
    t0 = time.time()

    Window, mut_matrix, scale = VCF_read_filter(sim, sim_dir=sim_dir, chrom=chrom, haps_extract=haps_extract,
                                                scale_genSize=scale_genSize, collapsed=collapsed, min_size=min_size,
                                                samp=samp, stepup=stepup, outemp=outemp, indfile=indfile,
                                                diffs=diffs, bases=bases, ksize=ksize, ploidy=ploidy)

    t1 = time.time()

    # bail out when the VCF is empty or has fewer rows than expected
    if not len(Window) or Window.shape[0] < total_inds:
        return ''

    ## counts:
    pop_summary, PA_dict = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                          frequency_range=frequency_range, row=row, col=col,
                                          segregating=segregating, scale=scale, return_private=return_private)

    t2 = time.time()
    print('time elapsed ref: {} m'.format((t2 - t1) / 60))

    if return_private:
        # recount restricted to the private alleles found above
        pop_summary, dummy = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                            frequency_range=frequency_range, row=row, col=col,
                                            segregating=segregating, scale=scale, PA=PA_dict)

    # FIX: assign unconditionally so the data_kmer[sim] lookup below cannot
    # fail with KeyError when return_private is False.
    data_kmer[sim] = pop_summary

    t3 = time.time()
    print('time elapsed ref PA: {} m'.format((t3 - t2) / 60))

    # build one string row per individual: [SIM, POP, ind_id, counts...]
    new_wind = []
    for pop in data_kmer[sim]['counts'].keys():
        pop_counts = np.array(pop_summary['array'][pop_dict[pop], :], dtype=int)

        block = np.zeros((pop_counts.shape[0], pop_counts.shape[1] + 3), dtype=int)
        block[:, 3:] = pop_counts

        block = np.array(block, dtype=str)
        block[:, 2] = inds[pop_dict[pop]]
        block[:, 1] = pop
        block[:, 0] = sim

        new_wind.append(block)

    new_wind = np.concatenate(tuple(new_wind), axis=0)

    ### mutation labels for the header
    mutations = get_mutations(bases=bases, ksize=ksize)
    kmers, kmer_idx = kmer_comp_index(mutations)

    if collapsed:
        # one label per strand-collapsed k-mer class
        labels = [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
    else:
        labels = ['_'.join(x) for x in mutations]

    header = '\t'.join(['SIM', 'POP', 'N'] + labels) + '\n'

    # FIX: the original conditionally created the file with a header and then
    # unconditionally re-opened it in 'w' mode, truncating and writing the
    # header a second time.  A single truncating write produces the same
    # on-disk result without the redundant write or the os.listdir call on
    # the target directory (which raised when the path had no parent dir).
    db_file = out_db.format(sim)
    with open(db_file, 'w') as fp:
        fp.write(header)
        fp.write('\n'.join(['\t'.join(x) for x in new_wind]))

    return ''