Example #1
0
def MC_sample_matrix_stdlone(sim, out_db= 'out_db.txt', min_size= 80, samp= None, stepup= "increment", diffs= False, frequency_range= None, indfile= 'ind_assignments.txt',
                    outemp= 'ind_assignments{}.txt', chrom_idx= 0, prop_gen_used= 1,
                    sim_dir= 'mutation_counter/data/sims/', segregating= False, scale_genSize= False,
                    outlog= 'indy.log', row= 24, col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,
                    collapsed= True, bases= 'ACGT', ksize= 3, ploidy= 2, freq_extract= False, sim_del= 'C', tag_sim= '_ss',
                    genome_size= 1, haps_extract= False, return_private= True):
    '''
    Stand-alone, single-simulation mutation counter.

    Reads one simulation's VCF, restricts each population to its private
    variants, counts mutation-type occurrences per population (and per
    manipulated/sub-sampled assignment tag), and appends one tab-separated
    row per population to the database file `out_db`.

    Returns '' always; the database file on disk is the real output.
    '''
    # avoid shared mutable default arguments
    if samp is None:
        samp = [5, 20, 10]
    if frequency_range is None:
        frequency_range = [0, 1]

    ### mutation labels: column header for the output database
    mutations = get_mutations(bases=bases, ksize=ksize)
    kmers, kmer_idx = kmer_comp_index(mutations)
    mut_lib = kmer_mut_index(mutations)

    if collapsed:
        # strand-collapsed: one representative label per complementary pair
        labels = [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
    else:
        labels = ['_'.join(x) for x in mutations]

    # create the database file with a header line unless it already exists
    out_db = out_db.format(sim)
    db_name = os.path.basename(out_db)
    # BUGFIX: a path without '/' previously resolved the directory to '/'
    # (filesystem root); check the file's own directory (cwd when bare name).
    db_dir = os.path.dirname(out_db) or '.'
    db_neigh = os.listdir(db_dir)

    header = '\t'.join(['SIM', 'POP', 'N'] + labels) + '\n'
    if db_name not in db_neigh:
        with open(out_db, 'w') as fp:
            fp.write(header)

    ti = time.time()

    ## chromosome tag parsed from the simulation name
    chrom = sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')
    pop_dict, inds = get_pop_dict(sim, dir_sim=sim_dir, indfile=indfile,
                                  haps_extract=haps_extract, return_inds=True)
    if haps_extract:
        # haplotype mode: each individual contributes two rows
        inds = np.array(list(inds) + list(inds))

    print(len(inds))
    print(inds[:10])

    total_inds = sum([len(x) for x in pop_dict.values()])

    # NOTE(review): the result of read_exclude() is never used below — kept for parity.
    if exclude:
        files = read_exclude()
    else:
        files = {}

    ### read vcf
    t0 = time.time()
    Window, mut_matrix, scale = VCF_read_filter(sim, sim_dir=sim_dir, chrom=chrom,
        haps_extract=haps_extract, scale_genSize=scale_genSize,
        collapsed=collapsed, min_size=min_size, samp=samp, stepup=stepup, outemp=outemp,
        indfile=indfile, diffs=diffs, bases=bases, ksize=ksize, ploidy=ploidy)

    # most likely mutation class per site
    mut_idx = np.argmax(mut_matrix, axis=0)

    tag_list, tag_dict, pop_dict = ind_assignment_scatter_v1(sim, dir_sim=sim_dir,
                      min_size=min_size, samp=samp, stepup=stepup, outemp=outemp, indfile=indfile,
                      inds=inds, pop_dict=pop_dict, pop_sub=True)

    t1 = time.time()
    read_time = t1 - t0
    if not len(Window) or Window.shape[0] < total_inds:
        # incomplete genotype matrix: nothing to count
        return ''

    ## population-private alleles
    s0 = time.time()
    PA_dict = parse_PA(Window, pop_dict, frequency_range=frequency_range)

    t2 = time.time()
    print('time elapsed ref: {} m'.format((t2 - t1) / 60))

    # restrict each population's window to its private variants
    new_wind = {}
    for pop in pop_dict.keys():
        klist = list(pop_dict[pop])
        private = list(PA_dict[pop])
        pop_wind = Window[klist, :]
        pop_wind = pop_wind[:, private]

        new_wind[pop] = {
            'array': pop_wind,
            'mut': [mut_idx[x] for x in private]
            }

    # per-population counts on private variants -> one database row each
    for pop in pop_dict.keys():
        dict_pop = {pop: list(range(len(pop_dict[pop])))}
        pop_summary, dummy = count_popKmers(new_wind[pop]['array'], mut_matrix, new_wind[pop]['mut'],
                                            dict_pop, row=row, col=col)

        nline = [sim, pop, len(pop_dict[pop])]
        ncounts = pop_summary['counts'][pop]
        ncounts = ncounts.reshape(1, np.prod(ncounts.shape))[0]
        nline = [str(x) for x in nline + list(ncounts)]
        with open(out_db, 'a') as fp:
            fp.write('\t'.join(nline) + '\n')

    t3 = time.time()
    print('time elapsed ref PA: {} m'.format((t3 - t2) / 60))

    if len(tag_list):
        print('len tag list: {}'.format(len(tag_list)))

        # repeat the counts for each manipulated (sub-sampled) assignment
        for idx in range(len(tag_list)):
            tag = tag_list[idx]
            pop_dict = tag_dict[tag]
            pop_ori = list(pop_dict.keys())[0]
            if tag_sim in pop_ori:
                # strip the sub-sample tag to recover the source population name
                pop_ori = pop_ori[len(tag_sim):].split('.')[0]

            pop_summary, dummy = count_popKmers(new_wind[pop_ori]['array'], mut_matrix,
                                                new_wind[pop_ori]['mut'], pop_dict,
                                                row=row, col=col, segregating=True, single=True)

            for pop in pop_summary['counts'].keys():
                nline = [sim, pop, len(pop_dict[pop])]
                ncounts = pop_summary['counts'][pop]
                ncounts = ncounts.reshape(1, np.prod(ncounts.shape))[0]
                nline = [str(x) for x in nline + list(ncounts)]
                with open(out_db, 'a') as fp:
                    fp.write('\t'.join(nline) + '\n')

    return ''
Example #2
0
def MC_sample_matrix_dict(pop_names, pop_lengths, min_size= 80, samp= None, stepup= "increment", diffs= False, frequency_range= None, indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt', chrom_idx= 0,
                    count_dir= './count/', dir_launch= '..', main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, genome_size= 1,
                    outlog= 'indy.log', row= 24, col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0, collapsed= True, bases= 'ACGT', ksize= 3, ploidy= 2, freq_extract= False, sim_del= 'C',
                    distances= 'PCA', Lsteps= 1, scale_genSize= False, prop_gen_used= 1, return_private= True, haps_extract= False):
    '''
    launch mutation counter pipeline on manipulated population assignments,
    repeated over a ladder of `Lsteps` window lengths.
    Use matrix multiplication to extract counts.
    - relies on count_popKmers() to count mutations per pop; allows freq. filter
      and single mutation count.

    Returns (data_kmer, data_freqs): per-(sim, tag, length) count summaries and,
    when freq_extract, per-population SFS dictionaries.
    '''
    # avoid shared mutable default arguments
    if samp is None:
        samp = [5, 20, 10]
    if frequency_range is None:
        frequency_range = [0, 1]

    ti = time.time()
    sims = process_dir(sims_dir=sim_dir)
    print('available {}'.format(len(sims)))

    tags = []
    sim_extend = []
    chroms = []

    data_kmer = {}
    data_freqs = {}

    if sample_sim == 0:
        sample_sim = len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub = np.random.choice(sims, sample_sim, replace=False)

    for sim in sim_sub:

        ## chromosome tag parsed from the simulation name
        chrom = sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        # NOTE(review): the result of read_exclude() is never used below — kept for parity.
        if exclude:
            files = read_exclude()
        else:
            files = {}

        ### read vcf
        # BUGFIX: `haps_extract` was referenced here but was not a parameter of
        # this function (NameError); added as a keyword argument (default False).
        t0 = time.time()
        Window, mut_matrix, scale = VCF_read_filter(
            sim, sim_dir=sim_dir, chrom=chrom, haps_extract=haps_extract, scale_genSize=scale_genSize,
            collapsed=collapsed, min_size=min_size, samp=samp, stepup=stepup, outemp=outemp,
            indfile=indfile, diffs=diffs, bases=bases, ksize=ksize, ploidy=ploidy
            )

        tag_list, tag_dict, pop_dict = ind_assignment_scatter_v1(sim, dir_sim=sim_dir, haps_extract=haps_extract,
                          min_size=min_size, samp=samp, stepup=stepup, outemp=outemp, indfile=indfile)

        total_inds = sum([len(x) for x in pop_dict.values()])
        if not len(Window) or Window.shape[0] < total_inds:
            # incomplete genotype matrix: skip this simulation
            continue

        ## counts for no tag sim:
        s0 = time.time()
        t1 = time.time()
        count_time = t1 - t0

        if len(tag_list):
            sim_extend.extend([sim] * len(tag_list))
            chroms.extend([chrom] * len(tag_list))

            # NOTE(review): `refseq` and `subset_summary` are not defined in this
            # function — presumably module-level globals; confirm before relying
            # on this code path.
            Window_lengths = np.linspace(1, len(refseq), Lsteps, dtype=int)

            for idx in range(len(tag_list)):

                seq_idx = 0
                present_state = 0

                for snp_n in Window_lengths:
                    if snp_n < 10:
                        # tiny targets: use the full window
                        lrange = list(range(Window.shape[1]))
                        tag_l = 'full'
                    else:
                        # advance through recorded positions until snp_n is covered
                        while present_state < snp_n:
                            if seq_idx >= (subset_summary.shape[0] - 1):
                                present_state = len(refseq)
                                seq_idx = subset_summary.shape[0] - 1
                            else:
                                present_state = subset_summary['POS'][seq_idx]
                                seq_idx += 1

                        lrange = list(range(seq_idx))
                        tag_l = str(snp_n * scale)

                    tag_here = tag_list[idx] + '.' + tag_l
                    tags.append(tag_here)
                    tag = tag_list[idx]
                    new_sim = sim + tag_here

                    pop_dict = tag_dict[tag]

                    pop_summary, PA_dict = count_popKmers(Window[:, lrange], mut_matrix[:, lrange], pop_dict,
                                              single=single, prop_gen_used=prop_gen_used,
                                              frequency_range=frequency_range, row=row, col=col,
                                              segregating=segregating, scale=scale,
                                              return_private=return_private)

                    data_kmer[new_sim] = pop_summary

                    if return_private:
                        # second pass restricted to population-private alleles
                        pop_summary, dummy = count_popKmers(Window[:, lrange], mut_matrix[:, lrange], pop_dict,
                                                  single=single, prop_gen_used=prop_gen_used,
                                                  frequency_range=frequency_range, row=row, col=col,
                                                  segregating=segregating, scale=scale,
                                                  PA=PA_dict)
                        data_kmer[new_sim] = pop_summary

                    if freq_extract:
                        data_freqs[new_sim] = pop_dict_SFS(Window, pop_dict)

                    if distances:
                        data_kmer[new_sim]['pairDist'] = pop_distances_PCA(Window, pop_dict)

        if print_summ:
            # BUGFIX: previously printed `time_mut` and `read_time`, neither of
            # which was ever defined here (NameError); report measured times only.
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time * len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))

    tf = time.time()
    time_elapsed = tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
Example #3
0
def MC_sample_matrix_v1(min_size= 80, samp= None, stepup= "increment", diffs= False, frequency_range= None, indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt', chrom_idx= 0, prop_gen_used= 1,
                    count_dir= './count/', dir_launch= '..', main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False, scale_genSize= False,
                    outlog= 'indy.log', row= 24, col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0, collapsed= True, bases= 'ACGT', ksize= 3, ploidy= 2, freq_extract= False, sim_del= 'C',
                    genome_size= 1, haps_extract= False, return_private= True):
    '''
    launch mutation counter pipeline on population assignments.
    Use matrix multiplication to extract counts.
    - v1 relies on count_popKmers() function to count mutations per pop. allows
      freq. filter and single mutation count.

    Returns (data_kmer, data_freqs): per-(sim + tag) count summaries and, when
    freq_extract, per-population SFS dictionaries.
    '''
    # avoid shared mutable default arguments
    if samp is None:
        samp = [5, 20, 10]
    if frequency_range is None:
        frequency_range = [0, 1]

    ti = time.time()
    sims = process_dir(sims_dir=sim_dir)
    print('available {}'.format(len(sims)))

    tags = []
    sim_extend = []
    chroms = []

    data_kmer = {}
    data_freqs = {}

    if sample_sim == 0:
        sample_sim = len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub = np.random.choice(sims, sample_sim, replace=False)

    for sim in sim_sub:

        ## chromosome tag parsed from the simulation name
        chrom = sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')

        # NOTE(review): the result of read_exclude() is never used below — kept for parity.
        if exclude:
            files = read_exclude()
        else:
            files = {}

        ### read vcf
        t0 = time.time()
        Window, mut_matrix, scale = VCF_read_filter(sim, sim_dir=sim_dir, chrom=chrom, haps_extract=haps_extract, scale_genSize=scale_genSize,
            collapsed=collapsed, min_size=min_size, samp=samp, stepup=stepup, outemp=outemp,
            indfile=indfile, diffs=diffs, bases=bases, ksize=ksize, ploidy=ploidy)

        tag_list, tag_dict, pop_dict = ind_assignment_scatter_v1(sim, dir_sim=sim_dir, haps_extract=haps_extract,
                      min_size=min_size, samp=samp, stepup=stepup, outemp=outemp, indfile=indfile)

        total_inds = sum([len(x) for x in pop_dict.values()])
        t1 = time.time()
        read_time = t1 - t0
        if not len(Window) or Window.shape[0] < total_inds:
            # incomplete genotype matrix: skip this simulation
            continue

        ## counts for no tag sim:
        s0 = time.time()
        pop_summary, PA_dict = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                  frequency_range=frequency_range, row=row, col=col, segregating=segregating, scale=scale,
                                  return_private=return_private)

        data_kmer[sim] = pop_summary

        if return_private:
            # second pass restricted to population-private alleles
            pop_summary, dummy = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                      frequency_range=frequency_range, row=row, col=col, segregating=segregating, scale=scale,
                                      PA=PA_dict)
            data_kmer[sim] = pop_summary

        if freq_extract:
            data_freqs[sim] = pop_dict_SFS(Window, pop_dict)

        t1 = time.time()
        count_time = t1 - t0

        if len(tag_list):
            # bookkeeping entry for the untagged reference run
            sim_extend.append(sim)
            tags.append('')
            chroms.append(chrom)

            # BUGFIX: these extends previously sat inside the per-idx loop below,
            # duplicating every entry len(tag_list) times; hoisted out.
            sim_extend.extend([sim] * len(tag_list))
            tags.extend(tag_list)
            chroms.extend([chrom] * len(tag_list))

            for idx in range(len(tag_list)):

                tag = tag_list[idx]
                # BUGFIX: previously outemp.format(tags[idx]) — off by one because
                # tags[0] is the '' sentinel; format with the tag itself.
                ind_file = outemp.format(tag)
                new_sim = sim + tag

                pop_dict = tag_dict[tag]

                # NOTE(review): pop_summary is reassigned each iteration, so later
                # tags count on the previous tag's 'array' — confirm intended.
                pop_summary, dummy = count_popKmers(pop_summary['array'], mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                  frequency_range=frequency_range, row=row, col=col, segregating=segregating, scale=scale,
                                  PA=PA_dict, counted=True)

                data_kmer[new_sim] = pop_summary

                if freq_extract:
                    data_freqs[new_sim] = pop_dict_SFS(Window, pop_dict)

        if print_summ:
            # BUGFIX: previously printed `time_mut`, which was never defined
            # (NameError); that line was dropped.
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time * len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    tf = time.time()
    time_elapsed = tf - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
Example #4
0
def MC_sample_matrix_simple(min_size= 80, samp= None, stepup= "increment", diffs= False, frequency_range= None, indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..', main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/', segregating= False,
                    outlog= 'indy.log', row= 24, col= 4, single= False, exclude= False, print_summ= False, sample_sim= 0, collapsed= True, bases= 'ACGT', ksize= 3, ploidy= 2, freq_extract= False,
                    distances= 'PCA', prop_gen_used= 1, scale_genSize= False, return_private= True, haps_extract= True):
    '''
    launch mutation counter pipeline on population assignments (no tag replay loop).
    Use matrix multiplication to extract counts.
    - relies on count_popKmers() to count mutations per pop; allows freq. filter
      and single mutation count.

    Returns (data_kmer, data_freqs): per-simulation count summaries and, when
    freq_extract, per-population SFS dictionaries.
    '''
    # avoid shared mutable default arguments
    if samp is None:
        samp = [5, 20, 10]
    if frequency_range is None:
        frequency_range = [0, 1]

    sims = process_dir(sims_dir=sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer = {}
    data_freqs = {}

    if sample_sim == 0:
        sample_sim = len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub = np.random.choice(sims, sample_sim, replace=False)

    for sim in sim_sub:

        ## chromosome tag parsed from the simulation name
        chrom = sim.split('.')[0].split('C')[-1].strip('chr')

        # NOTE(review): the result of read_exclude() is never used below — kept for parity.
        if exclude:
            files = read_exclude()
        else:
            files = {}

        ### read vcf
        Window, mut_matrix, scale = VCF_read_filter(sim, sim_dir=sim_dir, chrom=chrom, haps_extract=haps_extract, scale_genSize=scale_genSize,
            collapsed=collapsed, min_size=min_size, samp=samp, stepup=stepup, outemp=outemp,
            indfile=indfile, diffs=diffs, bases=bases, ksize=ksize, ploidy=ploidy)

        # NOTE(review): tag_list / tag_dict are unused in this variant, but the
        # call may write assignment files via outemp — kept for its side effects.
        tag_list, tag_dict, pop_dict = ind_assignment_scatter_v1(sim, dir_sim=sim_dir, haps_extract=haps_extract,
                          min_size=min_size, samp=samp, stepup=stepup, outemp=outemp, indfile=indfile)

        total_inds = sum([len(x) for x in pop_dict.values()])
        if not len(Window) or Window.shape[0] < total_inds:
            # incomplete genotype matrix: skip this simulation
            continue

        ## counts for the unmodified assignment
        pop_summary, PA_dict = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                  frequency_range=frequency_range, row=row, col=col, segregating=segregating, scale=scale,
                                  return_private=return_private)

        data_kmer[sim] = pop_summary

        if return_private:
            # second pass restricted to population-private alleles
            pop_summary, dummy = count_popKmers(Window, mut_matrix, pop_dict, single=single, prop_gen_used=prop_gen_used,
                                      frequency_range=frequency_range, row=row, col=col, segregating=segregating, scale=scale,
                                      PA=PA_dict)
            data_kmer[sim] = pop_summary

        if freq_extract:
            data_freqs[sim] = pop_dict_SFS(Window, pop_dict)

        if distances:
            data_kmer[sim]['pairDist'] = pop_distances_PCA(Window, pop_dict)

    return data_kmer, data_freqs
Example #5
0
def MC_sample_matrix_stdlone(sim,
                             out_db='out_db.txt',
                             min_size=80,
                             samp=None,
                             stepup="increment",
                             diffs=False,
                             frequency_range=None,
                             indfile='ind_assignments.txt',
                             outemp='ind_assignments{}.txt',
                             chrom_idx=0,
                             prop_gen_used=1,
                             sim_dir='mutation_counter/data/sims/',
                             segregating=False,
                             scale_genSize=False,
                             outlog='indy.log',
                             row=24,
                             col=4,
                             single=False,
                             exclude=False,
                             print_summ=False,
                             sample_sim=0,
                             collapsed=True,
                             bases='ACGT',
                             ksize=3,
                             ploidy=2,
                             freq_extract=False,
                             sim_del='C',
                             tag_sim='_ss',
                             genome_size=1,
                             haps_extract=False,
                             return_private=True):
    '''
    Stand-alone, single-simulation mutation counter (per-individual output).

    Reads one simulation's VCF, runs count_popKmers() on the full assignment,
    and writes one tab-separated row per individual (SIM, POP, individual id,
    then the per-class counts) to the database file `out_db`.

    Returns '' always; the database file on disk is the real output.
    '''
    # avoid shared mutable default arguments
    if samp is None:
        samp = [5, 20, 10]
    if frequency_range is None:
        frequency_range = [0, 1]

    ti = time.time()

    ## chromosome tag parsed from the simulation name
    chrom = sim.split('.')[chrom_idx].split(sim_del)[-1].strip('chr')
    pop_dict, inds = get_pop_dict(sim,
                                  dir_sim=sim_dir,
                                  indfile=indfile,
                                  haps_extract=haps_extract,
                                  return_inds=True)
    if haps_extract:
        # haplotype mode: each individual contributes two rows
        inds = np.array(list(inds) + list(inds))

    print(len(inds))
    print(inds[:10])

    total_inds = sum([len(x) for x in pop_dict.values()])

    # NOTE(review): the result of read_exclude() is never used below — kept for parity.
    if exclude:
        files = read_exclude()
    else:
        files = {}

    data_kmer = {}

    ### read vcf
    t0 = time.time()
    Window, mut_matrix, scale = VCF_read_filter(sim,
                                                sim_dir=sim_dir,
                                                chrom=chrom,
                                                haps_extract=haps_extract,
                                                scale_genSize=scale_genSize,
                                                collapsed=collapsed,
                                                min_size=min_size,
                                                samp=samp,
                                                stepup=stepup,
                                                outemp=outemp,
                                                indfile=indfile,
                                                diffs=diffs,
                                                bases=bases,
                                                ksize=ksize,
                                                ploidy=ploidy)

    t1 = time.time()
    read_time = t1 - t0
    if not len(Window) or Window.shape[0] < total_inds:
        # incomplete genotype matrix: nothing to count
        return ''

    ## counts for no tag sim:
    s0 = time.time()
    pop_summary, PA_dict = count_popKmers(Window,
                                          mut_matrix,
                                          pop_dict,
                                          single=single,
                                          prop_gen_used=prop_gen_used,
                                          frequency_range=frequency_range,
                                          row=row,
                                          col=col,
                                          segregating=segregating,
                                          scale=scale,
                                          return_private=return_private)

    # BUGFIX: data_kmer[sim] was previously assigned only when return_private
    # was True, causing a KeyError below otherwise.
    data_kmer[sim] = pop_summary

    t2 = time.time()
    print('time elapsed ref: {} m'.format((t2 - t1) / 60))

    if return_private:
        # second pass restricted to population-private alleles
        pop_summary, dummy = count_popKmers(Window,
                                            mut_matrix,
                                            pop_dict,
                                            single=single,
                                            prop_gen_used=prop_gen_used,
                                            frequency_range=frequency_range,
                                            row=row,
                                            col=col,
                                            segregating=segregating,
                                            scale=scale,
                                            PA=PA_dict)
        data_kmer[sim] = pop_summary

    t3 = time.time()
    print('time elapsed ref PA: {} m'.format((t3 - t2) / 60))

    # assemble one string row per individual: SIM | POP | ind id | counts...
    new_wind = []
    for pop in data_kmer[sim]['counts'].keys():

        pop_counts = np.array(pop_summary['array'][pop_dict[pop], :], dtype=int)
        out_block = np.zeros((pop_counts.shape[0], pop_counts.shape[1] + 3),
                             dtype=int)
        out_block[:, 3:] = pop_counts

        out_block = np.array(out_block, dtype=str)
        out_block[:, 2] = inds[pop_dict[pop]]
        out_block[:, 1] = pop
        out_block[:, 0] = sim
        new_wind.append(out_block)

    new_wind = np.concatenate(tuple(new_wind), axis=0)

    ### mutation labels: column header for the output database
    mutations = get_mutations(bases=bases, ksize=ksize)
    kmers, kmer_idx = kmer_comp_index(mutations)
    mut_lib = kmer_mut_index(mutations)

    if collapsed:
        # strand-collapsed: one representative label per complementary pair
        labels = [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
    else:
        labels = ['_'.join(x) for x in mutations]

    # write:
    db_file = out_db.format(sim)
    db_name = os.path.basename(db_file)
    # BUGFIX: a path without '/' previously produced os.listdir('') -> crash;
    # check the file's own directory (cwd when bare name).
    db_neigh = os.listdir(os.path.dirname(db_file) or '.')

    header = '\t'.join(['SIM', 'POP', 'N'] + labels) + '\n'

    # BUGFIX: the existence check below was previously followed by an
    # unconditional open(..., 'w') that clobbered the file (and duplicated the
    # header), defeating the check; now the header is written only for a new
    # file and the rows are appended, matching the other variants.
    if db_name not in db_neigh:
        with open(db_file, 'w') as fp:
            fp.write(header)

    with open(db_file, 'a') as fp:
        fp.write('\n'.join(['\t'.join(x) for x in new_wind]) + '\n')

    t1 = time.time()
    count_time = t1 - t0

    return ''