예제 #1
0
def count_popKmers(Window, mut_matrix, pop_dict, single= True, frequency_range= [0,1],row=24,col=4):
    '''
    Summarise mutation counts per population.

    For every entry of `pop_dict` the rows of `Window` listed for that
    population are selected, columns whose within-population frequency falls
    outside `frequency_range` are zeroed, and the result is passed through
    `geno_muts_v2` (defined elsewhere) together with `mut_matrix`.

    Parameters:
    - Window: 2-D array indexed as [rows, columns]; rows are selected with the
      index lists held in `pop_dict`.
    - mut_matrix: forwarded untouched to `geno_muts_v2`.
    - pop_dict: mapping population label -> list of row indices into `Window`.
    - single: when True, collapse the population to a single 0/1 row marking
      the columns with a non-zero column sum.
    - frequency_range: [low, high]; columns with frequency < low or > high are
      zeroed before counting.
    - row, col: shape the per-population count vector is reshaped to.

    Returns a dict with keys 'counts' (label -> row x col array), 'Nvars'
    (label -> total of the collapsed matrix) and 'sizes' (label -> number of
    indices listed for the population).
    '''
    low, high = frequency_range[0], frequency_range[1]

    counts_by_pop= {}
    nvars_by_pop= {}

    for label, members in pop_dict.items():
        # Indexing with a list yields a copy, so the zeroing below stays local.
        geno= Window[members,:]

        col_freqs= np.sum(geno,axis= 0) / geno.shape[0]
        out_of_range= np.logical_or(col_freqs < low, col_freqs > high)
        geno[:,out_of_range]= 0

        if single:
            present= np.sum(geno,axis= 0) > 0
            geno= np.array(present,dtype= int).reshape(1,len(present))

        collapsed= geno_muts_v2(geno, mut_matrix)

        counts_by_pop[label]= np.sum(collapsed,axis= 0).reshape(row,col)
        nvars_by_pop[label]= np.sum(collapsed)

    sizes= {label: len(members) for label, members in pop_dict.items()}

    return {
        'counts': counts_by_pop,
        'Nvars': nvars_by_pop,
        'sizes': sizes
    }
예제 #2
0
def countkmers_cofactor(pop_gen,
                        mut_matrix,
                        pop_ori,
                        single=True,
                        frequency_range=[0, 1],
                        scale=1,
                        prop_gen_used=1,
                        return_private=False,
                        PA={}):
    '''
    Per-population counting step used by count_popKmers.

    Parameters:
    - pop_gen: 2-D array for one population, indexed [rows, columns].
      NOTE: it is written to in place (PA masking, and the frequency mask
      when `single` is False).
    - mut_matrix: forwarded to `geno_muts_v2` (defined elsewhere).
    - pop_ori: key used to look up this population in `PA`.
    - single: when True, collapse `pop_gen` to one 0/1 row of columns with a
      non-zero column sum.
    - frequency_range: [low, high]; columns with frequency <= low or >= high
      are zeroed before counting. Frequencies are taken before PA masking.
    - scale, prop_gen_used: multiplied into the returned segregating row.
    - return_private: accepted but not used here.
    - PA: optional mapping; when non-empty, columns where PA[pop_ori][x] == 0
      are zeroed in `pop_gen`.

    Returns (pop_collapsed_mat, pop_seg_ori): the output of `geno_muts_v2` and
    a 1 x n_columns row flagging non-zero columns (after PA masking, before
    the frequency mask), scaled by `scale * prop_gen_used`.
    '''
    # Frequencies come from the unmasked input.
    allele_freqs = np.sum(pop_gen, axis=0) / pop_gen.shape[0]
    outside = np.logical_or(allele_freqs <= frequency_range[0],
                            allele_freqs >= frequency_range[1])

    if PA:
        masked_cols = []
        for site in range(pop_gen.shape[1]):
            if PA[pop_ori][site] == 0:
                masked_cols.append(site)
        pop_gen[:, masked_cols] = 0

    if single:
        observed = np.sum(pop_gen, axis=0) > 0
        pop_gen = np.array(observed, dtype=int).reshape(1, len(observed))

    # Segregating columns are recorded before the frequency mask is applied.
    seg_flags = np.sum(pop_gen, axis=0) > 0
    pop_seg_ori = np.array(seg_flags, dtype=int).reshape(1, len(seg_flags))
    pop_seg_ori = pop_seg_ori * scale * prop_gen_used

    pop_gen[:, outside] = 0

    return geno_muts_v2(pop_gen, mut_matrix), pop_seg_ori
예제 #3
0
def countkmers_cofactor(pop_gen,
                        mut_matrix,
                        mut_idx,
                        pop_ori,
                        single=True,
                        frequency_range=[0, 1],
                        scale=1,
                        prop_gen_used=1,
                        return_private=False,
                        PA=False):
    '''
    Per-population counting step used by count_popKmers (timed variant).

    Parameters:
    - pop_gen: 2-D array for one population, indexed [rows, columns]. Written
      to in place when `PA` is truthy.
    - mut_matrix: forwarded to `geno_muts_v2`; its first dimension is passed
      to `lineAssign` as `nmuts`.
    - mut_idx: forwarded to `lineAssign`.
    - single: when True, count on the 1-row segregating vector instead of the
      full matrix.
    - frequency_range: [low, high]; only consulted when `PA` is truthy, in
      which case columns with frequency <= low or >= high are zeroed.
    - pop_ori, scale, prop_gen_used, return_private: accepted but not used.

    Prints a '#' marker, the shape that was counted, and the two stage
    durations. Returns (pop_collapsed_mat, pop_seg_ori).
    '''
    started = time.time()

    if PA:
        allele_freqs = np.sum(pop_gen, axis=0) / pop_gen.shape[0]
        outside = np.logical_or(allele_freqs <= frequency_range[0],
                                allele_freqs >= frequency_range[1])
        pop_gen[:, outside] = 0

    seg_flags = np.sum(pop_gen, axis=0) > 0
    pop_seg_ori = np.array(seg_flags, dtype=int).reshape(1, len(seg_flags))

    if single:
        pop_gen = pop_seg_ori

    filtered = time.time()

    # A single row goes through the index-based path; anything else uses the
    # matrix product helper.
    if pop_gen.shape[0] != 1:
        pop_collapsed_mat = geno_muts_v2(pop_gen, mut_matrix)
    else:
        pop_collapsed_mat = lineAssign(pop_gen[0],
                                       mut_idx,
                                       nmuts=mut_matrix.shape[0])

    counted = time.time()

    for report in ('#', pop_gen.shape, filtered - started, counted - filtered):
        print(report)

    return pop_collapsed_mat, pop_seg_ori
예제 #4
0
def MC_sample_matrix_v1(min_size= 80, samp= [5,20,10], stepup= "increment", diffs= False, frequency_range= [0,1],indfile= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False, print_summ= False, sample_sim= 0,collapsed= True,bases= 'ACGT',ksize= 3,ploidy= 2, freq_extract= False):
    '''
    Launch the mutation counter pipeline on manipulated population assignments.
    Uses matrix multiplication to extract counts.
    - v1 relies on count_popKmers() to count mutations per population, which
      allows a frequency filter and single-mutation counting.

    For each sampled simulation directory under `sim_dir` the function reads
    the VCF and reference FASTA, builds the mutation matrix, and stores one
    count_popKmers() summary for the original population assignment plus one
    per tagged assignment returned by ind_assignment_scatter_v1().

    Parameters actually read by this function:
    - min_size, samp, stepup, outemp, indfile: forwarded to
      ind_assignment_scatter_v1().
    - diffs: when True, positions returned by read_diffs() are added to the
      columns whose genotypes are flipped.
    - frequency_range, single, row, col: forwarded to count_popKmers().
    - sim_dir: directory listed by process_dir() and used to build file paths.
    - exclude: when True, read_exclude() is called (its result is not used).
    - print_summ: print per-simulation timing summary.
    - sample_sim: number of simulations to sample without replacement;
      0 means all available.
    - collapsed, bases, ksize: forwarded to vcf_muts_matrix_v1().
    - ploidy: flipped genotypes are computed as `ploidy - genotype`.
    - freq_extract: when True, also store pop_dict_SFS() output.
    The remaining parameters (count_dir, dir_launch, main_dir, muted_dir,
    outlog) are accepted but not used.

    Returns (data_kmer, data_freqs): dicts keyed by simulation name (and
    simulation name + tag for tagged assignments).
    '''

    ti= time.time()
    sims= process_dir(sims_dir= sim_dir)
    print('available {}'.format(len(sims)))

    data_kmer= {}
    data_freqs= {}

    if sample_sim == 0:
        sample_sim= len(sims)

    print('sample {}'.format(sample_sim))
    sim_sub= np.random.choice(sims,sample_sim,replace= False)

    for sim in sim_sub:

        ## chromosome
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        # NOTE(review): `files` is never read; the call is kept in case
        # read_exclude() is relied on for its side effects.
        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        vcf_dir= sim_dir + sim + '/'
        vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        t0= time.time()
        print(sim)

        genotype, summary, Names= read_vcf_allel(vcf_file)
        read_time= time.time() - t0

        if len(genotype) == 0:
            continue

        print(genotype.shape)

        ## read fasta; the sequence is taken from the second line of the file.
        fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

        with gzip.open(fasta_file,'r') as f:
            lines= f.readlines()
            lines= [x.decode() for x in lines]

        refseq= lines[1].strip()

        ###
        positions= [int(x) for x in summary.POS]
        wstart= int(min(positions))
        wend= int(max(positions))

        genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
        Window= genotype[:,genotype_parse]
        subset_summary= summary.loc[genotype_parse,:].reset_index()

        ##
        t0= time.time()
        mut_matrix, flag_reverse, flag_remove= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,
                                                            bases=bases, collapse= collapsed)

        # Set membership keeps the column filter linear in the number of columns.
        removed= set(flag_remove)
        retain= [x for x in range(Window.shape[1]) if x not in removed]
        Window= Window[:,retain]
        subset_summary= subset_summary.loc[retain,:].reset_index()

        time_mut= time.time() - t0

        if diffs:
            sim_start= sim.split('.')[-1]
            diff_snps= read_diffs(sim,diff_dir= vcf_dir, start= int(sim_start))

            summary_diff= [x for x in range(subset_summary.shape[0]) if subset_summary.POS[x] in diff_snps.keys()]

            flag_reverse.extend(summary_diff)
            flag_reverse= list(set(flag_reverse))

        if flag_reverse:
            Window[:,flag_reverse]= ploidy - Window[:,flag_reverse]

        # NOTE(review): the result is not used in this version (counting goes
        # through count_popKmers); kept because geno_muts_v2 is defined
        # elsewhere and could not be inspected - candidate for removal.
        ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir,
                          min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)

        total_inds= sum([len(x) for x in pop_dict.values()])
        if Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        s0= time.time()
        data_kmer[sim]= count_popKmers(Window, mut_matrix, pop_dict, single= single,
                                  frequency_range= frequency_range,row=row,col=col)

        if freq_extract:
            data_freqs[sim]= pop_dict_SFS(Window,pop_dict)

        # Time only the counting step (measured from s0, not from the start
        # of the mutation-matrix construction).
        count_time= time.time() - s0

        ## counts for every tagged (resampled) assignment of this sim.
        for tag in tag_list:
            new_sim= sim + tag
            pop_dict= tag_dict[tag]

            data_kmer[new_sim]= count_popKmers(Window, mut_matrix, pop_dict, single= single,
                              frequency_range= frequency_range,row=row,col=col)

            if freq_extract:
                data_freqs[new_sim]= pop_dict_SFS(Window,pop_dict)

        if print_summ:
            # NOTE(review): values are divided by 60 although the labels say
            # 's'; labels left untouched to keep the output format stable.
            print('mut_matrix time: {} s'.format(time_mut / 60))
            print('count time: {} s'.format(count_time / 60))
            print('est total count time: {} s'.format(count_time*len(tag_list) / 60))
            print('replicates: {}'.format(len(tag_list)))
            print('read time: {} s'.format(read_time / 60))

    time_elapsed= time.time() - ti

    print('time elapsed: {}s'.format(time_elapsed))

    return data_kmer, data_freqs
예제 #5
0
def _summarise_pop_counts(ind_collapsed_mat, pop_dict, row, col):
    '''
    Build the per-population summary for one population assignment.

    For each population the rows of `ind_collapsed_mat` listed in `pop_dict`
    are summed column-wise and reshaped to (row, col); the grand total of the
    same rows is stored as the variant count.
    '''
    pop_counts= {}
    num_variants= {}

    for pop, members in pop_dict.items():
        # Select the population rows once and reuse them for both totals.
        pop_rows= ind_collapsed_mat[members,:]
        pop_counts[pop]= np.sum(pop_rows,axis= 0).reshape(row,col)
        num_variants[pop]= np.sum(pop_rows)

    return {
        'counts': pop_counts,
        'Nvars': num_variants,
        'sizes': {z:len(g) for z,g in pop_dict.items()}
    }


def MC_sample_matrix(logfile, min_size= 80, samp= [5,20,10], pops= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log', row= 24,col= 4,exclude= False):
    '''
    Launch the mutation counter pipeline on manipulated population assignments.
    Uses matrix multiplication to extract counts.

    For each simulation listed by process_dir() the function reads the VCF and
    reference FASTA, builds the individual x mutation-type matrix with
    geno_muts_v2(), and stores one summary for the original population
    assignment plus one per tagged assignment returned by
    ind_assignment_scatter_v1().

    Parameters actually read by this function:
    - min_size, samp, outemp, main_dir: forwarded to
      ind_assignment_scatter_v1().
    - main_dir, sim_dir: `main_dir + sim_dir` is listed for simulations.
    - row, col: shape each per-population count vector is reshaped to.
    - exclude: when True, read_exclude() is called (its result is not used).
    The remaining parameters (logfile, pops, count_dir, dir_launch, muted_dir,
    outlog) are accepted but not used.

    Returns a dict keyed by simulation name (and simulation name + tag), each
    value holding 'counts', 'Nvars' and 'sizes' per population.
    '''

    sims= process_dir(sims_dir= main_dir+sim_dir)
    print(len(sims))

    data= {}

    # Fixed settings for the mutation matrix.
    header_info= 9
    phased= False
    ksize= 3 # odd.
    bases = 'ACGT'
    collapsed= True

    for sim in sims:

        ## chromosome
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        # NOTE(review): `files` is never read; the call is kept in case
        # read_exclude() is relied on for its side effects.
        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        # NOTE(review): simulations are listed from main_dir + sim_dir but the
        # files are opened relative to sim_dir only; the two agree for the
        # default main_dir './' - confirm for other values.
        vcf_dir= sim_dir + sim + '/'
        vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        genotype, summary, Names= read_geno_nanumv3(vcf_file, header_info= header_info,phased= phased)

        ## read fasta; the sequence is taken from the second line of the file.
        fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

        with gzip.open(fasta_file,'r') as f:
            lines= f.readlines()
            lines= [x.decode() for x in lines]

        refseq= lines[1].strip()

        ###
        positions= [int(x) for x in summary.POS]
        wstart= int(min(positions))
        wend= int(max(positions))

        genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
        Window= genotype[:,genotype_parse]
        subset_summary= summary.loc[genotype_parse,:].reset_index()

        ##
        mut_matrix, flag_reverse= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases, collapse= collapsed)
        if flag_reverse:
            Window[:,flag_reverse]= 2 - Window[:,flag_reverse]

        ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,main_dir= main_dir,
                          min_size= min_size, samp= samp, outemp= outemp)

        ## counts for no tag sim:
        data[sim]= _summarise_pop_counts(ind_collapsed_mat, pop_dict, row, col)

        ## counts for every tagged (resampled) assignment of this sim.
        for tag in tag_list:
            data[sim + tag]= _summarise_pop_counts(ind_collapsed_mat, tag_dict[tag], row, col)

    return data
예제 #6
0
def count_popKmers(Window,
                   mut_matrix,
                   mut_idx,
                   pop_dict,
                   single=True,
                   frequency_range=[0, 1],
                   segregating=True,
                   scale=1,
                   prop_gen_used=1,
                   return_private=False,
                   return_seg=False,
                   pop_tag='_ss',
                   row=32,
                   col=3):
    '''
    Extract population mutation counts from _ind x kmer_ mutation matrix.

    Parameters:
    - Window: 2-D array indexed as [rows, columns]; rows are selected with the
      (sorted) index lists held in `pop_dict`.
    - mut_matrix: forwarded to `geno_muts_v2`; its first dimension is passed
      to `lineAssign` as `nmuts`.
    - mut_idx: forwarded to `lineAssign`.
    - pop_dict: mapping population label -> iterable of row indices.
    - single: when True, collapse each population to one row of column sums.
    - segregating: with `single`, turn the column sums into 0/1 flags
      (sum > 0) instead of keeping the raw sums.
    - scale, prop_gen_used: multiplied into the counts and variant totals.
    - return_seg: when True, add the per-population genotype row/matrix that
      was counted under the key 'seg'.
    - row, col: shape the per-population count vector is reshaped to.
    - frequency_range, return_private, pop_tag: accepted for interface
      compatibility but not used.

    Returns (pop_summary, PA_dict). `pop_summary` has keys 'counts', 'Nvars',
    'sizes' (and optionally 'seg'); `PA_dict` is always an empty dict here.

    Populations with no members and non-string population labels are
    handled; earlier, unused timing/label bookkeeping raised
    ZeroDivisionError / TypeError on those inputs.
    '''
    pop_counts = {}
    num_variants = {}
    pop_seg = {}
    PA_dict = {}

    for pop, members in pop_dict.items():
        klist = sorted(members)
        # Indexing with a list yields a copy of the selected rows.
        pop_gen = Window[klist, :]

        if single:
            pop_gen = np.sum(pop_gen, axis=0)
            if segregating:
                pop_gen = pop_gen > 0

            pop_gen = np.array(pop_gen, dtype=int).reshape(1, len(pop_gen))

        # A single row goes through the index-based path; anything else uses
        # the matrix product helper.
        # NOTE(review): lineAssign receives the 2-D (1 x n) array here -
        # confirm that is the shape it expects.
        if pop_gen.shape[0] == 1:
            pop_collapsed_mat = lineAssign(pop_gen,
                                           mut_idx,
                                           nmuts=mut_matrix.shape[0])
        else:
            pop_collapsed_mat = geno_muts_v2(pop_gen, mut_matrix)

        pop_seg[pop] = pop_gen

        pop_summed = np.sum(pop_collapsed_mat, axis=0)
        pop_counts[pop] = pop_summed.reshape(row, col) * scale * prop_gen_used
        num_variants[pop] = np.sum(pop_collapsed_mat) * scale * prop_gen_used

    pop_summary = {
        'counts': pop_counts,
        'Nvars': num_variants,
        'sizes': {z: len(g)
                  for z, g in pop_dict.items()}
    }

    if return_seg:
        pop_summary['seg'] = pop_seg

    return pop_summary, PA_dict