def fasta_get_freq(seq, start=0, end=0, step=1, ksize=3, bases='ATCG'):
    '''
    Count k-mer occurrences across a region of a sequence.

    seq   : sequence to scan; sliced into windows of `ksize` characters.
    start : first window offset (0-based).
    end   : exclusive upper bound on window offsets; 0 means "not given"
            and is replaced by len(seq) - ksize.
    step  : stride between consecutive window offsets.
    ksize : window (k-mer) length.
    bases : alphabet forwarded to kmer_dict_init.

    Returns the counter structure built by kmer_dict_init, updated in place.
    Each window is located with get_by_path using all but its last
    character, and the entry keyed by the last character is incremented.
    Windows containing 'N' are not counted.
    '''
    counts = kmer_dict_init(ksize=ksize, bases=bases)

    # NOTE(review): with the default bound the window starting at offset
    # len(seq) - ksize (the last full k-mer) is never visited, because
    # range() excludes its stop value. kmer_freq_balance normalises by the
    # same fasta_len - ksize, so confirm both before changing either.
    stop = len(seq) - ksize if end == 0 else end

    for offset in range(start, stop, step):
        window = seq[offset:offset + ksize]
        if 'N' not in window:
            get_by_path(counts, window[:-1])[window[-1]] += 1

    return counts
def kmer_freq_balance(kmer_dict, mutations, fasta_len=10000, bases='ACGT', ksize=3):
    '''
    Return, for every mutation, the proportion of its k-mer in kmer_dict.

    kmer_dict : structure queried through get_by_path.
    mutations : sequence of mutations; only element [0] of each entry is
                used, as the lookup path (presumably the reference k-mer of
                the mutation - confirm against get_mutations).
    fasta_len : length of the sequence the counts came from.
    bases     : not used by this function.
    ksize     : k-mer length; fasta_len - ksize is the normalising total.

    Returns a numpy array of shape (1, len(mutations)) holding
    count / (fasta_len - ksize) for each mutation, in input order.
    '''
    window_total = fasta_len - ksize

    proportions = [
        get_by_path(kmer_dict, mutation[0]) / window_total
        for mutation in mutations
    ]

    return np.array(proportions).reshape(1, -1)
def vcf_muts_matrix_v1(refseq, summary, start=0, end=0, ksize=3, bases='ATCG', collapse=True):
    '''
    Return matrix of mutation contexts by SNP in genotype array.

    Each SNP is mapped onto the list of possible mutations as a binary
    vector (a single 1 at the index of its mutation type).
    - v1 determines if alternative allele = reference allele in fasta.
      if so, allele is switched, position idx is flagged.

    refseq   : reference sequence string, sliced with 0-based offsets.
    summary  : table exposing POS, REF and ALT, each indexed by row number
               (presumably a pandas DataFrame with a default integer
               index - confirm against caller). POS is treated as 1-based.
    start    : lowest 0-based position to include.
    end      : highest position to include; 0 means max(summary.POS).
    ksize    : width of the sequence context around each SNP.
    bases    : alphabet forwarded to get_mutations.
    collapse : if True, index through the `kmers` mapping from
               kmer_comp_index (vector length len(kmer_idx)); otherwise
               index through kmer_mut_index (vector length len(mutations)).

    Returns (pos_mut, flag_reverse, flag_remove):
    pos_mut      : array with one column per encoded SNP.
    flag_reverse : row indices whose ALT equals the reference base.
    flag_remove  : row indices skipped because the context contains 'N'.

    Rows outside [start, end] and rows in flag_remove contribute no column.
    Rows whose context is incomplete (too close to either end of refseq)
    or whose allele is not a single base get an all-zero column and a
    diagnostic print.
    '''
    mutations = get_mutations(bases=bases, ksize=ksize)
    kmers, kmer_idx = kmer_comp_index(mutations)
    mut_lib = kmer_mut_index(mutations)

    if end == 0:
        end = max(summary.POS)

    # Bases taken before the SNP (k5) and from the SNP onwards (k3); the
    # SNP itself therefore sits at index k5 of the extracted k-mer.
    k5 = int(ksize / 2)
    k3 = ksize - k5

    n_types = len(kmer_idx) if collapse else len(mutations)

    pos_mut = []
    flag_reverse = []
    flag_remove = []

    for x in range(summary.shape[0]):
        pos = int(summary.POS[x]) - 1
        if pos < start or pos > end:
            continue

        # A negative slice start would wrap around to the end of refseq,
        # so treat a SNP closer than k5 to the start as having no context.
        kmer = refseq[pos - k5: pos + k3] if pos >= k5 else ''

        if 'N' in kmer:
            flag_remove.append(x)
            continue

        alt = summary.ALT[x]
        mut = kmer + alt

        # Compare against the centre base (index k5), not a fixed index,
        # so the check stays correct for any ksize.
        if len(kmer) > k5 and kmer[k5] == alt:
            flag_reverse.append(x)
            mut = kmer + summary.REF[x]

        mut_array = np.zeros(n_types)

        # Reject truncated contexts and non-single-base alleles. Checking
        # the k-mer length separately stops a short k-mer plus a long
        # allele from passing by accident.
        if len(kmer) != ksize or len(mut) != ksize + 1:
            print(kmer)
            print(summary.REF[x], summary.ALT[x])
            print(x, pos)
            print(len(refseq), summary.shape[0])
            pos_mut.append(mut_array)
            continue

        if collapse:
            mut_index = kmers[mut]
        else:
            mut_index = get_by_path(mut_lib, list(mut))

        mut_array[mut_index] = 1
        pos_mut.append(mut_array)

    pos_mut = np.array(pos_mut).T

    return pos_mut, flag_reverse, flag_remove