def model_region_singletons(data_container, vcf_path, fasta_path, kmer_size, region):
    """Tally reference k-mers and singleton-SNV transitions for one region.

    The region's reference sequence, padded on both sides by the k-mer
    midpoint offset, is handed to ``ek.kmer_search``. Every variant in the
    region accepted by ``ek.is_singleton_snv`` then adds 1 to the counter
    keyed by its surrounding reference window, in the column of its ALT base.
    Both tallies are merged into the object held by ``data_container``.

    Args:
        data_container: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts`` and ``add_transition``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: Numeric k-mer length.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.

    Returns:
        None. When fetching the region raises ``KeyError`` or ``FetchError``
        a message is written to stderr and nothing is recorded.
    """
    t0 = time.time()
    reference = Fasta(fasta_path)
    variants = VCF(vcf_path)
    left_pad = int(kmer_size / 2 + 1)
    centre = int(left_pad - 1)

    try:
        # Only regions whose strand satisfies ek.is_dash get the complement.
        use_complement = region.strand is not None and ek.is_dash(region.strand)
        record = reference.get_seq(region.chrom,
                                   region.start - centre,
                                   region.stop + centre)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    reference_counts = ek.kmer_search(sequence, kmer_size)
    query = '%s:%s-%s' % (region.chrom, region.start, region.stop)
    # One four-slot counter (A, C, G, T) per reference window.
    transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    base_column = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in variants(query):
        if not ek.is_singleton_snv(variant):
            continue
        wrapped = Variant(variant=variant, fields=['vep'])
        # Reference window around the variant; the variant base sits at
        # index ``centre`` of the slice.
        lower = wrapped.POS - left_pad
        upper = wrapped.POS + centre
        context = reference[str(wrapped.CHROM)][lower:upper].seq
        if str(context[centre]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (context[centre], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(context):
            transitions[context.upper()][base_column[wrapped.ALT[0]]] += 1

    # Read-modify-write of the shared accumulator.
    accumulator = data_container.get()
    accumulator.add_kmer_counts(reference_counts)
    accumulator.add_transition(transitions)
    data_container.set(accumulator)
    print('Finished region %s in %s' % (str(region), str(time.time() - t0)),
          flush=True)
def process_chrom_bin(region, kmer_size, vcf_path, fasta_path, AF=False):
    """Compute per-k-mer singleton statistics for one chromosome bin.

    The bin's reference sequence is fetched without flanking padding and
    passed to ``ek.kmer_search`` to obtain reference k-mer counts, GC content
    and N count. Variants in the bin accepted by ``ek.is_singleton_snv`` are
    tallied by reference window and ALT base, then combined with the
    reference counts through ``row_multinomial``.

    Args:
        region: Object with ``chrom``, ``start`` and ``stop``.
        kmer_size: Numeric k-mer length.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When true, each variant contributes its ``AF`` INFO value
            (float counters) instead of 1 (unsigned integer counters).

    Returns:
        ``(region, freq)`` where ``freq`` maps each observed window, plus the
        keys ``'GC_content'`` and ``'N_count'``, to a value; ``freq`` is
        ``None`` when no transitions or no reference k-mers were found.
        Returns bare ``None`` when the region cannot be fetched from the
        fasta (``KeyError``/``FetchError``).
    """
    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)
    try:
        sequence = fasta.get_seq(region.chrom, region.start, region.stop).seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(region.stop)

    # Float counters are needed when accumulating allele frequencies.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Reference window around the variant; the variant base sits at
        # index ``kmer_mid_idx`` of the slice.
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(adj_seq):
            increment = variant.INFO.get('AF') if AF else 1
            transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += increment

    freq = None
    if transitions and len(region_ref_counts.keys()) > 0:
        bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
        bin_trans.sort_index(inplace=True)
        bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
        bin_kcounts.sort_index(inplace=True)
        bin_trans['counts'] = bin_kcounts[0]
        # axis=1 applies the function to each k-mer row. The default axis=0
        # would apply it per column, giving a result indexed by column labels
        # that cannot align with the k-mer index of the 'freq' column.
        # NOTE(review): row_multinomial is defined elsewhere; confirm it
        # returns one scalar per row.
        bin_trans['freq'] = bin_trans.apply(row_multinomial, axis=1)
        bin_trans.loc['GC_content', 'freq'] = gc_content
        bin_trans.loc['N_count', 'freq'] = n_count
        freq = bin_trans['freq'].to_dict()

    print('Finished region %s in %s' % (str(region), str(time.time() - start)), flush=True)
    return region, freq
def model_region_nonsingletons(data_container, vcf_path, fasta_path, kmer_size, region,
                               AC_cutoff):
    """Tally reference k-mers and AC/AN-weighted SNV transitions for a region.

    The region's reference sequence, padded on both sides by the k-mer
    midpoint offset, is passed to ``ek.kmer_search``. Every variant in the
    region accepted by ``ek.is_quality_snv`` adds its ``AC`` to one table and
    its ``AN`` to another, keyed by the surrounding reference window and the
    ALT base. The results are merged into the object held by
    ``data_container``.

    Args:
        data_container: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts``, ``add_transition`` and
            ``add_transition2``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: k-mer length; must convert to an integer >= 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Value forwarded to ``ek.is_quality_snv`` after conversion
            to ``int``; ``None`` or an unconvertible value means no cutoff.

    Returns:
        None. When fetching the region raises ``KeyError`` or ``FetchError``
        a message is written to stderr and nothing is recorded.

    Raises:
        SystemExit: With status 1 when ``kmer_size`` is not a positive
            integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except (TypeError, ValueError):
            # TypeError covers non-numeric objects such as lists, which
            # previously escaped as an unhandled exception.
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)

    try:
        kmer_size = int(kmer_size)
    except (TypeError, ValueError):
        kmer_size = 0  # falls through to the rejection below
    if kmer_size < 1:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the site-provided exit(), which is not
        # available when the interpreter runs without the site module.
        sys.exit(1)

    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only regions whose strand satisfies ek.is_dash get the complement.
        use_complement = region.strand is not None and ek.is_dash(region.strand)
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(region.stop)
    # One four-slot counter (A, C, G, T) per reference window.
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not ek.is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        new_var = Variant(variant=variant)
        # Reference window around the variant; the variant base sits at
        # index ``kmer_mid_idx`` of the slice.
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(adj_seq):
            kmer = adj_seq.upper()
            alt_col = nuc_idx[new_var.ALT[0]]
            ac_transitions[kmer][alt_col] += new_var.AC
            an_transitions[kmer][alt_col] += new_var.AN

    # Read-modify-write of the shared accumulator.
    temp = data_container.get()
    temp.add_kmer_counts(region_ref_counts)
    temp.add_transition(ac_transitions)
    temp.add_transition2(an_transitions)
    data_container.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)), flush=True)
def process_bed_region(region, kmer_size, vcf_path, fasta_path, AF=False, delim=','):
    """Print one delimited row of per-k-mer singleton frequencies for a region.

    The row starts with ``region.str_name()`` and is followed by one value for
    every k-mer returned by ``ek.generate_kmers(kmer_size)``, then the
    region's GC content and N count. A k-mer's value is its singleton tally
    divided by its reference count in the region; k-mers with no entry are
    written as ``0``. When the region has no singleton transitions or no
    reference k-mers, every value in the row is ``0``.

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand`` and
            the methods ``vcf_str()`` and ``str_name()``.
        kmer_size: Numeric k-mer length.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When true, each variant contributes its ``AF`` INFO value
            (float counters) instead of 1 (unsigned integer counters).
        delim: Field separator for the printed row.

    Returns:
        None; the row is written to stdout. When fetching the region raises
        ``KeyError`` or ``FetchError`` a message goes to stderr and no row is
        printed.
    """
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only regions whose strand satisfies ek.is_dash get the complement.
        use_complement = region.strand is not None and ek.is_dash(region.strand)
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)

    # Float counters are needed when accumulating allele frequencies.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(region.vcf_str()):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Reference window around the variant; the variant base sits at
        # index ``kmer_mid_idx`` of the slice.
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(adj_seq):
            increment = variant.INFO.get('AF') if AF else 1
            transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += increment

    # Both branches take their column layout from this single list so every
    # printed row has the same width. The all-zero row used to be sized with
    # ``kmer_size ** 4 + 2``, which does not match the k-mer list length.
    # The list is copied so the value returned by generate_kmers is not
    # mutated.
    columns = list(ek.generate_kmers(kmer_size))
    columns.extend(['GC_content', 'N_count'])

    if transitions and len(region_ref_counts.keys()) > 0:
        bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
        bin_trans.sort_index(inplace=True)
        bin_trans['tot'] = bin_trans.sum(axis=1)
        bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
        bin_kcounts.sort_index(inplace=True)
        bin_kcounts.columns = ['counts']
        kmer_freq = pd.concat([bin_trans.loc[:, 'tot'], bin_kcounts],
                              join='outer', axis=1, sort=True)
        kmer_freq.fillna(0, inplace=True)
        kmer_freq['freq'] = kmer_freq.tot / kmer_freq.counts
        kmer_freq.loc['GC_content', 'freq'] = gc_content
        kmer_freq.loc['N_count', 'freq'] = n_count
        freq_by_key = kmer_freq['freq'].to_dict()
        values = [str(freq_by_key[key]) if key in freq_by_key else '0'
                  for key in columns]
    else:
        values = ['0'] * len(columns)

    print(region.str_name() + delim + delim.join(values), flush=True)
def model_region(datacontainer, vcf_path, fasta_path, kmer_size, region, AC_cutoff):
    """Tally reference k-mers and four kinds of SNV transition for a region.

    The region's reference sequence, padded on both sides by the k-mer
    midpoint offset, is passed to ``kmer_search``. Every variant in the region
    accepted by ``is_quality_snv`` contributes its ``AC``, ``AN`` and ``AF``
    INFO values to separate tables keyed by the surrounding reference window
    and the ALT base; variants whose ``AC`` equals 1 also add 1 to a singleton
    table. All tables are merged into the object held by ``datacontainer``.

    Args:
        datacontainer: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts`` and
            ``add_transition(table, name)``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: k-mer length; must convert to an integer >= 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Value forwarded to ``is_quality_snv`` after conversion to
            ``int``; ``None`` or an unconvertible value means no cutoff.

    Returns:
        None. When fetching the region raises ``KeyError`` or ``FetchError``
        a message is written to stderr and nothing is recorded.

    Raises:
        SystemExit: With status 1 when ``kmer_size`` is not a positive
            integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except (TypeError, ValueError):
            # TypeError covers non-numeric objects such as lists, which
            # previously escaped as an unhandled exception.
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)

    try:
        kmer_size = int(kmer_size)
    except (TypeError, ValueError):
        kmer_size = 0  # falls through to the rejection below
    if kmer_size < 1:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the site-provided exit(), which is not
        # available when the interpreter runs without the site module.
        sys.exit(1)

    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only regions whose strand satisfies is_dash get the complement.
        use_complement = region.strand is not None and is_dash(region.strand)
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    region_ref_counts = kmer_search(sequence, kmer_size)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(region.stop)
    # One four-slot counter (A, C, G, T) per reference window; AF sums are
    # floats, the other tables hold unsigned integers.
    singleton_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    af_transitions = defaultdict(lambda: array.array('d', [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        # Reference window around the variant; the variant base sits at
        # index ``kmer_mid_idx`` of the slice.
        adj_seq = fasta[str(variant.CHROM)][
            (variant.POS - start_idx_offset):(variant.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if complete_sequence(adj_seq):
            kmer = adj_seq.upper()
            alt_col = nuc_idx[variant.ALT[0]]
            allele_count = variant.INFO.get('AC')
            ac_transitions[kmer][alt_col] += allele_count
            an_transitions[kmer][alt_col] += variant.INFO.get('AN')
            af_transitions[kmer][alt_col] += variant.INFO.get('AF')
            if allele_count == 1:
                singleton_transitions[kmer][alt_col] += 1

    data = {
        'singleton': singleton_transitions,
        'AC': ac_transitions,
        'AN': an_transitions,
        'AF': af_transitions
    }
    # Read-modify-write of the shared accumulator.
    temp = datacontainer.get()
    temp.add_kmer_counts(region_ref_counts)
    for name, table in data.items():
        temp.add_transition(table, name)
    datacontainer.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)), flush=True)