def __init__(self, annotation, reference):
    """Load the exon annotation table, both MaxEnt matrices and the reference.

    Args:
        annotation: Base name of the annotation; the file
            ``<ANNOTATION>/<annotation>.exon.gff.tsv`` is read.
        reference: Path handed unchanged to ``pyfaidx.Fasta``.
    """
    exon_table_path = os.path.join(ANNOTATION, annotation + '.exon.gff.tsv')
    # Tab-separated, no header row; column 0 is forced to ``str``
    # (presumably a chromosome/contig name -- confirm against the file).
    self.annotation = pd.read_csv(
        exon_table_path,
        sep='\t',
        header=None,
        dtype={0: str},
    )
    self.matrix5 = load_matrix5()
    self.matrix3 = load_matrix3()
    self.reference = pyfaidx.Fasta(reference)
def get_sequences(length):
    """Write exon/intron junction sequences and collect decoy 5' sites.

    Steps, all driven by names from the enclosing scope (``exons_file``,
    ``introns_file``, ``output_file``, ``decoy_file``):

    1. Read exons and keep, per ``<id>.<exon number>``, the first sequence
       that is strictly longer than ``length``.
    2. Read introns whose id (text before the first ``.``) has a kept exon
       and keep the first sequence per ``<id>.<intron number>``.
    3. For every exon that has an intron with the same number, write the
       last ``length`` exon bases (lower-cased) followed by the first
       ``length`` intron bases to ``output_file``.
    4. Re-read ``output_file``; for each entry score the 9-mer around the
       midpoint with ``maxent.score5`` and scan the 9-mers further
       downstream. The entry is a decoy as soon as one downstream 9-mer
       scores at least as high as the midpoint one.
    5. Write the decoy entries to ``decoy_file``.

    Args:
        length: Number of bases taken from each side of the junction.

    Returns:
        None. Results are written to ``output_file`` and ``decoy_file`` and
        one line per decoy is printed.
    """
    # --- 1. exons: first sufficiently long sequence per (id, exon number)
    exon_names, exon_seqs = files.read_fasta(exons_file)
    exons = {}
    for name, seq in zip(exon_names, exon_seqs):
        if len(seq) > length:
            parts = name.split('(')[0].split('.')
            exons.setdefault(parts[0], {}).setdefault(int(parts[1]), seq)

    # --- 2. introns: only for ids that still have an exon
    intron_names, intron_seqs = files.read_fasta(introns_file)
    introns = {}
    for name, seq in zip(intron_names, intron_seqs):
        if name.split('.')[0] in exons:
            parts = name.split('(')[0].split('.')
            intron_number = int(parts[1].split('-')[0])
            introns.setdefault(parts[0], {}).setdefault(intron_number, seq)

    # --- 3. junction sequences: exon tail (lower case) + intron head
    with open(output_file, 'w') as outfile:
        for transcript_id, transcript_exons in exons.items():
            transcript_introns = introns.get(transcript_id)
            if transcript_introns is None:
                continue
            for exon_id, exon_seq in transcript_exons.items():
                if exon_id in transcript_introns:
                    outfile.write(">{0}.{1}\n{2}{3}\n".format(
                        transcript_id,
                        exon_id,
                        exon_seq[-length:].lower(),
                        transcript_introns[exon_id][:length]))

    # --- 4. decoy search
    matrix5 = load_matrix5()
    entries = files.read_fasta(output_file)
    entries = dict(zip(entries.ids, entries.sequences))
    decoys = []
    for entry_id, seq in entries.items():
        splice_site = len(seq) // 2
        splice_site_seq = seq[splice_site - 3:splice_site + 6]
        real_splice_site_max_ent = maxent.score5(splice_site_seq,
                                                 matrix=matrix5)
        for i in range(1, len(seq) - splice_site - 5):
            query = seq[splice_site + i - 3:splice_site + i + 6]
            # Same casing as the real site: first three bases lower case.
            query = "{0}{1}".format(query[:3].lower(), query[3:])
            max_ent_score = maxent.score5(query, matrix=matrix5)
            if max_ent_score >= real_splice_site_max_ent:
                print(entry_id, real_splice_site_max_ent, i, query,
                      max_ent_score)
                decoys.append(entry_id)
                # Only the first qualifying offset is reported.
                break

    # --- 5. decoy output
    with open(decoy_file, "w") as outfile:
        for entry_id in decoys:
            outfile.write(">{0}\n{1}\n".format(entry_id, entries[entry_id]))
def __init__(self, side='5prime'):
    """Bind the matrix and scoring function for one splice-site side.

    Args:
        side: Either ``'5prime'`` or ``'3prime'``.

    Raises:
        Exception: If ``side`` is neither of the two accepted values.
    """
    if side not in ('5prime', '3prime'):
        raise Exception("side should be 5prime or 3prime")

    self.side = side
    is_five_prime = self.side == '5prime'
    # Only the loader/scorer for the requested side is touched.
    self.matrix = load_matrix5() if is_five_prime else load_matrix3()
    self.model = score5 if is_five_prime else score3
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect ``AGGT`` sites inside one intron that pass all filters.

    Every ``AGGT`` match in the intron sequence is turned into a genomic
    position and kept only if it passes, in order: the distance check
    (``cal_distance``), the absence of ``N`` in the 9-base window around
    it, the 5' splice-site score check (``cal_score``) and the phastCons
    threshold over ``pos - 2 .. pos + 2``.

    Args:
        options: Mapping with the keys ``--genome``, ``--bigwig``,
            ``--min-distance``, ``--min-score`` and ``--min-phastcons``.
        chrom: Chromosome name passed to the FASTA and bigWig readers.
        start: Intron start coordinate.
        end: Intron end coordinate.
        strand: ``'+'`` selects the forward-strand arithmetic; any other
            value is handled as the minus strand.
        intron_info: Opaque value returned unchanged alongside the hits.

    Returns:
        ``(intron_info, rs_list)`` when at least one site is kept, where
        each item of ``rs_list`` is
        ``'pos|left_dist|right_dist|score|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix5 = load_matrix5()
    # Parse the numeric options before opening the bigWig so that a bad
    # value cannot leave the handle open.
    min_distance = int(options['--min-distance'])
    min_score = float(options['--min-score'])
    min_phastcons = float(options['--min-phastcons'])
    forward = strand == '+'

    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        rs_list = []
        for m in re.finditer('AGGT', intron_fa):
            # Match offsets are counted from the intron start on '+', and
            # from the intron end otherwise.
            if forward:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            if forward:
                ss5_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 6))
            else:
                ss5_seq = dna_to_rna(fa.fetch(chrom, pos - 6, pos + 3),
                                     strand='-')
            if 'N' in ss5_seq:  # ensure there is no N
                continue
            ss5, score_flag = cal_score(ss5_seq, matrix5, min_score)
            if not score_flag:  # not high score
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                               ss5, phastcons))
    finally:
        # The original never released the bigWig handle.
        phastcons_f.close()

    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def findSpliceSites(self):
    """Print the input table with 5'/3' splice-site columns appended.

    ``self.inputData`` is iterated line by line. The first line is the
    schema and is echoed with twelve extra column names. Every later line
    is comma-split and read as follows:

    * field 0 -- ``<chrom>:<start>-<end>``; ``start`` is the sequence start
      and the whole field is the key into the result dictionaries,
    * field 1 -- ``<chrom>:<a>-<b>``; ``b`` is used as the mutation position,
    * fields 2 and 3 -- passed on together as the mutation tuple,
    * field 12 -- the sequence.

    Example data line::

        chr15:28235701-28235871,chr15:28235789-28235790,G,A,1921,...,attggg...

    The results of ``self.check5Prime`` / ``self.check3Prime`` are stored in
    ``self.fivePrimeSites`` / ``self.threePrimeSites`` and their first six
    items are appended to the echoed line.

    Returns:
        None. Output goes to stdout.
    """
    extraColumns = (
        "wt5start", "wt5sequence", "wt5score",
        "mu5start", "mu5sequence", "mu5score",
        "wt3start", "wt3sequence", "wt3score",
        "mu3start", "mu3sequence", "mu3score",
    )
    # The matrices are loop-invariant: load each one once, on the first
    # data line, instead of once per line. Header-only input loads nothing.
    matrix5 = None
    matrix3 = None

    for i, line in enumerate(self.inputData):
        record = line.strip()
        if i == 0:
            print(",".join((record,) + extraColumns))
            continue

        splitLine = record.split(',')
        regionKey = splitLine[0]
        sequenceStart = int(regionKey.split(":")[1].split("-")[0])
        mutationStart = int(splitLine[1].split(":")[1].split("-")[1])
        mutationTuple = (splitLine[2], splitLine[3])
        sequence = splitLine[12].strip()

        if matrix5 is None:
            matrix5 = maxent.load_matrix5()
        fivePrime = self.check5Prime(
            sequence, self.fivePrimeSiteLength, sequenceStart,
            mutationStart, mutationTuple, matrix5)
        self.fivePrimeSites[regionKey] = fivePrime

        if matrix3 is None:
            matrix3 = maxent.load_matrix3()
        threePrime = self.check3Prime(
            sequence, self.threePrimeSiteLength, sequenceStart,
            mutationStart, mutationTuple, matrix3)
        self.threePrimeSites[regionKey] = threePrime

        # Explicit indexing (not slicing) so a result with fewer than six
        # items still fails loudly.
        fields = [record]
        fields.extend(str(fivePrime[k]) for k in range(6))
        fields.extend(str(threePrime[k]) for k in range(6))
        print(",".join(fields))
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: Jiguang Peng
# datetime: 2019/6/27 17:54

import itertools
import re

from pyhgvs.models import Transcript
from maxentpy import maxent
from maxentpy.maxent import load_matrix5, load_matrix3

from read_data import transcripts, genome, domain_bed, hotspot_bed, curated_region, pathogenic_dict
from utils import contained_in_bed

# Both matrices are loaded once, when the module is imported.
matrix5 = load_matrix5()
matrix3 = load_matrix3()


class Splicing:
    """Splice-related view of a single VCF record.

    Class attributes:
        donor_threshold: Set to 3; not used in the part of the class shown
            here (presumably a donor-score cut-off -- confirm).
        acceptor_threshold: Set to 3; same caveat as ``donor_threshold``.
        percent_threshold: Set to 0.7; not used in the part shown here.
    """

    donor_threshold = 3
    acceptor_threshold = 3
    percent_threshold = 0.7

    def __init__(self, vcfrecord, transcript):
        """Copy chromosome, position and alleles from ``vcfrecord``.

        Args:
            vcfrecord: Object exposing ``chrom``, ``pos``, ``ref`` and
                ``alt``; ``pos`` is converted with ``int``.
            transcript: Accepted but not used by the code shown here.
        """
        self.chrom = vcfrecord.chrom
        # Stored under the name ``offset`` rather than ``pos``.
        self.offset = int(vcfrecord.pos)
        self.ref = vcfrecord.ref
        self.alt = vcfrecord.alt
def _score_fastatab(fastatab_path, scorefile, site_type, load_matrix, score,
                    dinucleotide_start, standard):
    """Score every record of one ``.fastatab`` file and log it to ``scorefile``.

    Each non-blank line is ``<name>\\t<sequence>``; the text before the first
    ``(`` of the name is the key. Returns ``{key: {...}}`` with the same
    fields that are written to ``scorefile``.
    """
    scores = {}
    with open(fastatab_path, 'r') as fastatab:
        # Loaded after the input has been opened, so a missing input file
        # fails before the matrix is read.
        matrix = load_matrix()
        for line in fastatab:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            entry = stripped.split("\t")
            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == standard
            # Sequences containing N are not scored.
            site_score = score(seq, matrix) if "N" not in seq else "NA"
            scores[key] = {
                "seq": seq,
                "score": site_score,
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide
            }
            scorefile.write("\t".join([
                site_type, key, seq, str(site_score), dinucleotide,
                str(standard_dinucleotide)
            ]) + "\n")
    return scores


def read_and_score_fasta(outdir, species, donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score donor and acceptor sequences and write one TSV per site type.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` and writes
    ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``. Donors are scored with
    ``maxent.score5`` and acceptors with ``maxent.score3``; a sequence
    containing ``N`` gets the score ``"NA"``.

    Args:
        outdir: Directory holding the inputs and receiving the outputs.
        species: File-name prefix.
        donor_dinucleotide_start: Index of the dinucleotide inside a donor
            sequence; it is compared with ``"GT"``.
        acceptor_dinucleotide_start: Index of the dinucleotide inside an
            acceptor sequence; it is compared with ``"AG"``.

    Returns:
        ``(donor_dict, acceptor_dict)``; each maps a record key to a dict
        with the keys ``seq``, ``score``, ``dinucleotide`` and
        ``standard_dinucleotide``.
    """
    prefix = outdir + "/" + species
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"

    # Context managers close both outputs even when scoring raises.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            maxent.load_matrix5, maxent.score5,
            donor_dinucleotide_start, "GT")
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            maxent.load_matrix3, maxent.score3,
            acceptor_dinucleotide_start, "AG")

    return donor_dict, acceptor_dict