예제 #1
0
 def __init__(self, annotation, reference):
     """Load the exon annotation table, both scoring matrices and the reference FASTA.

     Args:
         annotation: Base name of the annotation; the file read is
             ``<annotation>.exon.gff.tsv`` inside the ``ANNOTATION`` directory.
         reference: Value handed straight to ``pyfaidx.Fasta``.
     """
     annotation_path = os.path.join(ANNOTATION, annotation + '.exon.gff.tsv')
     # Tab-separated, no header row; column 0 is forced to str.
     self.annotation = pd.read_csv(
         annotation_path,
         sep='\t',
         header=None,
         dtype={0: str},
     )
     self.matrix5 = load_matrix5()
     self.matrix3 = load_matrix3()
     self.reference = pyfaidx.Fasta(reference)
예제 #2
0
파일: test.py 프로젝트: la466/ess_stops
def get_sequences(length):
    """Write exon/intron junction sequences and the subset that has a decoy site.

    Reads exons from ``exons_file`` and introns from ``introns_file`` (both
    module-level names). For every exon that has an intron with the same
    number under the same record id, the last ``length`` exon bases
    (lower-cased) followed by the first ``length`` intron bases are written to
    ``output_file``. Each written entry is then re-read and scanned: the 9-base
    window around the midpoint is scored with ``maxent.score5``, and the entry
    is written to ``decoy_file`` if any window further downstream scores at
    least as high.

    FASTA names are parsed as ``<id>.<number>(...)`` for exons and
    ``<id>.<number>-...(...)`` for introns.

    Args:
        length: Number of bases taken from each side of the junction. Only
            exons strictly longer than this are used.
    """
    # Keep the first sequence seen for each (record id, exon number).
    # NOTE(review): assumes read_fasta returns names and sequences of equal
    # length, as parallel lists -- confirm.
    exon_names, exon_seqs = files.read_fasta(exons_file)
    exons = {}
    for name, seq in zip(exon_names, exon_seqs):
        if len(seq) <= length:
            continue
        fields = name.split('(')[0].split('.')
        exons.setdefault(fields[0], {}).setdefault(int(fields[1]), seq)

    # Same for introns, restricted to record ids that kept at least one exon.
    intron_names, intron_seqs = files.read_fasta(introns_file)
    introns = {}
    for name, seq in zip(intron_names, intron_seqs):
        if name.split('.')[0] not in exons:
            continue
        fields = name.split('(')[0].split('.')
        intron_number = int(fields[1].split('-')[0])
        introns.setdefault(fields[0], {}).setdefault(intron_number, seq)

    # Exon tail (lower case) + intron head, one FASTA record per junction.
    with open(output_file, 'w') as outfile:
        for record_id, record_exons in exons.items():
            record_introns = introns.get(record_id)
            if record_introns is None:
                continue
            for exon_number, exon_seq in record_exons.items():
                if exon_number not in record_introns:
                    continue
                outfile.write(">{0}.{1}\n{2}{3}\n".format(
                    record_id, exon_number,
                    exon_seq[-length:].lower(),
                    record_introns[exon_number][:length]))

    matrix5 = load_matrix5()
    # Unpacked the same way as the two reads above, for consistency.
    entry_ids, entry_seqs = files.read_fasta(output_file)
    entries = dict(zip(entry_ids, entry_seqs))

    decoys = []
    for entry_id, seq in entries.items():
        splice_site = len(seq) // 2

        # Score of the window around the midpoint: 3 bases before, 6 after.
        real_splice_site_max_ent = maxent.score5(
            seq[splice_site - 3:splice_site + 6], matrix=matrix5)

        # Slide the same 9-base window downstream; the first window that
        # scores at least as high marks the entry as a decoy.
        for offset in range(1, len(seq) - splice_site - 5):
            window = seq[splice_site + offset - 3:splice_site + offset + 6]
            query = window[:3].lower() + window[3:]
            max_ent_score = maxent.score5(query, matrix=matrix5)
            if max_ent_score >= real_splice_site_max_ent:
                print(entry_id, real_splice_site_max_ent, offset, query,
                      max_ent_score)
                decoys.append(entry_id)
                break

    with open(decoy_file, "w") as outfile:
        for entry_id in decoys:
            outfile.write(">{0}\n{1}\n".format(entry_id, entries[entry_id]))
예제 #3
0
파일: model.py 프로젝트: teslaa22/models
    def __init__(self, side='5prime'):
        """Select the scoring matrix and scoring function for one splice-site side.

        Args:
            side: Either ``'5prime'`` or ``'3prime'``.

        Raises:
            Exception: If ``side`` is neither of the two accepted values.
        """
        if side not in ('5prime', '3prime'):
            raise Exception("side should be 5prime or 3prime")

        self.side = side
        # Pick the matrix loader and scorer that belong together.
        if side == '3prime':
            loader, scorer = load_matrix3, score3
        else:
            loader, scorer = load_matrix5, score5
        self.matrix = loader()
        self.model = scorer
예제 #4
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect ``AGGT`` sites inside an intron that pass distance, score and conservation filters.

    Args:
        options: Mapping with the keys ``--genome``, ``--bigwig``,
            ``--min-distance``, ``--min-score`` and ``--min-phastcons``.
        chrom: Chromosome name passed to the FASTA and bigWig readers.
        start: Intron start coordinate.
        end: Intron end coordinate.
        strand: ``'+'`` selects the forward-strand arithmetic; any other value
            is handled as the minus strand.
        intron_info: Opaque value returned unchanged alongside the hits.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes every filter,
        where each item is ``'pos|left_dist|right_dist|score|phastcons'``;
        otherwise ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix5 = load_matrix5()
    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        # parse options
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        # start to parse rs sites
        rs_list = []
        forward = strand == '+'
        for m in re.finditer('AGGT', intron_fa):
            # Position two bases into the match, counted from the intron
            # start on '+' and from the intron end otherwise.
            if forward:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 9-base window around pos; the window is mirrored on the minus
            # strand.
            if forward:
                ss5_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 6))
            else:
                ss5_seq = dna_to_rna(fa.fetch(chrom, pos - 6, pos + 3),
                                     strand='-')
            if 'N' in ss5_seq:  # ensure there is no N
                continue
            ss5, score_flag = cal_score(ss5_seq, matrix5, min_score)
            if not score_flag:  # not high score
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:  # not conserved
                continue
            rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                               ss5, phastcons))
    finally:
        # Release the bigWig handle even when a filter or lookup raises.
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
예제 #5
0
    def findSpliceSites(self):
        """Print every input row extended with 5' and 3' splice-site results.

        ``self.inputData`` is iterated line by line. The first line is treated
        as the schema and is printed with twelve extra column names appended.
        Every later line is split on commas and passed to ``self.check5Prime``
        and ``self.check3Prime``; the results are stored in
        ``self.fivePrimeSites`` / ``self.threePrimeSites`` under the row's
        first field, and the row is printed with the first six items of each
        result appended.

        Example data row:
        chr15:28235701-28235871,chr15:28235789-28235790,G,A,1921,682,1467,315,3223,1142,1276,367,attgggactgtgac...
        """
        extra_columns = [
            "wt5start", "wt5sequence", "wt5score",
            "mu5start", "mu5sequence", "mu5score",
            "wt3start", "wt3sequence", "wt3score",
            "mu3start", "mu3sequence", "mu3score",
        ]
        # The matrices do not depend on the row, so load them once up front
        # instead of once per data line.
        matrix5 = maxent.load_matrix5()
        matrix3 = maxent.load_matrix3()

        for index, line in enumerate(self.inputData):
            row = line.strip()
            if index == 0:
                # first line is schema, every line after that is data
                print(",".join([row] + extra_columns))
                continue

            splitLine = row.split(',')
            region = splitLine[0]
            # Start of the "chr:start-end" sequence interval (field 0) and
            # end of the "chr:start-end" mutation interval (field 1).
            sequenceStart = int(region.split(":")[1].split("-")[0])
            mutationStart = int(splitLine[1].split(":")[1].split("-")[1])
            mutationTuple = (splitLine[2], splitLine[3])
            sequence = splitLine[12].strip()

            self.fivePrimeSites[region] = self.check5Prime(
                sequence, self.fivePrimeSiteLength,
                sequenceStart, mutationStart, mutationTuple, matrix5)
            self.threePrimeSites[region] = self.check3Prime(
                sequence, self.threePrimeSiteLength,
                sequenceStart, mutationStart, mutationTuple, matrix3)

            five = self.fivePrimeSites[region]
            three = self.threePrimeSites[region]
            fields = [row]
            fields.extend(str(five[k]) for k in range(6))
            fields.extend(str(three[k]) for k in range(6))
            print(",".join(fields))
예제 #6
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: Jiguang Peng
# datetime: 2019/6/27 17:54

import itertools
import re

from pyhgvs.models import Transcript
from maxentpy import maxent
from maxentpy.maxent import load_matrix5, load_matrix3
from read_data import transcripts, genome, domain_bed, hotspot_bed, curated_region, pathogenic_dict
from utils import contained_in_bed

# Load the 5' and 3' matrices from maxentpy once, at import time
# (presumably so later scoring calls can share them -- confirm against callers).
matrix5 = load_matrix5()
matrix3 = load_matrix3()


class Splicing:
    """Hold the coordinates and alleles of one VCF record for splicing analysis.

    NOTE(review): ``transcript`` is accepted but not used in the visible part
    of ``__init__`` -- the definition may continue beyond this excerpt.
    """

    # Class-level cut-offs; presumably score thresholds for donor/acceptor
    # sites and a relative-change threshold -- confirm against their users.
    donor_threshold = 3
    acceptor_threshold = 3
    percent_threshold = 0.7

    def __init__(self, vcfrecord, transcript):
        """Copy ``chrom``, ``pos`` (as int), ``ref`` and ``alt`` from *vcfrecord*."""
        self.chrom, self.offset = vcfrecord.chrom, int(vcfrecord.pos)
        self.ref, self.alt = vcfrecord.ref, vcfrecord.alt
예제 #7
0
def _score_splice_sites(fastatab_path, scorefile, site_type, load_matrix,
                        score, dinucleotide_start, standard):
    """Score every row of one ``.fastatab`` file and append it to *scorefile*.

    Each input row is ``<name>\\t<sequence>``. The part of the name before the
    first ``(`` is used as the key and the sequence is upper-cased. Sequences
    containing ``N`` get the score ``"NA"`` instead of being scored.

    Returns:
        Dict mapping key -> ``{"seq", "score", "dinucleotide",
        "standard_dinucleotide"}``.
    """
    scored = {}
    with open(fastatab_path, 'r') as fastatab:
        matrix = load_matrix()
        for line in fastatab:
            entry = line.strip().split("\t")
            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == standard

            scored[key] = {
                "seq": seq,
                "score": score(seq, matrix) if "N" not in seq else "NA",
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide,
            }
            scorefile.write("\t".join([
                site_type, key, seq,
                str(scored[key]["score"]), dinucleotide,
                str(standard_dinucleotide),
            ]) + "\n")
    return scored


def read_and_score_fasta(outdir,
                         species,
                         donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score donor and acceptor splice-site sequences and write them as TSV.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab``, scores each sequence with
    ``maxent.score5`` / ``maxent.score3``, and writes
    ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``.

    Args:
        outdir: Directory holding the input files; outputs are written there.
        species: File-name prefix shared by all four files.
        donor_dinucleotide_start: Index in the donor sequence where the
            dinucleotide compared against ``GT`` starts.
        acceptor_dinucleotide_start: Index in the acceptor sequence where the
            dinucleotide compared against ``AG`` starts.

    Returns:
        ``(donor_dict, acceptor_dict)``, each mapping a location key to a dict
        with ``seq``, ``score``, ``dinucleotide`` and
        ``standard_dinucleotide``.
    """
    prefix = outdir + "/" + species
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"

    # Both outputs are created up front, as before, but are now closed even
    # if reading or scoring raises.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_splice_sites(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            maxent.load_matrix5, maxent.score5,
            donor_dinucleotide_start, "GT")
        acceptor_dict = _score_splice_sites(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            maxent.load_matrix3, maxent.score3,
            acceptor_dinucleotide_start, "AG")

    return donor_dict, acceptor_dict