예제 #1
0
    def get_splice_statistics(self, chrom, fnames, name):
        """
        Add splice junction read counts for one chromosome to the database.

        Every input file is read as tab-separated text. On lines whose first
        column equals `chrom`, columns 2-4 are parsed as integers (start, end,
        read count) and column 6 is taken as the strand. For each such line a
        "splice_junction" Feature and a FeatureReadCount tying it to the
        file's ReadSource are obtained through get_or_create, and the read
        count is added to the value already stored (if any).

        :param chrom: chromosome name; lines for other chromosomes are skipped
        :param fnames: one file name, or an iterable of file names
        :param name: name passed to get_or_create for the ReadSource rows
        """
        # Accept a bare file name as well as a collection of names.
        if isinstance(fnames, str):
            fnames = [fnames]

        for fname in fnames:
            self.logger.debug("Getting splicing data from %s", fname)
            read_source = get_or_create(self.session,
                                        ReadSource, name=name, source=fname)
            self.session.commit()

            # The context manager closes the file even when a line fails to
            # parse halfway through.
            with open(fname) as handle:
                for line in handle:
                    vals = line.strip().split("\t")
                    if vals[0] != chrom:
                        continue

                    start, end, c = [int(x) for x in vals[1:4]]
                    strand = vals[5]

                    splice = get_or_create(self.session, Feature,
                                           chrom=chrom,
                                           start=start,
                                           end=end,
                                           strand=strand,
                                           ftype="splice_junction")
                    # Committed before splice.id is read below.
                    self.session.commit()

                    count = get_or_create(self.session, FeatureReadCount,
                                          feature_id=splice.id,
                                          read_source_id=read_source.id)

                    # A new row has no count yet; an existing one is
                    # incremented.
                    if not count.count:
                        count.count = c
                    else:
                        count.count += c

                    self.session.commit()
예제 #2
0
    def load_yaml(self, fname):
        """
        Load the contents of a YAML dump into the database.

        The YAML document is indexed by the keys 'feature', 'read_source',
        'read_count', 'evidence' and 'feature_evidence'. Rows of
        'read_source', 'feature' and 'evidence' start with the id used inside
        the file; these ids are translated to the ids of the rows in this
        database before read counts and feature/evidence links are inserted.

        Nothing is loaded when the 'feature' entry is empty.

        :param fname: path of the YAML file
        """
        # NOTE(review): yaml.load without an explicit Loader is deprecated or
        # rejected by recent PyYAML releases and may construct arbitrary
        # objects; if the dump only holds plain lists, yaml.safe_load is the
        # safer call. Left unchanged here so the accepted input stays the same.
        with open(fname) as handle:
            data = yaml.load(handle)

        features = data['feature']
        if not features:
            return

        # Map the read source ids used in the file to ids in this database.
        source_map = {}
        for old_id, name, source, nreads in data['read_source']:
            r = get_or_create(self.session, ReadSource,
                              name=name, source=source, nreads=nreads)
            self.session.commit()
            source_map[old_id] = r.id

        columns = ["chrom", "start", "end", "strand", "ftype", "seq"]
        self.engine.execute(
            Feature.__table__.insert(),
            [dict(zip(columns, row[1:])) for row in features]
            )

        self.session.commit()

        first = self.fetch_feature(features[0][1:])
        last = self.fetch_feature(features[-1][1:])

        # Assumes the bulk insert assigned consecutive ids in file order, so
        # the id range between the first and last row lines up with the rows
        # -- TODO confirm for the database backend in use.
        f_map = dict(zip([row[0] for row in features],
                         range(first.id, last.id + 1)))

        read_counts = [
            [source_map[row[0]], f_map[row[1]]] + list(row[2:])
            for row in data['read_count']
            ]
        columns = ["read_source_id", "feature_id", "count", "span",
                   "extend_up", "extend_down"]

        # Skip the bulk insert when there are no rows; an empty parameter
        # list would presumably run the INSERT without any values.
        if read_counts:
            self.engine.execute(
                FeatureReadCount.__table__.insert(),
                [dict(zip(columns, row)) for row in read_counts]
                )

        evidence_rows = data['evidence']
        if evidence_rows:
            columns = ["name", "source"]
            self.engine.execute(
                Evidence.__table__.insert(),
                [dict(zip(columns, row[1:])) for row in evidence_rows]
                )

            self.session.commit()
            first = self.fetch_evidence(evidence_rows[0][1:])
            last = self.fetch_evidence(evidence_rows[-1][1:])

            # Same consecutive-id assumption as for the features above.
            ev_map = dict(zip([row[0] for row in evidence_rows],
                              range(first.id, last.id + 1)))

            links = [
                [f_map[row[0]], ev_map[row[1]]]
                for row in data['feature_evidence']
                ]

            columns = ["feature_id", "evidence_id"]
            if links:
                self.engine.execute(
                    FeatureEvidence.__table__.insert(),
                    [dict(zip(columns, row)) for row in links]
                    )
예제 #3
0
    def load_yaml(self, fname):
        """
        Load the contents of a YAML dump into the database.

        The YAML document is indexed by the keys 'feature', 'read_source',
        'read_count', 'evidence' and 'feature_evidence'. Rows of
        'read_source', 'feature' and 'evidence' start with the id used inside
        the file; these ids are translated to the ids of the rows in this
        database before read counts and feature/evidence links are inserted.

        Nothing is loaded when the 'feature' entry is empty.

        :param fname: path of the YAML file
        """
        # NOTE(review): yaml.load without an explicit Loader is deprecated or
        # rejected by recent PyYAML releases and may construct arbitrary
        # objects; if the dump only holds plain lists, yaml.safe_load is the
        # safer call. Left unchanged here so the accepted input stays the same.
        with open(fname) as handle:
            data = yaml.load(handle)

        features = data['feature']
        if not features:
            return

        # Map the read source ids used in the file to ids in this database.
        source_map = {}
        for old_id, name, source, nreads in data['read_source']:
            r = get_or_create(self.session,
                              ReadSource,
                              name=name,
                              source=source,
                              nreads=nreads)
            self.session.commit()
            source_map[old_id] = r.id

        columns = ["chrom", "start", "end", "strand", "ftype", "seq"]
        self.engine.execute(Feature.__table__.insert(),
                            [dict(zip(columns, row[1:])) for row in features])

        self.session.commit()

        first = self.fetch_feature(features[0][1:])
        last = self.fetch_feature(features[-1][1:])

        # Assumes the bulk insert assigned consecutive ids in file order, so
        # the id range between the first and last row lines up with the rows
        # -- TODO confirm for the database backend in use.
        f_map = dict(
            zip([row[0] for row in features], range(first.id, last.id + 1)))

        read_counts = [[source_map[row[0]], f_map[row[1]]] + list(row[2:])
                       for row in data['read_count']]
        columns = [
            "read_source_id", "feature_id", "count", "span", "extend_up",
            "extend_down"
        ]

        # Skip the bulk insert when there are no rows; an empty parameter
        # list would presumably run the INSERT without any values.
        if read_counts:
            self.engine.execute(FeatureReadCount.__table__.insert(),
                                [dict(zip(columns, row)) for row in read_counts])

        evidence_rows = data['evidence']
        if evidence_rows:
            columns = ["name", "source"]
            self.engine.execute(
                Evidence.__table__.insert(),
                [dict(zip(columns, row[1:])) for row in evidence_rows])

            self.session.commit()
            first = self.fetch_evidence(evidence_rows[0][1:])
            last = self.fetch_evidence(evidence_rows[-1][1:])

            # Same consecutive-id assumption as for the features above.
            ev_map = dict(
                zip([row[0] for row in evidence_rows],
                    range(first.id, last.id + 1)))

            links = [[f_map[row[0]], ev_map[row[1]]]
                     for row in data['feature_evidence']]

            columns = ["feature_id", "evidence_id"]
            if links:
                self.engine.execute(
                    FeatureEvidence.__table__.insert(),
                    [dict(zip(columns, row)) for row in links])
예제 #4
0
    def get_splice_statistics(self, chrom, fnames, name):
        """
        Add splice junction read counts for one chromosome to the database.

        Every input file is read as tab-separated text. On lines whose first
        column equals `chrom`, columns 2-4 are parsed as integers (start, end,
        read count) and column 6 is taken as the strand. For each such line a
        "splice_junction" Feature and a FeatureReadCount tying it to the
        file's ReadSource are obtained through get_or_create, and the read
        count is added to the value already stored (if any).

        :param chrom: chromosome name; lines for other chromosomes are skipped
        :param fnames: one file name, or an iterable of file names
        :param name: name passed to get_or_create for the ReadSource rows
        """
        # Accept a bare file name as well as a collection of names.
        if isinstance(fnames, str):
            fnames = [fnames]

        for fname in fnames:
            self.logger.debug("Getting splicing data from %s", fname)
            read_source = get_or_create(self.session,
                                        ReadSource,
                                        name=name,
                                        source=fname)
            self.session.commit()

            # The context manager closes the file even when a line fails to
            # parse halfway through.
            with open(fname) as handle:
                for line in handle:
                    vals = line.strip().split("\t")
                    if vals[0] != chrom:
                        continue

                    start, end, c = [int(x) for x in vals[1:4]]
                    strand = vals[5]

                    splice = get_or_create(self.session,
                                           Feature,
                                           chrom=chrom,
                                           start=start,
                                           end=end,
                                           strand=strand,
                                           ftype="splice_junction")
                    # Committed before splice.id is read below.
                    self.session.commit()

                    count = get_or_create(self.session,
                                          FeatureReadCount,
                                          feature_id=splice.id,
                                          read_source_id=read_source.id)

                    # A new row has no count yet; an existing one is
                    # incremented.
                    if not count.count:
                        count.count = c
                    else:
                        count.count += c

                    self.session.commit()
예제 #5
0
    def get_read_statistics(self, chrom, fnames, name, span="all", extend=(0,0), nreads=None):
        """
        Count reads overlapping the exons of one chromosome and store the counts.

        The exons returned by self.get_exons(chrom) are written as six-column,
        tab-separated regions to a temporary file; exons that end up with
        identical regions share one line. For every input file a ReadSource is
        obtained, get_binned_stats is run on the region file, and one
        FeatureReadCount row is inserted per exon and file. The temporary
        file is removed before returning.

        :param chrom: chromosome whose exons are counted
        :param fnames: one file name, or a sequence of file names
        :param name: name passed to get_or_create for the ReadSource rows
        :param span: "all" uses the whole exon, "start" only its strand-aware
            first position and "end" only its strand-aware last position
        :param extend: pair (upstream, downstream) by which the region is
            widened, relative to the exon's strand
        :param nreads: optional sequence parallel to fnames; a BAM file is
            only counted with read_statistics when its entry is missing or
            falsy
        :raises Exception: if span is not "all", "start" or "end"
        """
        import os

        if span not in ["all", "start", "end"]:
            raise Exception("Incorrect span: {}".format(span))

        # Look up the exons first so that no temporary file is created (and
        # left behind) when there is nothing to count.
        exons = self.get_exons(chrom)
        if len(exons) == 0:
            return

        if isinstance(fnames, str):
            fnames = [fnames]

        # Region string -> exons sharing that region.
        estore = {}
        # Text mode: the regions are written as str, not bytes.
        tmp = NamedTemporaryFile(mode="w", delete=False)
        try:
            self.logger.debug("Writing exons to file %s", tmp.name)
            for exon in exons:
                start = exon.start
                end = exon.end
                if span == "start":
                    if exon.strand == "+":
                        end = start
                    elif exon.strand == "-":
                        start = end
                if span == "end":
                    if exon.strand == "+":
                        start = end
                    elif exon.strand == "-":
                        end = start

                # extend is given in transcript orientation, so the two
                # values swap sides on the minus strand.
                if exon.strand == "-":
                    start -= extend[1]
                    end += extend[0]
                else:
                    start -= extend[0]
                    end += extend[1]
                if start < 0:
                    start = 0

                estr = "{}:{}-{}".format(exon.chrom, start, end)

                if estr in estore:
                    estore[estr].append(exon)
                else:
                    # Each distinct region is written only once.
                    estore[estr] = [exon]
                    tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        exon.chrom,
                        start,
                        end,
                        str(exon),
                        0,
                        exon.strand
                    ))
            tmp.flush()

            for i, fname in enumerate(fnames):
                self.logger.debug("Creating read_source for %s %s", name, fname)
                read_source = get_or_create(self.session, ReadSource, name=name, source=fname)
                self.session.commit()
                # NOTE(review): a caller-supplied nreads[i] only suppresses
                # counting; it is not stored on read_source here, and the
                # value assigned below is set after the commit above --
                # confirm it is persisted elsewhere.
                if fname.endswith("bam") and (not nreads or not nreads[i]):
                    self.logger.debug("Counting reads in %s", fname)
                    read_source.nreads = read_statistics(fname)

                self.logger.debug("Getting overlap from %s", fname)
                result = get_binned_stats(tmp.name, fname, 1, rpkm=False, rmdup=False, rmrepeats=False)

                self.logger.debug("Reading results, save to exon stats")

                insert_vals = []
                for row in result:
                    try:
                        vals = row.strip().split("\t")
                        e = "%s:%s-%s" % (vals[0], vals[1], vals[2])
                        c = float(vals[3])
                        for exon in estore[e]:
                            insert_vals.append([read_source.id, exon.id, c, span, extend[0], extend[1]])
                    except Exception:
                        # Best effort: unparsable or unknown lines are logged
                        # and skipped, but KeyboardInterrupt/SystemExit are
                        # no longer swallowed.
                        self.logger.info("binned_stat line skipped: {}".format(row))

                columns = ["read_source_id", "feature_id", "count", "span", "extend_up", "extend_down"]
                # Skip the bulk insert when there are no rows; an empty
                # parameter list would presumably run the INSERT without
                # any values.
                if insert_vals:
                    self.engine.execute(
                        FeatureReadCount.__table__.insert(),
                        [dict(zip(columns, row)) for row in insert_vals]
                        )
        finally:
            # delete=False means the file is not removed automatically.
            tmp.close()
            os.unlink(tmp.name)
예제 #6
0
    def add_transcript(self, name, source, exons):
        """
        Store a transcript: its exons, its splice junctions and its evidence.

        `exons` is a sequence of records indexed as (chrom, start, end,
        strand). Neighbouring exons must share chromosome and strand and must
        be ordered without overlap. Each exon and each junction between two
        neighbouring exons is obtained through get_or_create and linked to
        the Evidence for (name, source). The session is committed unless the
        summed donor and acceptor splice scores are negative, in which case
        it is rolled back.

        :param name: transcript name, also used to label the splice sites
        :param source: source passed to get_or_create for the Evidence
        :param exons: exon records of the transcript
        :raises ValueError: if two neighbouring exons are inconsistent
        """
        # Validate every pair of neighbouring exons before touching the
        # database.
        for left, right in zip(exons[:-1], exons[1:]):
            problem = None
            if left[0] != right[0]:
                problem = "Different chromosomes!"
            elif right[1] <= left[2]:
                problem = "exons overlap, or in wrong order"
            elif left[3] != right[3]:
                problem = "strands don't match"
            if problem is not None:
                sys.stderr.write("{0} - {1}\n".format(left, right))
                raise ValueError(problem)

        chrom, strand = exons[0][0], exons[0][-1]

        evidence = get_or_create(self.session, Evidence,
                                 name=name, source=source)

        # Exon sequences including 20 flanking bases on each side; an entry
        # stays empty when the flanked lookup fails.
        flanked_seqs = []
        for coords in exons:
            flanked = ""
            exon_seq = ""
            if self.index:
                try:
                    flanked = self.index.get_sequence(
                        chrom, coords[1] - 20, coords[2] + 20, strand)
                    exon_seq = flanked[20:-20]
                except Exception:
                    # Fall back to the exon itself, without flanks.
                    exon_seq = self.index.get_sequence(
                        chrom, coords[1], coords[2], strand)
                flanked_seqs.append(flanked)

            feature = get_or_create(self.session, Feature,
                                    chrom=chrom,
                                    start=coords[1],
                                    end=coords[2],
                                    strand=strand,
                                    ftype="exon",
                                    seq=exon_seq)
            feature.evidences.append(evidence)

        donors = []
        acceptors = []
        # A junction runs from the end of one exon to the start of the next.
        junctions = [(left[2], right[1])
                     for left, right in zip(exons[0:-1], exons[1:])]
        for idx, (sj_start, sj_end) in enumerate(junctions):
            self.logger.debug("%s %s %s %s", chrom, sj_start, sj_end, strand)
            junction = get_or_create(self.session, Feature,
                                     chrom=chrom,
                                     start=sj_start,
                                     end=sj_end,
                                     strand=strand,
                                     ftype="splice_junction")
            junction.evidences.append(evidence)

            # On "+" the donor comes from the exon before the junction and
            # the acceptor from the exon after it; otherwise the roles swap.
            if strand == "+":
                donor_at, acceptor_at = idx, idx + 1
            else:
                donor_at, acceptor_at = idx + 1, idx

            label = "{}_{}".format(name, idx + 1)
            site_specs = (
                (donors, donor_at, slice(-23, -14)),
                (acceptors, acceptor_at, slice(None, 23)),
            )
            for sites, at, window in site_specs:
                # A sequence is only used when it is not the last entry of
                # flanked_seqs and is longer than 46 bases.
                # NOTE(review): this excludes the final exon's sequence --
                # confirm that is intended.
                if len(flanked_seqs) > (at + 1) and len(flanked_seqs[at]) > 46:
                    sites.append([label, flanked_seqs[at][window]])

        total_score = get_splice_score(donors, 5) + get_splice_score(acceptors, 3)
        if total_score < 0:
            self.logger.warning("Skipping %s, splicing not OK!", name)
            self.session.rollback()
            return
        self.session.commit()
예제 #7
0
    def get_read_statistics(self,
                            chrom,
                            fnames,
                            name,
                            span="all",
                            extend=(0, 0),
                            nreads=None):
        """
        Count reads overlapping the exons of one chromosome and store the counts.

        The exons returned by self.get_exons(chrom) are written as six-column,
        tab-separated regions to a temporary ".bed" file; exons that end up
        with identical regions share one line. For every input file a
        ReadSource is obtained, BamTrack.binned_stats is run on the region
        file, and one FeatureReadCount row is inserted per exon and file. The
        temporary file is removed before returning.

        :param chrom: chromosome whose exons are counted
        :param fnames: one file name, or a sequence of file names
        :param name: name passed to get_or_create for the ReadSource rows
        :param span: "all" uses the whole exon, "start" only its strand-aware
            first position and "end" only its strand-aware last position
        :param extend: pair (upstream, downstream) by which the region is
            widened, relative to the exon's strand
        :param nreads: optional sequence parallel to fnames; a BAM file is
            only counted with read_statistics when its entry is missing or
            falsy
        :raises Exception: if span is not "all", "start" or "end"
        """
        import os

        if span not in ["all", "start", "end"]:
            raise Exception("Incorrect span: {}".format(span))

        # Look up the exons first so that no temporary file is created (and
        # left behind) when there is nothing to count.
        exons = self.get_exons(chrom)
        if len(exons) == 0:
            return

        if isinstance(fnames, str):
            fnames = [fnames]

        # Region string -> exons sharing that region.
        estore = {}
        # Text mode: the regions are written as str, not bytes.
        tmp = NamedTemporaryFile(mode="w", delete=False, suffix=".bed")
        try:
            self.logger.debug("Writing exons to file %s", tmp.name)
            for exon in exons:
                start = exon.start
                end = exon.end
                if span == "start":
                    if exon.strand == "+":
                        end = start
                    elif exon.strand == "-":
                        start = end
                if span == "end":
                    if exon.strand == "+":
                        start = end
                    elif exon.strand == "-":
                        end = start

                # extend is given in transcript orientation, so the two
                # values swap sides on the minus strand.
                if exon.strand == "-":
                    start -= extend[1]
                    end += extend[0]
                else:
                    start -= extend[0]
                    end += extend[1]
                if start < 0:
                    start = 0

                estr = "{}:{}-{}".format(exon.chrom, start, end)

                if estr in estore:
                    estore[estr].append(exon)
                else:
                    # Each distinct region is written only once.
                    estore[estr] = [exon]
                    tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        exon.chrom, start, end, str(exon), 0, exon.strand))
            tmp.flush()

            for i, fname in enumerate(fnames):
                self.logger.debug("Creating read_source for %s %s", name,
                                  fname)
                read_source = get_or_create(self.session,
                                            ReadSource,
                                            name=name,
                                            source=fname)
                self.session.commit()
                # NOTE(review): a caller-supplied nreads[i] only suppresses
                # counting; it is not stored on read_source here, and the
                # value assigned below is set after the commit above --
                # confirm it is persisted elsewhere.
                if fname.endswith("bam") and (not nreads or not nreads[i]):
                    self.logger.debug("Counting reads in %s", fname)
                    read_source.nreads = read_statistics(fname)

                self.logger.debug("Getting overlap from %s", fname)
                track = BamTrack(fname)
                result = track.binned_stats(tmp.name,
                                            1,
                                            rpkm=False,
                                            rmdup=False,
                                            rmrepeats=False)

                self.logger.debug("Reading results, save to exon stats")

                insert_vals = []
                for row in result:
                    try:
                        vals = row.strip().split("\t")
                        e = "%s:%s-%s" % (vals[0], vals[1], vals[2])
                        c = float(vals[3])
                        for exon in estore[e]:
                            insert_vals.append([
                                read_source.id, exon.id, c, span, extend[0],
                                extend[1]
                            ])
                    except Exception:
                        # Best effort: unparsable or unknown lines are logged
                        # and skipped, but KeyboardInterrupt/SystemExit are
                        # no longer swallowed.
                        self.logger.info(
                            "binned_stat line skipped: {}".format(row))

                columns = [
                    "read_source_id", "feature_id", "count", "span",
                    "extend_up", "extend_down"
                ]
                # Skip the bulk insert when there are no rows; an empty
                # parameter list would presumably run the INSERT without
                # any values.
                if insert_vals:
                    self.engine.execute(
                        FeatureReadCount.__table__.insert(),
                        [dict(zip(columns, row)) for row in insert_vals])
        finally:
            # delete=False means the file is not removed automatically.
            tmp.close()
            os.unlink(tmp.name)
예제 #8
0
    def add_transcript(self, name, source, exons):
        """
        Store a transcript: its exons, its splice junctions and its evidence.

        `exons` is a sequence of records indexed as (chrom, start, end,
        strand). Neighbouring exons must share chromosome and strand and must
        be ordered without overlap. Each exon and each junction between two
        neighbouring exons is obtained through get_or_create and linked to
        the Evidence for (name, source). The session is committed unless the
        summed donor and acceptor splice scores are negative, in which case
        it is rolled back.

        :param name: transcript name, also used to label the splice sites
        :param source: source passed to get_or_create for the Evidence
        :param exons: exon records of the transcript
        :raises ValueError: if two neighbouring exons are inconsistent
        """
        # Validate every pair of neighbouring exons before touching the
        # database.
        for left, right in zip(exons[:-1], exons[1:]):
            problem = None
            if left[0] != right[0]:
                problem = "Different chromosomes!"
            elif right[1] <= left[2]:
                problem = "exons overlap, or in wrong order"
            elif left[3] != right[3]:
                problem = "strands don't match"
            if problem is not None:
                sys.stderr.write("{0} - {1}\n".format(left, right))
                raise ValueError(problem)

        chrom, strand = exons[0][0], exons[0][-1]

        evidence = get_or_create(self.session,
                                 Evidence,
                                 name=name,
                                 source=source)

        # Exon sequences including 20 flanking bases on each side; an entry
        # stays empty when the flanked lookup fails.
        flanked_seqs = []
        for coords in exons:
            flanked = ""
            exon_seq = ""
            if self.index:
                try:
                    flanked = self.index.get_sequence(chrom, coords[1] - 20,
                                                      coords[2] + 20, strand)
                    exon_seq = flanked[20:-20]
                except Exception:
                    # Fall back to the exon itself, without flanks.
                    exon_seq = self.index.get_sequence(chrom, coords[1],
                                                       coords[2], strand)
                flanked_seqs.append(flanked)

            feature = get_or_create(self.session,
                                    Feature,
                                    chrom=chrom,
                                    start=coords[1],
                                    end=coords[2],
                                    strand=strand,
                                    ftype="exon",
                                    seq=exon_seq)
            feature.evidences.append(evidence)

        donors = []
        acceptors = []
        # A junction runs from the end of one exon to the start of the next.
        junctions = [(left[2], right[1])
                     for left, right in zip(exons[0:-1], exons[1:])]
        for idx, (sj_start, sj_end) in enumerate(junctions):
            self.logger.debug("%s %s %s %s", chrom, sj_start, sj_end, strand)
            junction = get_or_create(self.session,
                                     Feature,
                                     chrom=chrom,
                                     start=sj_start,
                                     end=sj_end,
                                     strand=strand,
                                     ftype="splice_junction")
            junction.evidences.append(evidence)

            # On "+" the donor comes from the exon before the junction and
            # the acceptor from the exon after it; otherwise the roles swap.
            if strand == "+":
                donor_at, acceptor_at = idx, idx + 1
            else:
                donor_at, acceptor_at = idx + 1, idx

            label = "{}_{}".format(name, idx + 1)
            site_specs = (
                (donors, donor_at, slice(-23, -14)),
                (acceptors, acceptor_at, slice(None, 23)),
            )
            for sites, at, window in site_specs:
                # A sequence is only used when it is not the last entry of
                # flanked_seqs and is longer than 46 bases.
                # NOTE(review): this excludes the final exon's sequence --
                # confirm that is intended.
                if len(flanked_seqs) > (at + 1) and len(flanked_seqs[at]) > 46:
                    sites.append([label, flanked_seqs[at][window]])

        total_score = get_splice_score(donors, 5) + get_splice_score(
            acceptors, 3)
        if total_score < 0:
            self.logger.warning("Skipping %s, splicing not OK!", name)
            self.session.rollback()
            return
        self.session.commit()