Пример #1
0
 def get_acs_for_protein_seq(self, seq):
     """
     Return the protein accessions recorded for the sequence ``seq``.

     The sequence is reduced to its MD5 digest, which is used as the
     lookup key for the 'acs_for_protein_md5' query.  The synthetic
     MD5-based accession (MD5_01234abc...def56789) is always appended
     as the final element, so the result is never empty.
     """
     digest = seq_md5(seq)
     rows = self._fetchall(self._queries['acs_for_protein_md5'], [digest])
     accessions = [row['ac'] for row in rows]
     # The MD5-derived accession always comes last.
     accessions.append('MD5_' + digest)
     return accessions
Пример #2
0
 def get_acs_for_protein_seq(self, seq):
     """
     Return the protein accessions recorded for the sequence ``seq``.

     The sequence's MD5 digest is bound as the single parameter of the
     'acs_for_protein_md5' statement, which is run on a cursor obtained
     from ``self._get_cursor()``.  The synthetic MD5-based accession
     (MD5_01234abc...def56789) is always appended as the final element,
     so the result is never empty.
     """
     digest = seq_md5(seq)
     cursor = self._get_cursor()
     cursor.execute(self.sql['acs_for_protein_md5'], [digest])

     accessions = []
     for row in cursor.fetchall():
         accessions.append(row['ac'])
     # The MD5-derived accession always comes last.
     accessions.append('MD5_' + digest)
     return accessions
Пример #3
0
def load_txinfo(session, opts, cf):
    """Load transcript records from a gzipped txinfo file into the database.

    For every record yielded by ``ufti.TxInfoReader`` this:

    1. fetches the ``usam.Transcript`` with the record's accession, or
       creates one.  If an existing transcript has different CDS
       coordinates, the old row is renamed to
       ``<ac>/<cds_start_i>..<cds_end_i>`` and a new row is created;
    2. updates the transcript's HGNC symbol if it changed;
    3. upserts the transcript's self-alignment exon set via
       ``_upsert_exon_set_record``.

    Work is committed every ``update_period`` records and once more
    after the last record, so nothing is left pending even when the
    trailing records are skipped.

    :param session: database session used for all queries and commits
        (presumably a SQLAlchemy session -- confirm against caller).
    :param opts: mapping whose ``"FILE"`` entry is the path of the
        gzip-compressed input file.
    :param cf: configuration object; ``cf.get("uta", "admin_role")``
        supplies the role set on the session.  Also passed to
        ``_get_seqfetcher``.
    :raises NoResultFound: if a record's origin name has no Origin row.
    :raises Exception: if a CDS sequence cannot be fetched (the
        sequence fetcher raised ``KeyError``).
    :raises AssertionError: if more than one transcript shares an
        accession, or CDS start and CDS MD5 are inconsistent.
    """
    self_aln_method = "transcript"
    update_period = 250
    sf = None                 # established on first use, below

    @lru_cache(maxsize=100)
    def _fetch_origin_by_name(name):
        # Cached so each distinct origin name is queried only once.
        try:
            ori = session.query(usam.Origin).filter(
                usam.Origin.name == name).one()
        except NoResultFound:
            # Log the argument itself rather than the enclosing loop variable.
            logger.error("No origin for " + name)
            raise
        return ori

    # Count the lines by streaming so the file is not held in memory, and
    # close the handle afterwards.  One line is subtracted, presumably for
    # a header row -- confirm against the file format.
    with gzip.open(opts["FILE"]) as count_fh:
        n_rows = sum(1 for _ in count_fh) - 1

    n_new = 0
    n_unchanged = 0
    n_cds_changed = 0
    n_exons_changed = 0

    # The reader consumes the handle lazily, so it stays open for the
    # whole loop and is closed on exit, including on error.
    with gzip.open(opts["FILE"]) as tx_fh:
        tir = ufti.TxInfoReader(tx_fh)
        logger.info("opened " + opts["FILE"])

        session.execute("set role {admin_role};".format(
            admin_role=cf.get("uta", "admin_role")))
        session.execute("set search_path = " + usam.schema_name)

        for i_ti, ti in enumerate(tir):
            if ti.exons_se_i == "":
                logger.warning(ti.ac + ": no exons?!; skipping.")
                continue

            if ti.cds_se_i:
                cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(","))
            else:
                cds_start_i = cds_end_i = None

            # 1. Fetch or make the Transcript record
            # Fetch the matches once instead of issuing count() twice and
            # then indexing the query.
            matches = session.query(usam.Transcript).filter(
                usam.Transcript.ac == ti.ac,
                ).all()
            assert len(matches) <= 1, "Expected max 1 existing transcripts with accession {ti.ac}".format(ti=ti)

            u_tx = None

            if matches:
                u_tx = matches[0]
                if (u_tx.cds_start_i, u_tx.cds_end_i) != (cds_start_i, cds_end_i):
                    # Rename the stale row out of the way; a fresh row
                    # with the new CDS coordinates is created below.
                    u_tx.ac = "{u_tx.ac}/{u_tx.cds_start_i}..{u_tx.cds_end_i}".format(u_tx=u_tx)
                    logger.warning("Transcript {ti.ac}: CDS coordinates changed!; renamed to {u_tx.ac}".format(ti=ti, u_tx=u_tx))
                    session.flush()
                    u_tx = None
                    n_cds_changed += 1

            # state: u_tx is set if a transcript was found and was
            # unchanged, or None if 1) no such was found or 2) was found
            # and had updated CDS coords.
            if u_tx is None:
                ori = _fetch_origin_by_name(ti.origin)

                if ti.cds_se_i:
                    if sf is None:
                        sf = _get_seqfetcher(cf)
                    try:
                        cds_seq = sf.fetch(ti.ac, cds_start_i, cds_end_i)
                    except KeyError:
                        raise Exception("{ac}: not in sequence database".format(ac=ti.ac))
                    cds_md5 = seq_md5(cds_seq)
                else:
                    cds_md5 = None

                assert (cds_start_i is not None) ^ (cds_md5 is None), "failed: cds_start_i is None i.f.f. cds_md5_is None"
                u_tx = usam.Transcript(
                    ac=ti.ac,
                    origin=ori,
                    hgnc=ti.hgnc,
                    cds_start_i=cds_start_i,
                    cds_end_i=cds_end_i,
                    cds_md5=cds_md5,
                )
                session.add(u_tx)

            if u_tx.hgnc != ti.hgnc:
                logger.warning("{ti.ac}: HGNC symbol changed from {u_tx.hgnc} to {ti.hgnc}".format(
                    u_tx=u_tx, ti=ti))
                u_tx.hgnc = ti.hgnc

            # state: transcript now exists, either existing or freshly-created

            # 2. Upsert an ExonSet attached to the Transcript
            # n and o are presumably the new and old exon set records;
            # verify against _upsert_exon_set_record.
            n, o = _upsert_exon_set_record(session, ti.ac, ti.ac, 1, self_aln_method, ti.exons_se_i)

            no = (n is not None, o is not None)
            if no == (True, False):
                n_new += 1
            elif no == (True, True):
                logger.warning("Transcript {ti.ac} exon structure changed".format(ti=ti))
                n_exons_changed += 1
            elif no == (False, True):
                logger.debug("Transcript {ti.ac} exon structure unchanged".format(ti=ti))
                n_unchanged += 1

            if i_ti % update_period == 0 or i_ti + 1 == n_rows:
                session.commit()
                # Float arithmetic avoids integer truncation; the guard
                # avoids dividing by zero when the line count is not positive.
                p = (i_ti + 1) * 100.0 / n_rows if n_rows > 0 else 0.0
                logger.info("{i_ti}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, "
                            "{n_cds_changed} cds changed, {n_exons_changed} exons changed; commited".format(
                    i_ti=i_ti, n_rows=n_rows,
                    n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed,
                    p=p))

    # Final commit: the in-loop commit is missed when the last record is
    # skipped or when the line count does not match the number of records.
    session.commit()
    logger.info("done: {n_new} new, {n_unchanged} unchanged, "
                "{n_cds_changed} cds changed, {n_exons_changed} exons changed; final commit".format(
        n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed))