示例#1
0
def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the HMM file's path.

    The alignment in ``opts.REFMSA`` is rewritten in Stockholm format to a
    temporary file, which is then passed to ``HMMER.build``. Records are
    first verified against ``DNAAlphabet``; they are written unchanged when
    ``opts.ENCODER`` is ``DNAEncoder`` and run through ``translate``
    otherwise. If a ``VerifyError`` is raised, the source is switched to
    ``AminoAlphabet`` and written untranslated.

    Returns:
        Path of the temporary HMM file. It is not removed here, so the
        caller is responsible for deleting it.

    Raises:
        RuntimeError: the encoder is ``DNAEncoder`` but verification of the
            MSA against ``DNAAlphabet`` failed.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write((record if is_dna else translate(record)
                                 for record in source), aln_fh, 'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError(
                            "DNA encoding incompatible with protein reference MSA"
                        )
                    source.set_alphabet(AminoAlphabet)
                    # Rewind AND truncate: anything the failed attempt
                    # already wrote must not survive past the new content.
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(source, aln_fh, 'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(tmphmm,
                    tmpaln,
                    alphabet=HMMER.DNA if is_dna else HMMER.AMINO)
    except BaseException:
        # The caller never receives the path on failure, so the HMM
        # temp file would otherwise be leaked.
        if exists(tmphmm):
            remove(tmphmm)
        raise
    finally:
        if exists(tmpaln):
            remove(tmpaln)

    return tmphmm
示例#2
0
 def seqrecords():
     """Yield the records parsed from ``seq_fh``.

     Records are verified against ``DNAAlphabet``. They are yielded as-is
     when the configured encoder is ``DNAEncoder`` and passed through
     ``translate`` otherwise. When a ``VerifyError`` is raised, the
     verifier is switched to ``AminoAlphabet`` and its records are yielded
     untranslated, unless a DNA encoder was requested, in which case a
     ``RuntimeError`` is raised instead.
     """
     wants_dna = ARGS.ENCODER == DNAEncoder
     fmt = seqfile_format(ARGS.SEQUENCES)
     source = Verifier(SeqIO.parse(seq_fh, fmt), DNAAlphabet)
     try:
         for rec in source:
             if wants_dna:
                 yield rec
             else:
                 yield translate(rec)
     except VerifyError:
         if not wants_dna:
             # switch the verifier to the amino-acid alphabet and yield
             # whatever it produces, without translation
             source.set_alphabet(AminoAlphabet)
             for rec in source:
                 yield rec
             return
         raise RuntimeError(
             "your model specifies a DNA encoding "
             "which is incompatible with protein sequences")
示例#3
0
 def seqrecords():
     """Generate sequence records from ``seq_fh``.

     Each record is checked against ``DNAAlphabet`` by the verifier. With a
     ``DNAEncoder`` the record is yielded unchanged; with any other encoder
     it is yielded after ``translate``. A ``VerifyError`` either raises
     ``RuntimeError`` (DNA encoder) or makes the generator fall back to
     ``AminoAlphabet`` and yield the verifier's records untranslated.
     """
     dna_encoder = ARGS.ENCODER == DNAEncoder
     file_format = seqfile_format(ARGS.SEQUENCES)
     verified = Verifier(SeqIO.parse(seq_fh, file_format), DNAAlphabet)
     try:
         for item in verified:
             if dna_encoder:
                 yield item
             else:
                 yield translate(item)
     except VerifyError:
         if dna_encoder:
             raise RuntimeError(
                 "your model specifies a DNA encoding "
                 "which is incompatible with protein sequences")
         # fall back to the amino-acid alphabet; no translation is applied
         verified.set_alphabet(AminoAlphabet)
         for item in verified:
             yield item
示例#4
0
 def __call__(self, string):
     """Return the first sequence record of the file at path *string*.

     The record is verified against ``DNAAlphabet`` and passed through
     ``translate`` unless ``self.is_dna`` is set. If a ``VerifyError`` is
     raised, the verifier is switched to ``AminoAlphabet`` and its first
     record is returned untranslated.

     Raises:
         ArgumentTypeError: ``self.is_dna`` is set but verification
             against ``DNAAlphabet`` failed, or the file could not be
             opened or parsed.
     """
     try:
         with open(string) as h:
             source = Verifier(SeqIO.parse(h, seqfile_format(string)), DNAAlphabet)
             try:
                 seq = next(iter(source))
                 if not self.is_dna:
                     seq = translate(seq)
             except VerifyError:
                 if self.is_dna:
                     raise ArgumentTypeError("DNA encoding incompatible with protein reference")
                 source.set_alphabet(AminoAlphabet)
                 seq = next(iter(source))
         return seq
     except ArgumentTypeError:
         # already carries the intended message; let it through untouched
         raise
     except Exception:
         # Exception (not a bare except) so that KeyboardInterrupt and
         # SystemExit are not reported as a bad input file.
         raise ArgumentTypeError("invalid FASTA file '{0:s}'".format(string))
示例#5
0
 def __call__(self, string):
     """Return the first sequence record of the file at path *string*.

     The record is verified against ``DNAAlphabet`` and passed through
     ``translate`` unless ``self.is_dna`` is set. If a ``VerifyError`` is
     raised, the verifier is switched to ``AminoAlphabet`` and its first
     record is returned untranslated.

     Raises:
         ArgumentTypeError: ``self.is_dna`` is set but verification
             against ``DNAAlphabet`` failed, or the file could not be
             opened or parsed.
     """
     try:
         with open(string) as h:
             source = Verifier(SeqIO.parse(h, seqfile_format(string)),
                               DNAAlphabet)
             try:
                 seq = next(iter(source))
                 if not self.is_dna:
                     seq = translate(seq)
             except VerifyError:
                 if self.is_dna:
                     raise ArgumentTypeError(
                         "DNA encoding incompatible with protein reference")
                 source.set_alphabet(AminoAlphabet)
                 seq = next(iter(source))
         return seq
     except ArgumentTypeError:
         # already carries the intended message; let it through untouched
         raise
     except Exception:
         # Exception (not a bare except) so that KeyboardInterrupt and
         # SystemExit are not reported as a bad input file.
         raise ArgumentTypeError(
             "invalid FASTA file '{0:s}'".format(string))
示例#6
0
def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the HMM file's path.

    The alignment in ``opts.REFMSA`` is rewritten in Stockholm format to a
    temporary file, which is then passed to ``HMMER.build``. Records are
    first verified against ``DNAAlphabet``; they are written unchanged when
    ``opts.ENCODER`` is ``DNAEncoder`` and run through ``translate``
    otherwise. If a ``VerifyError`` is raised, the source is switched to
    ``AminoAlphabet`` and written untranslated.

    Returns:
        Path of the temporary HMM file. It is not removed here, so the
        caller is responsible for deleting it.

    Raises:
        RuntimeError: the encoder is ``DNAEncoder`` but verification of the
            MSA against ``DNAAlphabet`` failed.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write(
                        (record if is_dna else translate(record) for record in source),
                        aln_fh,
                        'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError("DNA encoding incompatible with protein reference MSA")
                    source.set_alphabet(AminoAlphabet)
                    # Rewind AND truncate: anything the failed attempt
                    # already wrote must not survive past the new content.
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(
                        source,
                        aln_fh,
                        'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO
            )
    except BaseException:
        # The caller never receives the path on failure, so the HMM
        # temp file would otherwise be leaked.
        if exists(tmphmm):
            remove(tmphmm)
        raise
    finally:
        if exists(tmpaln):
            remove(tmpaln)

    return tmphmm
示例#7
0
    def seqrecords(self, antibodies, clonal=False):
        """Load the sequences that have NEUT values for *antibodies*.

        Args:
            antibodies: iterable of antibody names to look up.
            clonal: when true, only SEQUENCE rows with ``IS_CLONAL = 1``
                are selected.

        Returns:
            A ``(seqrecords, clonal, antibodies)`` tuple: the list of
            records, the *clonal* flag as given, and the sorted tuple of
            antibody names actually queried (the requested names plus any
            alternate ids found in the ANTIBODY table).
        """
        conn = connect(self.__filename)
        # try/finally so the connection is released even when a query or
        # the construction of a record fails
        try:
            cur = conn.cursor()

            antibodies_ = set(antibodies)

            ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

            # NOTE(review): only the first matching row is consulted, as
            # before; alternate ids on further rows are not picked up.
            cur.execute(
                'select distinct ALT_IDS from ANTIBODY where %s' % ab_clause,
                tuple(antibodies_))
            first = cur.fetchone()
            # no matching row, or a NULL column, means "no alternate ids"
            alt_ids = (first[0] if first is not None else None) or ''
            equivalencies = set(alt_ids.split(',')) - set([''])

            if equivalencies:
                antibodies_ |= equivalencies
                ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

            antibodies__ = tuple(sorted(antibodies_))

            stmt = dedent('''\
                select distinct SG.NO as NO, SG.ID as ID, SG.SEQ as SEQ, SG.SUBTYPE as SUBTYPE, ? as AB, N.VALUE as VALUE from
                (select NO, S.ID as ID, SUBTYPE, SEQ from
                    (select SEQUENCE_NO as NO, SEQUENCE_ID as ID, RAW_SEQ as SEQ from SEQUENCE {0:s} group by ID) as S left join
                    (select SEQUENCE_ID as ID, SUBTYPE from GENO_REPORT group by ID) as G
                    on S.ID = G.ID
                ) as SG join
                (select SEQUENCE_ID as ID, ANTIBODY as AB, group_concat(TYPE || ':' || VALUE, ',') as VALUE from NEUT where ({1:s}) group by ID) as N
                on SG.ID = N.ID order by SG.ID;
                '''.format('where IS_CLONAL = 1' if clonal else '', ab_clause))
            # first placeholder is the AB column label, the rest fill ab_clause
            params = ('+'.join(antibodies__), ) + antibodies__

            cur.execute(stmt, params)

            def records():
                """Yield one SeqRecord per result row that has usable values."""
                ids = {}
                for row in cur:
                    sid, seq, subtype, ab, values = row[1:6]
                    values_ = {}
                    for kv in (values or '').split(','):
                        # a pair without exactly one ':' ends up with a
                        # non-numeric value and is skipped below
                        k, _sep, v = kv.partition(':')
                        try:
                            v_ = float(v.strip().lstrip('<>'))
                        except ValueError:
                            continue
                        values_.setdefault(k, []).append(v_)
                    if not values_:
                        warn("skipping sequence '%s', invalid values '%s'" %
                             (sid, values))
                        continue
                    record = SeqRecord(
                        Seq(OrfList(seq, include_stops=False)[0], DNAAlphabet),
                        id=sid,
                        description=json_dumps({
                            'subtype': '' if subtype is None else subtype,
                            'ab': ab,
                            'values': values_
                        }),
                        annotations={
                            'antibody': values_,
                            'subtype': subtype
                        })
                    # repeated ids get a "-<n>" suffix to stay distinct
                    if sid in ids:
                        record.id += str(-ids[sid])
                        ids[sid] += 1
                    else:
                        ids[sid] = 1
                    yield record

            source = Verifier(records(), DNAAlphabet)
            try:
                seqrecords = list(source)
            except VerifyError:
                source.set_alphabet(AminoAlphabet)
                seqrecords = list(source)
        finally:
            conn.close()

        return seqrecords, clonal, antibodies__
示例#8
0
    def seqrecords(self, antibodies, clonal=False):
        """Load the FASTA records that have a value for the given antibody.

        Sequences are read from the FASTA file and matched, by id, against
        the first column of the CSV file; the antibody's column supplies
        the values. Ids on both sides are normalized so that a trailing
        ``-<digits>`` or ``_<digits>`` becomes ``_<digits>``.

        Args:
            antibodies: sequence holding at most one antibody name; the
                name must be a column header of the CSV file.
            clonal: must be false.

        Returns:
            A ``(seqrecords, clonal, antibodies)`` tuple; records without
            any numeric value are dropped (with a warning).

        Raises:
            ValueError: *clonal* is set, more than one antibody is given,
                the CSV file has no header, or the antibody is not among
                its columns.
        """
        if clonal:
            raise ValueError(
                'clonal property is not available with Monogram datasets')
        if len(antibodies) > 1:
            raise ValueError(
                'only one antibody can be interrogated with Monogram datasets')

        with open(self.__fastafile) as h:
            source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
            try:
                seqrecords = list(source)
            except VerifyError:
                source.set_alphabet(AminoAlphabet)
                seqrecords = list(source)

        underdash = re_compile(r'[_-](\d+)$')
        for r in seqrecords:
            r.id = underdash.sub(r'_\1', r.id)

        ic50s = dict((r.id, []) for r in seqrecords)

        with open(self.__csvfile) as fh:
            sample = fh.read(MonogramData.__sample_len)
            sniffer = csv_sniffer()
            dialect = sniffer.sniff(sample)
            if not sniffer.has_header(sample):
                raise ValueError(MonogramData.__no_header_msg)
            fh.seek(0)
            reader = csv_reader(fh, dialect)
            columns = None
            for row in reader:
                if columns is None:
                    # first row is the header: map column name -> index
                    columns = dict((v.strip(), j) for j, v in enumerate(row))
                    missing = set(antibodies) - set(columns.keys())
                    if len(missing):
                        raise ValueError("antibodies ('%s') not found!" %
                                         "', '".join(missing))
                    continue
                if not row:
                    # blank lines come back as empty rows; nothing to read
                    continue
                acc = underdash.sub(r'_\1', row[0])
                if acc not in ic50s:
                    continue
                try:
                    cln_ic50s = [
                        float(row[columns[ab]].strip().lstrip('<>'))
                        for ab in antibodies
                        if ab in columns and columns[ab] < len(row)
                    ]
                except ValueError:
                    # non-numeric cell: this row contributes no value
                    continue
                ic50s[acc].extend(cln_ic50s)

        kept = []
        for r in seqrecords:
            if not ic50s.get(r.id):
                warn("skipping sequence '%s', VALUE not found" % r.id)
                continue
            values = {'IC50': ic50s[r.id]}
            r.description = json_dumps({
                'ab': antibodies[0],
                'values': values
            })
            r.annotations['antibody'] = values
            kept.append(r)

        return kept, clonal, antibodies
示例#9
0
    def seqrecords(self, antibodies, clonal=False):
        """Load the sequences that have NEUT values for *antibodies*.

        Args:
            antibodies: iterable of antibody names to look up.
            clonal: when true, only SEQUENCE rows with ``IS_CLONAL = 1``
                are selected.

        Returns:
            A ``(seqrecords, clonal, antibodies)`` tuple: the list of
            records, the *clonal* flag as given, and the sorted tuple of
            antibody names actually queried (the requested names plus any
            alternate ids found in the ANTIBODY table).
        """
        conn = connect(self.__filename)
        # try/finally so the connection is released even when a query or
        # the construction of a record fails
        try:
            cur = conn.cursor()

            antibodies_ = set(antibodies)

            ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

            # NOTE(review): only the first matching row is consulted, as
            # before; alternate ids on further rows are not picked up.
            cur.execute(
                'select distinct ALT_IDS from ANTIBODY where %s' % ab_clause,
                tuple(antibodies_)
                )
            first = cur.fetchone()
            # no matching row, or a NULL column, means "no alternate ids"
            alt_ids = (first[0] if first is not None else None) or ''
            equivalencies = set(alt_ids.split(',')) - set([''])

            if equivalencies:
                antibodies_ |= equivalencies
                ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

            antibodies__ = tuple(sorted(antibodies_))

            stmt = dedent('''\
                select distinct SG.NO as NO, SG.ID as ID, SG.SEQ as SEQ, SG.SUBTYPE as SUBTYPE, ? as AB, N.VALUE as VALUE from
                (select NO, S.ID as ID, SUBTYPE, SEQ from
                    (select SEQUENCE_NO as NO, SEQUENCE_ID as ID, RAW_SEQ as SEQ from SEQUENCE {0:s} group by ID) as S left join
                    (select SEQUENCE_ID as ID, SUBTYPE from GENO_REPORT group by ID) as G
                    on S.ID = G.ID
                ) as SG join
                (select SEQUENCE_ID as ID, ANTIBODY as AB, group_concat(TYPE || ':' || VALUE, ',') as VALUE from NEUT where ({1:s}) group by ID) as N
                on SG.ID = N.ID order by SG.ID;
                '''.format('where IS_CLONAL = 1' if clonal else '', ab_clause))
            # first placeholder is the AB column label, the rest fill ab_clause
            params = ('+'.join(antibodies__),) + antibodies__

            cur.execute(stmt, params)

            def records():
                """Yield one SeqRecord per result row that has usable values."""
                ids = {}
                for row in cur:
                    sid, seq, subtype, ab, values = row[1:6]
                    values_ = {}
                    for kv in (values or '').split(','):
                        # a pair without exactly one ':' ends up with a
                        # non-numeric value and is skipped below
                        k, _sep, v = kv.partition(':')
                        try:
                            v_ = float(v.strip().lstrip('<>'))
                        except ValueError:
                            continue
                        values_.setdefault(k, []).append(v_)
                    if not values_:
                        warn("skipping sequence '%s', invalid values '%s'" % (sid, values))
                        continue
                    record = SeqRecord(
                        Seq(OrfList(seq, include_stops=False)[0], DNAAlphabet),
                        id=sid,
                        description=json_dumps({
                            'subtype': '' if subtype is None else subtype,
                            'ab': ab,
                            'values': values_
                            }),
                        annotations={'antibody': values_, 'subtype': subtype}
                        )
                    # repeated ids get a "-<n>" suffix to stay distinct
                    if sid in ids:
                        record.id += str(-ids[sid])
                        ids[sid] += 1
                    else:
                        ids[sid] = 1
                    yield record

            source = Verifier(records(), DNAAlphabet)
            try:
                seqrecords = list(source)
            except VerifyError:
                source.set_alphabet(AminoAlphabet)
                seqrecords = list(source)
        finally:
            conn.close()

        return seqrecords, clonal, antibodies__
示例#10
0
    def seqrecords(self, antibodies, clonal=False):
        """Load the FASTA records that have a value for the given antibody.

        Sequences are read from the FASTA file and matched, by id, against
        the first column of the CSV file; the antibody's column supplies
        the values. Ids on both sides are normalized so that a trailing
        ``-<digits>`` or ``_<digits>`` becomes ``_<digits>``.

        Args:
            antibodies: sequence holding at most one antibody name; the
                name must be a column header of the CSV file.
            clonal: must be false.

        Returns:
            A ``(seqrecords, clonal, antibodies)`` tuple; records without
            any numeric value are dropped (with a warning).

        Raises:
            ValueError: *clonal* is set, more than one antibody is given,
                the CSV file has no header, or the antibody is not among
                its columns.
        """
        if clonal:
            raise ValueError('clonal property is not available with Monogram datasets')
        if len(antibodies) > 1:
            raise ValueError('only one antibody can be interrogated with Monogram datasets')

        with open(self.__fastafile) as h:
            source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
            try:
                seqrecords = list(source)
            except VerifyError:
                source.set_alphabet(AminoAlphabet)
                seqrecords = list(source)

        underdash = re_compile(r'[_-](\d+)$')
        for r in seqrecords:
            r.id = underdash.sub(r'_\1', r.id)

        ic50s = dict((r.id, []) for r in seqrecords)

        with open(self.__csvfile) as fh:
            sample = fh.read(MonogramData.__sample_len)
            sniffer = csv_sniffer()
            dialect = sniffer.sniff(sample)
            if not sniffer.has_header(sample):
                raise ValueError(MonogramData.__no_header_msg)
            fh.seek(0)
            reader = csv_reader(fh, dialect)
            columns = None
            for row in reader:
                if columns is None:
                    # first row is the header: map column name -> index
                    columns = dict((v.strip(), j) for j, v in enumerate(row))
                    missing = set(antibodies) - set(columns.keys())
                    if len(missing):
                        raise ValueError("antibodies ('%s') not found!" % "', '".join(missing))
                    continue
                if not row:
                    # blank lines come back as empty rows; nothing to read
                    continue
                acc = underdash.sub(r'_\1', row[0])
                if acc not in ic50s:
                    continue
                try:
                    cln_ic50s = [float(row[columns[ab]].strip().lstrip('<>'))
                                 for ab in antibodies
                                 if ab in columns and columns[ab] < len(row)]
                except ValueError:
                    # non-numeric cell: this row contributes no value
                    continue
                ic50s[acc].extend(cln_ic50s)

        kept = []
        for r in seqrecords:
            if not ic50s.get(r.id):
                warn("skipping sequence '%s', VALUE not found" % r.id)
                continue
            values = {'IC50': ic50s[r.id]}
            r.description = json_dumps({
                'ab': antibodies[0],
                'values': values
                })
            r.annotations['antibody'] = values
            kept.append(r)

        return kept, clonal, antibodies