示例#1
0
class GlyTouCanUtil(object):
    _wurcs_mono_format = WURCS20MonoFormat()
    _wurcs_format = WURCS20Format()
    _glycoct_format = GlycoCTFormat()
    _alphamap = None

    def getUnsupportedCodes(self, acc):
        """Collect the reasons the WURCS sequence of ``acc`` fails to parse.

        Every monosaccharide entry of the WURCS sequence is parsed on its
        own, then the sequence is parsed as a whole, and each failure is
        filed into one of four sets.

        Args:
            acc: accession handed to ``self.getseq`` to fetch the WURCS text.

        Returns:
            A 4-tuple of sets ``(codes, substs, invalid, other)``:
            the last token of the message of every unsupported-skeleton,
            unsupported-substituent and invalid-monosaccharide error, plus
            short labels ("0+ link count", "undetermined link count",
            "circular", "bad link count") for whole-sequence failures.
            All four sets are empty when no sequence is available.
        """
        codes = set()
        substs = set()
        invalid = set()
        other = set()
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return codes, substs, invalid, other

        def offender(err):
            # The offending item is the last whitespace-separated token of
            # the error message. Python 3 exceptions carry no ``message``
            # attribute unless the class sets one, so fall back to str(err).
            text = getattr(err, 'message', None)
            if text is None:
                text = str(err)
            return text.rsplit(None, 1)[-1]

        # The monosaccharide list is the first "[...][...]" section. A
        # sequence without one has nothing to check individually; the
        # whole-sequence parse below still runs.
        try:
            monos = sequence.split('/[', 1)[1].split(']/')[0].split('][')
        except IndexError:
            monos = []

        for m in monos:
            try:
                self._wurcs_mono_format.parsing(m)
            except UnsupportedSkeletonCodeError as e:
                codes.add(offender(e))
            except UnsupportedSubstituentError as e:
                substs.add(offender(e))
            except InvalidMonoError as e:
                invalid.add(offender(e))
            except GlycanParseError:
                # Any other parse failure is deliberately not reported.
                pass

        try:
            self._wurcs_format.toGlycan(sequence)
        except ZeroPlusLinkCountError:
            other.add("0+ link count")
        except UndeterminedLinkCountError:
            other.add("undetermined link count")
        except CircularError:
            other.add("circular")
        except LinkCountError:
            other.add("bad link count")
        except GlycanParseError:
            # Remaining parse failures carry no label of their own.
            pass
        return codes, substs, invalid, other

    def getGlycan(self, acc, format=None):
        """Return the parsed glycan for ``acc``, or None if none can be built.

        Args:
            acc: accession handed to ``self.getseq``.
            format: 'wurcs' or 'glycoct' to restrict the attempt to that
                sequence format; a falsy value tries WURCS first and then
                GlycoCT.

        Returns:
            The result of the matching formatter's ``toGlycan``, or None when
            no sequence is available or every attempted parse raises
            ``GlycanParseError``.
        """
        attempts = (
            ('wurcs', self._wurcs_format),
            ('glycoct', self._glycoct_format),
        )
        for name, parser in attempts:
            # Skip formats the caller explicitly excluded.
            if format and format != name:
                continue
            text = self.getseq(acc, name)
            if not text:
                continue
            try:
                return parser.toGlycan(text)
            except GlycanParseError:
                # Unparseable in this format; fall through to the next one.
                continue
        return None

    def glycoct(self, acc, fetch=None):
        """Return the GlycoCT rendering of the glycan for ``acc``, or None.

        ``fetch`` is forwarded to ``getGlycan`` as its format restriction.
        None is returned when no glycan can be obtained or when rendering
        raises ``RuntimeError``.
        """
        glycan = self.getGlycan(acc, fetch)
        if glycan:
            try:
                return glycan.glycoct()
            except RuntimeError:
                # Rendering failed; report "not available" to the caller.
                return None
        return None

    def umw(self, acc, fetch=None):
        """Return the underivatized molecular weight for ``acc``, or None.

        ``fetch`` is forwarded to ``getGlycan`` as its format restriction.
        None is returned when no glycan can be obtained or when the weight
        computation raises ``LookupError`` or ``ValueError``.
        """
        glycan = self.getGlycan(acc, fetch)
        if glycan:
            try:
                return glycan.underivitized_molecular_weight()
            except (LookupError, ValueError):
                # Weight cannot be computed for this glycan.
                return None
        return None

    def wurcs2glycoct(self, acc):
        """Convert the WURCS sequence of ``acc`` to GlycoCT via the GlyCosmos API.

        Args:
            acc: accession handed to ``self.getseq`` to fetch the WURCS text.

        Returns:
            The value stored under "GlycoCT" in the JSON reply, or None when
            there is no WURCS sequence, the reply has no such key, or a
            ``ValueError`` (including a JSON decoding error) is raised.
            Network errors raised by ``urlopen`` propagate to the caller.
        """
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return None
        url = ('https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/wurcs2glycoct/'
               + urllib.parse.quote_plus(sequence))
        try:
            # The ``with`` block closes the HTTP response even if reading or
            # decoding fails.
            with urllib.request.urlopen(url) as response:
                data = json.loads(response.read())
            if 'GlycoCT' in data:
                return data['GlycoCT']
        except ValueError:
            # Best effort: an undecodable reply means "no conversion".
            pass
        return None

    def subsumptionbyapi(self, acc):
        """Print the subsumption triples the GlyCosmos API reports for ``acc``.

        The WURCS sequence of ``acc`` is sent to the subsumption endpoint.
        Each JSON record's "S", "P" and "O" values are stringified and
        stripped, then the distinct triples are printed in sorted order,
        grouped by their first element: the group value on its own line,
        followed by one tab-joined "P<TAB>O" line per triple, with a blank
        line between groups. Lines whose third element equals the queried
        sequence are prefixed with ">>  ", all others with four spaces.

        Nothing is printed when no WURCS sequence is available.

        Args:
            acc: accession handed to ``self.getseq`` to fetch the WURCS text.
        """
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return
        url = ('https://api.glycosmos.org/subsumption/0.2.0/'
               + urllib.parse.quote_plus(sequence))
        # The ``with`` block guarantees the HTTP response is closed.
        with urllib.request.urlopen(url) as response:
            data = response.read()

        # A set removes duplicate triples; a missing key becomes "None",
        # because the value is passed through str() before stripping.
        triples = {
            tuple(str(record.get(key)).strip() for key in ("S", "P", "O"))
            for record in json.loads(data)
        }

        lasts = None
        for triple in sorted(triples):
            if triple[0] != lasts:
                if lasts is not None:
                    print("")
                print(triple[0])
                lasts = triple[0]
            marker = ">>  " if triple[2] == sequence else "    "
            print(marker + "\t".join(triple[1:]))

    def findskel(self, skel, maxcount=None):
        """Yield accessions whose WURCS sequence has a matching skeleton code.

        Only sequences from ``self.allseq(format='wurcs')`` whose accession
        also has a GlycoCT sequence are considered.

        Args:
            skel: regular-expression source, anchored as ``^skel$`` and
                matched against each monosaccharide entry with everything
                from the first '-' or '_' onwards removed.
            maxcount: optional limit (converted with ``int``) on the number
                of monosaccharide entries; sequences with more are skipped.

        Yields:
            ``(accession, matched_text)`` once per matching entry.

        Raises:
            re.error: if ``skel`` is not a valid pattern. This is raised
                when iteration starts, because the pattern is compiled
                before the first sequence is read.
        """
        if maxcount is not None:
            maxcount = int(maxcount)

        # Both patterns are loop-invariant, so build them once.
        skelpat = re.compile(r"^%s$" % (skel, ))
        monopat = re.compile(r'^(.*?)([-_].*)?$')

        for acc, _fmt, wurcs in self.allseq(format='wurcs'):
            # The GlycoCT text itself is unused; it acts only as a filter.
            if not self.getseq(acc, format='glycoct'):
                continue
            monos = wurcs.split('/[', 1)[1].split(']/')[0].split('][')
            if maxcount is not None and len(monos) > maxcount:
                continue
            for mono in monos:
                msk = monopat.search(mono).group(1)
                assert msk
                m = skelpat.search(msk)
                if m:
                    yield acc, m.group(0)

    def multiseq(self):
        """Yield every (accession, format) pair with more than one sequence.

        All records from ``self.allseq()`` are read first; pairs are then
        yielded in the order they were first seen.
        """
        distinct = defaultdict(set)
        for record in self.allseq():
            acc, fmt, seq = record
            distinct[acc, fmt].add(seq)
        for key in list(distinct):
            if len(distinct[key]) >= 2:
                yield key

    def fixcompwurcs(self, wurcsseq, subst=()):
        """Rewrite a WURCS string whose link count is "0+" with explicit links.

        The string is split on its first two '/' into a prefix, a
        "unodes,nodes,edges" counts field and the remainder. The "0+" edge
        count is replaced by ``nodes - 1`` identical fully ambiguous links
        of the form ``a?|b?|...}-{a?|b?|...`` (one letter per node), which
        are joined with '_' and appended to the remainder.

        Args:
            wurcsseq: WURCS text whose counts field has '0+' as edge count.
            subst: optional substituents, either a mapping or an iterable of
                ``(substituent, count)`` pairs. For each pair, ``count``
                entries of the form ``a?|b?|...}<substituent>`` are placed
                before the entries already present, so later pairs end up
                first.

        Returns:
            The rewritten WURCS string, whose edge count is the total number
            of link entries emitted.

        Raises:
            AssertionError: if the edge count does not contain '0+'.
            KeyError: if there are more than 52 nodes (only a-z and A-Z are
                available as node letters).
        """
        if self._alphamap is None:
            # Two-way map between 1-based node positions and letters:
            # 1..26 <-> a..z and 27..52 <-> A..Z.
            letters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
            letters += [chr(c) for c in range(ord('A'), ord('Z') + 1)]
            alphamap = dict()
            for position, letter in enumerate(letters, 1):
                alphamap[position] = letter
                alphamap[letter] = position
            self._alphamap = alphamap

        prefix, counts, rest = wurcsseq.split('/', 2)
        unodes, nodes, edges = counts.split(',')
        nodes = int(nodes)
        assert '0+' in edges, "expected a '0+' edge count in %r" % (counts, )

        ambignode = "|".join(
            "%s?" % (self._alphamap[i], ) for i in range(1, nodes + 1))
        ambigedge = "%s}-{%s" % (ambignode, ambignode)
        ambigedges = [ambigedge] * (nodes - 1)

        if hasattr(subst, 'items'):
            subst = list(subst.items())
        for sub, cnt in subst:
            # Prepend ``cnt`` identical substituent entries in one step.
            ambigedges[:0] = ["%s}%s" % (ambignode, sub)] * cnt

        return "%s/%s,%d,%d/%s%s" % (prefix, unodes, nodes, len(ambigedges),
                                     rest, "_".join(ambigedges))

    def anyglycan2wurcs(self, glycan):
        """Return a WURCS string for a glycan object or a sequence string.

        Args:
            glycan: either a ``Glycan.Glycan`` instance or sequence text.

        Returns:
            For a ``Glycan.Glycan``: its GlycoCT rendering converted with
            ``glycoct2wurcs``, additionally passed through ``fixcompwurcs``
            when the result contains '0+'. For text starting with 'RES'
            (after stripping): the result of ``glycoct2wurcs``. For any
            other text: the input with runs of blank lines collapsed.
        """
        if isinstance(glycan, Glycan.Glycan):
            # The class only defines ``_glycoct_format``; reading the
            # undefined ``glycoct_format`` attribute raised AttributeError.
            # An explicitly assigned ``glycoct_format`` is still honoured.
            formatter = (getattr(self, 'glycoct_format', None)
                         or self._glycoct_format)
            sequence = self.glycoct2wurcs(formatter.toStr(glycan))
            if '0+' in sequence:
                sequence = self.fixcompwurcs(sequence)
            return sequence

        sequence = re.sub(r'\n\n+', r'\n', glycan)
        if sequence.strip().startswith('RES'):
            # NOTE(review): the original text, not the blank-line-collapsed
            # copy, is what gets converted; confirm this is intended.
            sequence = self.glycoct2wurcs(glycan)
        return sequence

    def glycoct2wurcs(self, seq):
        """Convert GlycoCT text to WURCS through the GlyCosmos converter API.

        Args:
            seq: GlycoCT text; it is percent-encoded in full (no characters
                are treated as safe) and appended to the request URL.

        Returns:
            The "WURCS" value of the JSON reply with surrounding whitespace
            stripped.

        Raises:
            ValueError: if the reply has no "WURCS" entry, or is not valid
                JSON (``json.JSONDecodeError`` is a ``ValueError``).
            urllib.error.URLError: on network failure (raised by ``urlopen``).
        """
        requestURL = "https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/glycoct2wurcs/"
        requestURL += urllib.parse.quote(seq, safe='')
        req = urllib.request.Request(requestURL)
        # self.wait()
        # The ``with`` block guarantees the HTTP response is closed.
        with urllib.request.urlopen(req) as response:
            result = json.loads(response.read())

        try:
            wurcs = result["WURCS"]
        except (KeyError, TypeError, IndexError):
            # Missing key, or a reply that is not a JSON object at all.
            raise ValueError("GlycoCT 2 WURCS conversion failed")

        return wurcs.strip()
示例#2
0
#!/usr/bin/env python27

import sys, os, os.path
import findpygly
from pygly.GlyTouCan import GlyTouCan
from pygly.GlycanFormatter import GlycoCTFormat

glycoct_format = GlycoCTFormat()
gtc = GlyTouCan()

for l in sys.stdin:
    acc = l.strip()
    g = gtc.getGlycan(acc)
    if g and g.fully_determined():
        print acc, True
        if not os.path.exists('%s.txt' % (acc, )):
            seq = gtc.getseq(acc, 'glycoct')
            if not seq:
                try:
                    seq = glycoct_format.toStr(g)
                except:
                    pass
            if seq:
                wh = open('%s.txt' % (acc, ), 'w')
                wh.write(seq.strip() + '\n')
                wh.close()
    else:
        print acc, (False if g else None)
示例#3
0
for r in rows:
    id = r["ID"]
    name = r["Name"]
    iupacseq = r["IUPAC"]

    accession = "%06d" % id
    if not iupacseq:
        continue

    if "n-" in iupacseq:
        print "%s Contains repeats" % accession
        continue

    try:
        gobj = imparser.toGlycan(iupacseq)
        glycoct = gparser.toStr(gobj)
        wurcs = gtc.glycoct2wurcs(glycoct)
        glytoucan = gtc.register(wurcs)[0]

    except:
        print "%s is not able to load to Glycomotif" % accession
        print iupacseq
        continue

    aglycon = None
    redend = None
    for agly in possibleaglycon:
        if agly in iupacseq:
            aglycon = agly
            if aglycon in reaglycon:
                redend = True
示例#4
0
        key = sys.argv[i]
        try:
            value = sys.argv[i + 1]
            value = float(sys.argv[i + 1])
            value = int(sys.argv[i + 1])
        except ValueError:
            pass
        getargs[key] = value
    if count:
        cnt = gdb.count(**getargs)
        print dbname, cnt
    elif glycoct:
        fmt1 = GlycoCTFormat()
        zf = ZipFile(dbname.rsplit('.', 1)[0] + '.gct', 'w', ZIP_DEFLATED)
        for r in gdb.get(**getargs):
            zf.writestr("%s.txt" % r.accession, fmt1.toStr(r.glycan))
        zf.close()
    else:
        fmt = LinearCodeFormat()
        for r in gdb.get(**getargs):
            if r['lincode']:
                lc = fmt.toStr(r.glycan)
                print r.accession, r['nlinked'], r.get(
                    'oxford', "-"), r['molecular_weight'], r['composition'], lc
                print r
            else:
                print r.accession, r['nlinked'], r.get(
                    'oxford', "-"), r['molecular_weight'], r['composition']
                print r
else:
    print >> sys.stderr, "Bad command:" + sys.argv[1]