def anyglycan2wurcs(self, glycan):
    """Return a WURCS sequence for a glycan object or a sequence string.

    A ``Glycan.Glycan`` instance is serialised with ``self.glycoct_format``
    (created on demand when falsy), converted with ``self.glycoct2wurcs``
    and, if the result contains ``'0+'``, passed through
    ``self.fixcompwurcs``.

    Any other value is treated as text: runs of blank lines are collapsed
    and, when the text starts with ``RES``, the input is converted with
    ``self.glycoct2wurcs``; otherwise the collapsed text is returned as is.
    """
    if not isinstance(glycan, Glycan.Glycan):
        collapsed = re.sub(r'\n\n+', r'\n', glycan)
        if not collapsed.strip().startswith('RES'):
            return collapsed
        # The converter receives the caller's text, not the collapsed copy.
        return self.glycoct2wurcs(glycan)

    if not self.glycoct_format:
        self.glycoct_format = GlycoCTFormat()
    glycoct_text = self.glycoct_format.toStr(glycan)
    wurcs = self.glycoct2wurcs(glycoct_text)
    if '0+' in wurcs:
        wurcs = self.fixcompwurcs(wurcs)
    return wurcs
#!/bin/env python27
# For every glycan in the wiki, print the accession and the GlycoCT form of
# each monosaccharide whose composition lookup in the permethyl table raises
# KeyError.

import sys

# NOTE(review): GlycanData is instantiated before the pygly imports, as in
# the original; the import order is kept in case it is significant.
from getwiki import GlycanData
w = GlycanData()

import findpygly
from pygly.CompositionTable import PermethylCompositionTable
from pygly.GlycanFormatter import GlycoCTFormat

pctable = PermethylCompositionTable()
glycoctformat = GlycoCTFormat()

for g in w.iterglycan():
    glycan = g.getGlycan()
    if not glycan:
        continue
    for m in glycan.all_nodes():
        try:
            m.composition(pctable)
        except KeyError:
            # sys.stdout.write produces the same "a b" line as the old
            # print statement and is valid on both Python 2 and Python 3.
            sys.stdout.write(
                "%s %s\n" % (g.get('accession'), glycoctformat.mtoStr(m)))
class GlyTouCanUtil(object):
    """Utility methods built on top of a GlyTouCan sequence source.

    The methods call ``self.getseq`` and ``self.allseq``, which are not
    defined in this class.
    NOTE(review): presumably this class is mixed into / subclassed by a
    resource class that supplies them -- confirm against its users.
    """

    # Shared parser/formatter instances.
    _wurcs_mono_format = WURCS20MonoFormat()
    _wurcs_format = WURCS20Format()
    _glycoct_format = GlycoCTFormat()
    # Lazily built two-way map between 1..52 and a-z, A-Z (see fixcompwurcs).
    _alphamap = None

    @staticmethod
    def _last_word(exc):
        """Return the last whitespace-separated word of an exception message.

        Uses ``exc.message`` when the exception has that attribute and falls
        back to ``str(exc)`` otherwise (built-in exceptions have no
        ``message`` attribute on Python 3).
        """
        message = getattr(exc, 'message', None)
        if message is None:
            message = str(exc)
        return message.rsplit(None, 1)[-1]

    def getUnsupportedCodes(self, acc):
        """Classify the reasons the WURCS sequence of ``acc`` fails to parse.

        Returns:
            A 4-tuple of sets ``(codes, substs, invalid, other)``: the last
            message word of unsupported-skeleton, unsupported-substituent
            and invalid-monosaccharide errors raised while parsing each
            monosaccharide, and descriptions of link-count / circularity
            errors raised while parsing the whole sequence.  All four sets
            are empty when no WURCS sequence is available.
        """
        codes = set()
        substs = set()
        invalid = set()
        other = set()
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return codes, substs, invalid, other

        # The unique-residue list is the first "[...][...]" section.
        monos = sequence.split('/[', 1)[1].split(']/')[0].split('][')
        for m in monos:
            try:
                self._wurcs_mono_format.parsing(m)
            except UnsupportedSkeletonCodeError as e:
                codes.add(self._last_word(e))
            except UnsupportedSubstituentError as e:
                substs.add(self._last_word(e))
            except InvalidMonoError as e:
                invalid.add(self._last_word(e))
            except GlycanParseError:
                pass

        try:
            self._wurcs_format.toGlycan(sequence)
        except ZeroPlusLinkCountError:
            other.add("0+ link count")
        except UndeterminedLinkCountError:
            other.add("undetermined link count")
        except CircularError:
            other.add("circular")
        except LinkCountError:
            other.add("bad link count")
        except GlycanParseError:
            pass
        return codes, substs, invalid, other

    def getGlycan(self, acc, format=None):
        """Return the parsed glycan for ``acc``, or None.

        With ``format`` unset, WURCS is tried first and GlycoCT second;
        ``format='wurcs'`` or ``format='glycoct'`` restricts the attempt to
        that one format.  Parse errors are treated as "not available".
        """
        if not format or (format == 'wurcs'):
            sequence = self.getseq(acc, 'wurcs')
            if sequence:
                try:
                    return self._wurcs_format.toGlycan(sequence)
                except GlycanParseError:
                    pass  # traceback.print_exc()
        if not format or (format == 'glycoct'):
            sequence = self.getseq(acc, 'glycoct')
            if sequence:
                try:
                    return self._glycoct_format.toGlycan(sequence)
                except GlycanParseError:
                    pass
        return None

    def glycoct(self, acc, fetch=None):
        """Return ``glycoct()`` of the parsed glycan, or None on failure."""
        g = self.getGlycan(acc, fetch)
        if not g:
            return None
        try:
            return g.glycoct()
        except RuntimeError:
            pass
        return None

    def umw(self, acc, fetch=None):
        """Return the underivatized molecular weight of ``acc``, or None."""
        g = self.getGlycan(acc, fetch)
        if not g:
            return None
        try:
            return g.underivitized_molecular_weight()
        except (LookupError, ValueError):
            pass
        return None

    def wurcs2glycoct(self, acc):
        """Convert the WURCS sequence of ``acc`` via the GlyCosmos web API.

        Returns the ``GlycoCT`` entry of the JSON response, or None when
        there is no WURCS sequence, the response is not valid JSON, or the
        response has no ``GlycoCT`` entry.
        """
        sequence = self.getseq(acc, 'wurcs')
        if sequence:
            sequence1 = urllib.parse.quote_plus(sequence)
            url = 'https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/wurcs2glycoct/' + sequence1
            try:
                data = json.loads(urllib.request.urlopen(url).read())
                if 'GlycoCT' in data:
                    return data['GlycoCT']
            except ValueError:
                pass
        return None

    def subsumptionbyapi(self, acc):
        """Print the subsumption triples the GlyCosmos API reports for ``acc``.

        Triples are de-duplicated and sorted, grouped under their subject;
        rows whose object equals the query sequence are marked with ``>>``.
        Nothing is printed when ``acc`` has no WURCS sequence.
        """
        sequence = self.getseq(acc, 'wurcs')
        if sequence:
            sequence1 = urllib.parse.quote_plus(sequence)
            url = 'https://api.glycosmos.org/subsumption/0.2.0/' + sequence1
            data = urllib.request.urlopen(url).read()
            triples = sorted(
                tuple(str(t.get(k)).strip() for k in ("S", "P", "O"))
                for t in json.loads(data))
            seen = set()
            lasts = None
            for triple in triples:
                if triple in seen:
                    continue
                seen.add(triple)
                if triple[0] != lasts:
                    # Blank line between subject groups, not before the first.
                    if lasts is not None:
                        print("")
                    print(triple[0])
                    lasts = triple[0]
                if triple[2] == sequence:
                    print(">> " + "\t".join(triple[1:]))
                else:
                    print(" " + "\t".join(triple[1:]))

    def findskel(self, skel, maxcount=None):
        """Yield ``(accession, skeleton)`` for residues matching ``skel``.

        ``skel`` is a regular expression that must match the whole skeleton
        code (the residue text before the first ``-`` or ``_``).  Only
        accessions that also have a GlycoCT sequence are considered, and,
        when ``maxcount`` is given, only those with at most ``maxcount``
        unique residues.
        """
        if maxcount is not None:
            maxcount = int(maxcount)
        skel_re = re.compile(r"^%s$" % (skel, ))
        for acc, fmt, wurcs in self.allseq(format='wurcs'):
            glycoct = self.getseq(acc, format='glycoct')
            if not glycoct:
                continue
            monos = wurcs.split('/[', 1)[1].split(']/')[0].split('][')
            if maxcount is not None and len(monos) > maxcount:
                continue
            for mono in monos:
                msk = re.search(r'^(.*?)([-_].*)?$', mono).group(1)
                assert msk
                m = skel_re.search(msk)
                if m:
                    yield acc, m.group(0)

    def multiseq(self):
        """Yield ``(accession, format)`` keys with more than one sequence."""
        counts = defaultdict(set)
        for acc, fmt, seq in self.allseq():
            counts[(acc, fmt)].add(seq)
        for k, v in counts.items():
            if len(v) > 1:
                yield k

    def fixcompwurcs(self, wurcsseq, subst=()):
        """Rewrite a ``0+`` link-count WURCS into explicit ambiguous links.

        Args:
            wurcsseq: WURCS string whose count section (second ``/`` field)
                is ``unique,nodes,edges`` with ``0+`` in the edge count.
            subst: ``(substituent, count)`` pairs, or a mapping of them;
                each occurrence adds one ambiguous substituent link ahead
                of the residue-residue links.

        Returns:
            The WURCS string with ``nodes - 1`` ambiguous residue links plus
            the substituent links appended, and the edge count updated.

        Raises:
            AssertionError: if the edge count does not contain ``0+``.
        """
        if not self._alphamap:
            # Position <-> letter: 1..26 -> a..z, 27..52 -> A..Z.
            self._alphamap = dict()
            for i, c in enumerate(range(ord('a'), ord('z') + 1)):
                self._alphamap[i + 1] = chr(c)
                self._alphamap[chr(c)] = (i + 1)
            for i, c in enumerate(range(ord('A'), ord('Z') + 1)):
                self._alphamap[i + 1 + 26] = chr(c)
                self._alphamap[chr(c)] = (i + 1 + 26)

        prefix, counts, rest = wurcsseq.split('/', 2)
        unodes, nodes, edges = counts.split(',')
        nodes = int(nodes)
        assert '0+' in edges
        edges = (nodes - 1)

        # "a?|b?|..." -- any position of any residue.
        ambignode = "|".join(
            ["%s?" % (self._alphamap[i], ) for i in range(1, nodes + 1)])
        ambigedge = "%s}-{%s" % (ambignode, ambignode)
        ambigedges = [ambigedge] * edges

        if hasattr(subst, 'items'):
            subst = list(subst.items())
        for sub, cnt in subst:
            for _ in range(cnt):
                # Each substituent link goes to the front, so later
                # substituents end up ahead of earlier ones.
                ambigedges.insert(0, "%s}%s" % (ambignode, sub))

        return "%s/%s,%d,%d/%s%s" % (prefix, unodes, nodes, len(ambigedges),
                                     rest, "_".join(ambigedges))

    def anyglycan2wurcs(self, glycan):
        """Return a WURCS sequence for a glycan object or a sequence string.

        A ``Glycan.Glycan`` instance is written as GlycoCT, converted with
        ``glycoct2wurcs`` and, if the result contains ``'0+'``, repaired with
        ``fixcompwurcs``.  Any other value is treated as text: runs of blank
        lines are collapsed and text starting with ``RES`` is converted with
        ``glycoct2wurcs``; other text is returned after the collapse.
        """
        sequence = ""
        if isinstance(glycan, Glycan.Glycan):
            # Prefer a `glycoct_format` attribute if one has been provided;
            # otherwise use the formatter this class defines.  Reading
            # `self.glycoct_format` directly raised AttributeError because
            # the class attribute is named `_glycoct_format`.
            formatter = getattr(self, 'glycoct_format', None) or self._glycoct_format
            sequence = self.glycoct2wurcs(formatter.toStr(glycan))
            if '0+' in sequence:
                sequence = self.fixcompwurcs(sequence)
        else:
            sequence = re.sub(r'\n\n+', r'\n', glycan)
            if sequence.strip().startswith('RES'):
                # NOTE(review): the unmodified input, not the collapsed
                # text, is sent to the converter -- confirm this is intended.
                sequence = self.glycoct2wurcs(glycan)
        return sequence

    def glycoct2wurcs(self, seq):
        """Convert GlycoCT text to WURCS via the GlyCosmos web API.

        Raises:
            ValueError: if the JSON response has no ``WURCS`` entry.
        """
        requestURL = "https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/glycoct2wurcs/"
        encodedseq = urllib.parse.quote(seq, safe='')
        requestURL += encodedseq
        req = urllib.request.Request(requestURL)
        # self.wait()
        response = urllib.request.urlopen(req).read()
        result = json.loads(response)
        try:
            wurcs = result["WURCS"]
        except (KeyError, TypeError, IndexError):
            # Missing key, or a response that is not a JSON object.
            raise ValueError("GlycoCT 2 WURCS conversion failed")
        return wurcs.strip()
#!/bin/env python27 import os, sys import json import findpygly from pygly.GlycanFormatter import GlycoCTFormat, WURCS20Format from pygly.GlyTouCan import GlyTouCan g = GlyTouCan() wurcs_parser = WURCS20Format() glycoct_parser = GlycoCTFormat() wpath = "dumps/wurcs" gpath = "dumps/glycoct" wlist = os.listdir(wpath) glist = os.listdir(gpath) alllist = list(set(wlist+glist)) print "Total glycan number %s" % len(alllist) glycanobj = {} for filename in alllist: acc = filename.rstrip(".txt") try: gseq = open(os.path.join(gpath, filename)).read().strip() obj = glycoct_parser.toGlycan(gseq) except: try:
#!/bin/env python27 import sys from collections import defaultdict from getwiki import GlycanData, Glycan from pygly.GlycanFormatter import GlycoCTFormat w = GlycanData() glycoctformat = GlycoCTFormat() monosdb = {} f = open(sys.argv[1], 'r') for line in f: k, v = line.split() monosdb[k] = v for g in w.iterglycan(): acc = g.get('accession') monodbids = set() glycan = g.getGlycan() if not glycan: continue for m in glycan.all_nodes(): try: glycoctsym = glycoctformat.mtoStr(m) except KeyError: continue try: monodbids.add(monosdb[glycoctsym]) except KeyError:
import sys, re from collections import defaultdict # from getwiki import GlycanData import findpygly from pygly.Glycan import Glycan from pygly.GlycanFormatter import GlycoCTFormat from pygly.GlycanResource import GlyTouCan, GlyCosmos # w = GlycanData() glycoctformat = GlycoCTFormat() basecomp = {'x-HEX-x:x':'Hex', 'x-HEX-x:x||(2d:1)n-acetyl':'HexNAc', 'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d||(5d:1)n-acetyl':'NeuAc', 'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d||(5d:1)n-glycolyl':'NeuGc', 'x-HEX-x:x|6:d':'dHex', 'x-lgal-HEX-x:x|6:d':'Fuc', 'x-PEN-x:x':'Pent', 'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d':'KDN', 'x-HEX-x:x|6:a':'HexA', 'phosphate':'P', 'sulfate':'S'} badskel = set(""" axxxxh-1x """.split()) # NeuAc, NeuGc, KDN, Fuc, Hex, HexNAc, dHex, HexA, Pent expskel = set(""" AUd21122h_5*NCC/3=O
from collections import defaultdict from pygly.GlycanFormatter import WURCS20Format, GlycoCTFormat # Has to come first to pick out the --smwenv PROD command-line argument. from getwiki import GlycoMotifWiki, AllMotif w = GlycoMotifWiki() topology_file_path = sys.argv[1] non_file_path = sys.argv[2] red_file_path = sys.argv[3] wp = WURCS20Format() gp = GlycoCTFormat() class RootMonosaccharideTopoLeq(alignment.MonosaccharideImageEqual): def leq(self, a, b): return self.eq(a, b) class MonosaccharideTopoLeq(alignment.MonosaccharideTopoEqual): def leq(self, a, b): return self.eq(a, b) class LinkageTopoLeq(alignment.LinkageTopoEqual):
class Glycan(SMW.SMWClass):
    """Wiki page object for a glycan, carrying a keyed set of annotations.

    Annotations are stored in ``self._annotations`` (created on first use),
    indexed by each annotation's ``key()``.
    """
    template = 'Glycan'

    @staticmethod
    def pagename(**kwargs):
        """Return the page name, which is the (required) accession."""
        assert kwargs.get('accession')
        return kwargs.get('accession')

    def toPython(self, data):
        """Delegate to the base-class conversion."""
        data = super(Glycan, self).toPython(data)
        # if '_subobjs' in data:
        #     del data['_subobjs']
        return data

    def toTemplate(self, data):
        """Delegate to the base-class conversion."""
        data = super(Glycan, self).toTemplate(data)
        return data

    def set_annotation(self, **kwargs):
        """Store an annotation, replacing any existing one with the same key.

        Pass a ready-made object as ``annotation=...`` or the keyword
        arguments for ``Annotation(...)``.  Annotations whose
        ``goodvalue()`` is false are ignored.
        """
        if 'annotation' in kwargs:
            ann = kwargs.get('annotation')
        else:
            ann = Annotation(**kwargs)
        if not ann.goodvalue():
            return
        if not hasattr(self, '_annotations'):
            self._annotations = dict()
        self._annotations[ann.key()] = ann

    def add_annotation(self, **kwargs):
        """Append ``value`` to the matching annotation's values.

        The remaining keyword arguments select the annotation; when none
        matches, a new annotation holding ``[value]`` is created.
        """
        assert kwargs.get('value')
        value = kwargs.pop('value')
        if self.has_annotations(**kwargs):
            values = self.get_annotation_values(**kwargs)
            values.append(value)
        else:
            values = [value]
        self.set_annotation(value=values, **kwargs)

    def delete_annotations(self, **kwargs):
        """Remove every annotation matching the given filters."""
        if not hasattr(self, '_annotations'):
            return
        # Materialise first: the dict is modified while we go.
        for an in list(self.annotations(**kwargs)):
            del self._annotations[an.key()]

    def count_annotations(self, **kwargs):
        """Return the number of annotations matching the given filters."""
        return len(list(self.annotations(**kwargs)))

    def get_annotation_values(self, property=None, **kwargs):
        """Return the single matching annotation's values as a list of str.

        A real list is returned (not a lazy ``map`` object) because
        ``add_annotation`` appends to the result.
        """
        annotation = self.get_annotation(property=property, **kwargs)
        return [str(v) for v in annotation.get('value')]

    def get_annotation_value(self, property=None, **kwargs):
        """Return the single matching annotation's value as a str."""
        return str(
            self.get_annotation(property=property, **kwargs).get('value'))

    def get_annotation(self, property=None, **kwargs):
        """Return the one annotation matching the filters.

        Raises:
            LookupError: if no annotation, or more than one, matches.
        """
        if property is not None:
            kwargs['property'] = property
        anns = list(self.annotations(**kwargs))
        if len(anns) == 0:
            raise LookupError("No matching annotations")
        elif len(anns) > 1:
            raise LookupError("Too many annotations")
        return anns[0]

    def has_annotations(self, **kwargs):
        """Return True if at least one annotation matches the filters."""
        for an in self.annotations(**kwargs):
            return True
        return False

    def annotations(self, type=None, property=None, source=None,
                    sourceid=None):
        """Yield annotations in key order, filtered by the given fields.

        A filter left as None matches every annotation.
        """
        if hasattr(self, '_annotations'):
            for key, an in sorted(self._annotations.items()):
                if (type is None or an.get('type') == type) and \
                   (property is None or an.get('property') == property) and \
                   (source is None or an.get('source') == source) and \
                   (sourceid is None or an.get('sourceid') == sourceid):
                    yield an

    def __str__(self):
        """Return the base-class text followed by one line per annotation."""
        sl = [super(Glycan, self).__str__()]
        for an in self.annotations():
            sl.append(str(an))
        return "\n".join(sl)

    # Shared parser instances used by getGlycan.
    glycoct_format = GlycoCTFormat()
    wurcs_format = WURCS20Format()

    def getGlycan(self):
        """Parse and return the glycan structure, or None.

        The ``WURCS`` annotation is tried first, then ``GlycoCT``; a missing
        annotation or a parse failure moves on to the next option.
        """
        try:
            sequence = self.get_annotation_value('WURCS')
            return self.wurcs_format.toGlycan(sequence)
        except (LookupError, GlycanParseError, RuntimeError):
            pass
        try:
            sequence = self.get_annotation_value('GlycoCT')
            return self.glycoct_format.toGlycan(sequence)
        except (LookupError, GlycanParseError, RuntimeError):
            pass
        return None
import time import findpygly import pygly.alignment from pygly.GlycanFormatter import GlycoCTFormat, WURCS20Format from pygly.GlycanResource.GlyTouCan import GlyTouCanNoCache from pygly.GlycanResource.GlyCosmos import GlyCosmosNoCache from getwiki import GlycoMotifWiki w = GlycoMotifWiki() if len(sys.argv) > 1: res_file_path = sys.argv[1] # "../data/motif_alignments.tsv" else: res_file_path = None wp = WURCS20Format() gp = GlycoCTFormat() gtc = GlyTouCanNoCache() nodes_cache = pygly.alignment.ConnectedNodesCache() loose_matcher = pygly.alignment.MotifInclusive( connected_nodes_cache=nodes_cache) loose_nred_matcher = pygly.alignment.NonReducingEndMotifInclusive( connected_nodes_cache=nodes_cache) strict_matcher = pygly.alignment.MotifStrict(connected_nodes_cache=nodes_cache) strict_nred_matcher = pygly.alignment.NonReducingEndMotifStrict( connected_nodes_cache=nodes_cache) motif_gobjs = {} for m in w.itermotif():
linkCheck = GlycanLinkCompatibleEitherway() monoCheck = MonosaccharideCompatibleOneway() rootMonoCheck = MonosaccharideCompatibleOneway() if __name__ == "__main__": seq1 = """RES 1b:x-dgal-HEX-1:5 2b:a-dgal-HEX-1:5 LIN 1:1o(3+1)2d""" seq2 = """RES 1b:x-dglc-HEX-1:5 2s:n-acetyl 3b:b-dgal-HEX-1:5 4b:a-dgal-HEX-1:5 LIN 1:1d(2+1)2n 2:1o(4+1)3d 3:3o(4+1)4d""" wurcsp = WURCS20Format() glycoctp = GlycoCTFormat() g1 = glycoctp.toGlycan(seq1) g2 = glycoctp.toGlycan(seq2) mstsa = MotifSearchTopologicalSameAs() print mstsa.get(g1, g2)
#!/bin/env python27 import sys, traceback from getwiki import GlycoMotifWiki, UniCarbMotif w = GlycoMotifWiki() from pygly.GlyTouCan import GlyTouCan gtc = GlyTouCan() from gtccache import GlyTouCanCache gtccache = GlyTouCanCache() from pygly.GlycanFormatter import GlycoCTFormat, IUPACParserExtended1 gparser = GlycoCTFormat() imparser = IUPACParserExtended1() from dataset import XLSXFileTable rows = XLSXFileTable(sys.argv[1]) possibleaglycon = ["Cer", "R", "Ser/Thr"] reaglycon = ["Ser/Thr", "Cer", "Other"] current = set() for r in rows: id = r["ID"] name = r["Name"] iupacseq = r["IUPAC"] accession = "%06d" % id if not iupacseq: continue
#!/usr/bin/env python27
# Read accessions from stdin (one per line).  For each, print the accession
# and True when the glycan is fully determined, False when it parsed but is
# not, and None when no glycan could be obtained.  For fully determined
# glycans with no "<accession>.txt" file yet, write the GlycoCT sequence to
# that file.

import sys, os, os.path
import findpygly
from pygly.GlyTouCan import GlyTouCan
from pygly.GlycanFormatter import GlycoCTFormat

glycoct_format = GlycoCTFormat()
gtc = GlyTouCan()

for l in sys.stdin:
    acc = l.strip()
    g = gtc.getGlycan(acc)

    if not (g and g.fully_determined()):
        sys.stdout.write("%s %s\n" % (acc, (False if g else None)))
        continue

    # sys.stdout.write gives the same output as the old print statement and
    # is valid on both Python 2 and Python 3.
    sys.stdout.write("%s %s\n" % (acc, True))

    filename = '%s.txt' % (acc, )
    if os.path.exists(filename):
        continue

    seq = gtc.getseq(acc, 'glycoct')
    if not seq:
        # Best effort: fall back to generating GlycoCT from the parsed
        # glycan.  Only ordinary errors are ignored, so Ctrl-C still works.
        try:
            seq = glycoct_format.toStr(g)
        except Exception:
            pass
    if seq:
        with open(filename, 'w') as wh:
            wh.write(seq.strip() + '\n')
def _emit(*fields):
    """Write the fields to stdout separated by spaces, newline-terminated.

    Same output as the Python 2 ``print a, b, ...`` statement, but valid on
    both Python 2 and Python 3.
    """
    sys.stdout.write(" ".join("%s" % (f, ) for f in fields) + "\n")


# Command-line arguments are consumed as "key value" pairs starting at
# sys.argv[1]; a value becomes an int if it parses as one, else a float if it
# parses as one, else it stays a string.
getargs = {}
for i in range(1, len(sys.argv), 2):
    key = sys.argv[i]
    value = sys.argv[i + 1]
    try:
        value = float(sys.argv[i + 1])
        value = int(sys.argv[i + 1])
    except ValueError:
        # Keep whichever conversion last succeeded.
        pass
    getargs[key] = value

if count:
    cnt = gdb.count(**getargs)
    _emit(dbname, cnt)
elif glycoct:
    # Export each matching record's GlycoCT text into "<dbname stem>.gct".
    fmt1 = GlycoCTFormat()
    zf = ZipFile(dbname.rsplit('.', 1)[0] + '.gct', 'w', ZIP_DEFLATED)
    try:
        for r in gdb.get(**getargs):
            zf.writestr("%s.txt" % r.accession, fmt1.toStr(r.glycan))
    finally:
        # Close the archive even if a record fails to format.
        zf.close()
else:
    fmt = LinearCodeFormat()
    for r in gdb.get(**getargs):
        if r['lincode']:
            lc = fmt.toStr(r.glycan)
            _emit(r.accession, r['nlinked'], r.get('oxford', "-"),
                  r['molecular_weight'], r['composition'], lc)
            _emit(r)
        else:
            _emit(r.accession, r['nlinked'], r.get('oxford', "-"),
                  r['molecular_weight'], r['composition'])
def substructure_search_init(shared_resources, structure_list_file_path, PPID):
    """Worker loop: load the glycan set once, then serve motif-search tasks.

    Args:
        shared_resources: ``(task_queue, result_queue)`` pair.  Tasks are
            read with ``task_queue.get(block=True)`` and results are written
            with ``result_queue.put``.
        structure_list_file_path: text file with one whitespace-separated
            ``accession sequence`` pair per line; each sequence is parsed
            with ``WURCS20Format``.
        PPID: identifier of this worker, used only in log messages.

    Each task is a dict with the keys ``"id"``, ``"seq"`` (a GlycoCT or
    WURCS motif) and ``"motif_match_position"`` (``"reo"`` restricts matches
    to the root; any other value matches anywhere).  The result dict holds
    the job id, start/end times, the elapsed time, the matching accessions
    and a list of error strings.

    The motif size limit is read from the module-level ``max_motif_size``.
    The function loops forever and never returns.
    """
    sys.stderr.write("Computing Processor%s is starting\n" % PPID)
    task_queue, result_queue = shared_resources

    gp = GlycoCTFormat()
    wp = WURCS20Format()

    motif_match_connected_nodes_cache = pygly.alignment.ConnectedNodesCache()
    mm1 = pygly.alignment.GlyTouCanMotif(
        connected_nodes_cache=motif_match_connected_nodes_cache)
    # mm2 = pygly.alignment.MotifAllowOptionalSub(connected_nodes_cache=motif_match_connected_nodes_cache)

    glycans = {}
    # `with` guarantees the structure file is closed, also on a parse error.
    with open(structure_list_file_path) as structure_file:
        for line in structure_file:
            acc, s = line.strip().split()
            glycans[acc] = wp.toGlycan(s)
    sys.stderr.write("Processor-%s: finishes loading %s glycans\n" %
                     (PPID, len(glycans)))

    while True:
        task_detail = task_queue.get(block=True)
        sys.stderr.write("Processor-%s: Job %s received.\n" %
                         (PPID, task_detail["id"]))

        seq = task_detail["seq"]
        jobid = task_detail["id"]
        # loose_root_match = task_detail["loose_root_match"]
        # additional_subst = task_detail["additional_subst"]
        motif_match_position = task_detail["motif_match_position"]

        motif_matcher = mm1
        # if loose_root_match:
        #     motif_matcher = mm3

        # fullstructure = False
        # Disabled options kept from the original:
        #   "notre"         -> anywhereExceptRoot = True
        #   "fullstructure" -> rootOnly = True; fullstructure = True
        rootOnly = (motif_match_position == "reo")
        anywhereExceptRoot = False

        matches = []
        error = []
        motif = None
        calculation_start_time = time.time()

        try:
            if "RES" in seq:
                motif = gp.toGlycan(seq)
            elif "WURCS" in seq:
                motif = wp.toGlycan(seq)
            else:
                raise RuntimeError
        except Exception:
            # Any parser failure is reported to the client; Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit stop the worker.
            error.append("Unable to parse")

        if len(error) == 0:
            motif_node_num = len(list(motif.all_nodes()))
            if motif_node_num > max_motif_size:
                error.append("Motif is too big")

        # TODO time out mechanism to avoid running for too long
        if error:
            # Reported once per task, whether or not any glycans are loaded.
            for e in error:
                sys.stderr.write(
                    "Processor-%s: Issues (%s) is found with task %s\n" %
                    (PPID, e, task_detail["id"]))
        else:
            for acc, glycan in glycans.items():
                # if fullstructure:
                #     if motif_node_num != len(list(glycan.all_nodes())):
                #         continue
                if motif_matcher.leq(motif,
                                     glycan,
                                     rootOnly=rootOnly,
                                     anywhereExceptRoot=anywhereExceptRoot):
                    matches.append(acc)

        calculation_end_time = time.time()
        calculation_time_cost = calculation_end_time - calculation_start_time

        res = {
            "id": jobid,
            "start time": calculation_start_time,
            "end time": calculation_end_time,
            "alignment calculation time": calculation_time_cost,
            "matches": matches,
            "error": error
        }
        sys.stderr.write("Processor-%s: Job %s finished within %ss\n" %
                         (PPID, task_detail["id"], calculation_time_cost))
        result_queue.put(res)