def ReadTSV(filename):
    """Read an SNV table from ``filename``.

    The table reader is selected from the case-insensitive file extension
    (``csv``, ``tsv``, ``xls``, ``xlsx`` or ``txt``).  The table must contain
    the columns CHROM, POS, REF and ALT, and they must be its first four
    columns, in that order.

    Args:
        filename: Path of the SNV file; its extension picks the reader.

    Returns:
        A 3-tuple ``(header_lines, chrom, snvdata)`` where ``header_lines`` is
        a one-element list holding the tab-joined table headers, ``chrom`` is
        the set of values seen in the first column, and ``snvdata`` is a list
        of rows, each a list of values in header order.

    Raises:
        RuntimeError: If the filename has no extension or an unsupported one,
            if a required header is missing, or if the required headers are
            not the leading columns.
    """
    snvheaders = ["CHROM", "POS", "REF", "ALT"]

    # rpartition never fails to unpack, so a name without any '.' is reported
    # with the same RuntimeError as any other unsupported extension instead
    # of leaking a bare ValueError.
    _base, dot, extn = filename.rpartition('.')
    if not dot:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)
    extn = extn.lower()

    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        # Plain-text tables are handed the expected column names explicitly.
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    headers = snvs.headers()
    for h in snvheaders:
        if h not in headers:
            raise RuntimeError("Required header: %s missing from SNV file %s" % (h, filename))

    # Rows are consumed positionally below (column 0 is the chromosome), so
    # the required columns must lead the table.  This is raised explicitly
    # rather than asserted so the check survives `python -O`.
    if headers[:4] != snvheaders:
        raise RuntimeError("Required headers %s must be the first columns of SNV file %s"
                           % (" ".join(snvheaders), filename))

    chrom = set()
    snvdata = []
    for r in snvs:
        ri = [r.get(h) for h in headers]
        chrom.add(ri[0])
        snvdata.append(ri)

    return ["\t".join(headers)], chrom, snvdata
progress.stage("Read SNV data", len(opt.snvs)) snvheaders = filter(None, """ CHROM POS REF ALT """.split()) snvdata = {} # extrasnvheaders = [] # usedsnvheaders = set() snvchroms = defaultdict(set) for filename in opt.snvs: base, extn = filename.rsplit('.', 1) extn = extn.lower() if extn == 'csv': snvs = CSVFileTable(filename=filename) elif extn == 'vcf': snvs = VCFFile(filename=filename) elif extn == 'tsv': snvs = TSVFileTable(filename=filename) elif extn == 'xls': snvs = XLSFileTable(filename=filename) elif extn == 'xlsx': snvs = XLSXFileTable(filename=filename) elif extn == 'txt': snvs = TXTFileTable(filename=filename, headers=snvheaders) else: raise RuntimeError("Unexpected SNV file extension: %s" % filename) for h in snvheaders: if h not in snvs.headers():
#!/bin/env python27
# Gather GlycoMotif cross-references from the CSV table named by the first
# command-line argument, then visit every wiki page whose name starts "GGM.".

import sys, traceback, re
from collections import defaultdict

from getwiki import GlycoMotifWiki, GlyGenMotif

w = GlycoMotifWiki()

# Motif id -> set of (glycan dictionary accession, main entry term) pairs.
motif2gd = defaultdict(set)

from dataset import CSVFileTable

# A well-formed motif id is "GGM." followed by exactly six digits.
GGM_ID_PATTERN = re.compile(r'^GGM\.\d{6}$')

for r in CSVFileTable(sys.argv[1]):
    entry = r['term (main_entry)'].strip()
    xrefs = r['term_xref']
    gdacc = r['glycan_dictionary_accession']
    allmid = set()  # not used anywhere in the visible part of the script

    # term_xref is a '|'-separated list of "source:id" cross-references.
    for xr in xrefs.split('|'):
        if not xr:
            # Skip empty pieces produced by leading/trailing/double '|'.
            continue
        if ':' not in xr:
            # Not of the form "source:id"; nothing to record.
            continue
        src, mid = xr.split(':', 1)
        if src.lower() != "glycomotif":
            continue
        if GGM_ID_PATTERN.search(mid) is None:
            print("Bad motif id: %s (%s)" % (mid, entry))
            continue
        motif2gd[mid].add((gdacc, entry))

for mid in w.site.allpages(prefix='GGM.', generator=False):
    # print mid
    m = w.get(mid)
    # Indexing the defaultdict yields an empty set for motifs with no xrefs.
    entries = motif2gd[mid]