def initDatabase(dbFilename, iFilename, clobber=False):
    """Open the gene database, loading it from a tab-separated file if needed.

    The Gene table is declared and mapped onto the Gene class, a session is
    started on the SQLite file, and the table is populated from iFilename
    when clobber is true or dbFilename does not exist yet.

    @param dbFilename: Path of the SQLite database file
    @param iFilename: Tab-separated gene file with one header row; columns
        0, 5, 6, 7 and 8 are read as gene id, chromosome, start, end, strand
    @keyword clobber: Load genes even if dbFilename exists (default=False)
    @return: The session returned by Alchemy.startSession
    @raise KeyError: If the strand column is neither '1' nor '-1'
    """
    db = Alchemy()
    gene_table = Table("Gene", db.metadata,
        Column('id', Integer, primary_key=True),
        Column('geneId', Text),
        Column('chrom', Text, index=True),
        Column('start', Integer, index=True),
        Column('end', Integer, index=True),
        Column('strand', String(1))
    )
    mapper(Gene, gene_table)

    # Decide before the session is started: starting it may create the file.
    # NOTE(review): clobber does not remove an existing database file here;
    # whether old rows survive depends on Alchemy.startSession -- confirm.
    populateDb = clobber or not os.path.exists(dbFilename)
    session = db.startSession('sqlite:///%s' % dbFilename)
    if populateDb:
        strandTable = {'1': '+', '-1': '-'}
        i = 0  # keeps the final progress message valid for a header-only file
        with open(iFilename) as iFile:
            iFile.readline()  # skip the header row
            for i, line in enumerate(iFile):
                # Report every 1000th row (the old test was inverted and
                # reported every row except those).
                if i % 1000 == 0:
                    progressMessage('# genes %s', i)
                tokens = line.strip().split('\t')
                g = Gene(None, tokens[0], tokens[5], int(tokens[6]),
                         int(tokens[7]), strandTable[tokens[8]])
                session.save(g)
        progressMessage('# genes %s', i)
        session.commit()
    return session
def initDatabase(dbFilename, iFilename, clobber=False):
    """Open the gene database, loading it from a tab-separated file if needed.

    The Gene table is declared and mapped onto the Gene class, a session is
    started on the SQLite file, and the table is populated from iFilename
    when clobber is true or dbFilename does not exist yet.

    @param dbFilename: Path of the SQLite database file
    @param iFilename: Tab-separated gene file with one header row; columns
        0, 5, 6, 7 and 8 are read as gene id, chromosome, start, end, strand
    @keyword clobber: Load genes even if dbFilename exists (default=False)
    @return: The session returned by Alchemy.startSession
    @raise KeyError: If the strand column is neither "1" nor "-1"
    """
    db = Alchemy()
    gene_table = Table(
        "Gene",
        db.metadata,
        Column("id", Integer, primary_key=True),
        Column("geneId", Text),
        Column("chrom", Text, index=True),
        Column("start", Integer, index=True),
        Column("end", Integer, index=True),
        Column("strand", String(1)),
    )
    mapper(Gene, gene_table)

    # Decide before the session is started: starting it may create the file.
    # NOTE(review): clobber does not remove an existing database file here;
    # whether old rows survive depends on Alchemy.startSession -- confirm.
    populateDb = clobber or not os.path.exists(dbFilename)
    session = db.startSession("sqlite:///%s" % dbFilename)
    if populateDb:
        strandTable = {"1": "+", "-1": "-"}
        i = 0  # keeps the final progress message valid for a header-only file
        with open(iFilename) as iFile:
            iFile.readline()  # skip the header row
            for i, line in enumerate(iFile):
                # Report every 1000th row (the old test was inverted and
                # reported every row except those).
                if i % 1000 == 0:
                    progressMessage("# genes %s", i)
                tokens = line.strip().split("\t")
                g = Gene(
                    None,
                    tokens[0],
                    tokens[5],
                    int(tokens[6]),
                    int(tokens[7]),
                    strandTable[tokens[8]],
                )
                session.save(g)
        progressMessage("# genes %s", i)
        session.commit()
    return session
def build(self, clobber=False, separator='!'):
    """Build index file.

    Creates the Offsets table (id, accession, offset) with indexes on id and
    accession, writes one separator-delimited row per item yielded by
    self._byteOffsetGenerator() to a temporary file, and bulk-loads that
    file with the sqlite3 command-line tool.

    @keyword clobber: Overwrite existing index file (default=False)
    @keyword separator: Field separator used in the temporary import file
        (default='!'); presumably must not occur in any accession -- confirm
    @raise subprocess.CalledProcessError: If the sqlite3 import exits non-zero
    """
    # Local import: the file's import block is not visible from this method.
    import subprocess

    if clobber:
        # IF EXISTS makes the "table not there yet" case a no-op without
        # swallowing every other error the way a bare except did.
        self.cursor.execute('drop table if exists Offsets;')

    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; the import is assumed to run only when the table is rebuilt.
    if not self.isIndexed():
        schema = """
        CREATE TABLE Offsets (
            id INTEGER,
            accession TEXT,
            offset INTEGER
        );
        create index idx_offsets_id on Offsets (id);
        create index idx_offsets_accession on Offsets (accession);
        """
        self.connection.executescript(schema)

        tmpFile = tempfile.NamedTemporaryFile()
        try:
            nSeqs = 0  # stays 0 when the generator yields nothing
            for i, (accession, offset) in enumerate(self._byteOffsetGenerator()):
                tmpFile.write("%i%s%s%s%i\n"
                    % (i, separator, accession, separator, offset))
                if i % 1000 == 0:
                    progressMessage("Sequences: %s", i+1)
                nSeqs = i+1
            progressMessage("Sequences: %s\n", nSeqs)
            # The sqlite3 process reads the file by name, so flush first.
            tmpFile.flush()

            # Argument list instead of a shell string: file names containing
            # spaces or quotes can no longer break (or inject into) the command.
            subprocess.check_call([
                'sqlite3', '-separator', separator, self.idxFilename,
                '.import "%s" Offsets' % tmpFile.name,
            ])
        finally:
            # Closing a NamedTemporaryFile also deletes it.
            tmpFile.close()
def build(self, clobber=False, separator='!'):
    """Build index file.

    Creates the Offsets table (id, accession, offset) with indexes on id and
    accession, writes one separator-delimited row per item yielded by
    self._byteOffsetGenerator() to a temporary file, and bulk-loads that
    file with the sqlite3 command-line tool.

    @keyword clobber: Overwrite existing index file (default=False)
    @keyword separator: Field separator used in the temporary import file
        (default='!'); presumably must not occur in any accession -- confirm
    @raise subprocess.CalledProcessError: If the sqlite3 import exits non-zero
    """
    # Local import: the file's import block is not visible from this method.
    import subprocess

    if clobber:
        # IF EXISTS makes the "table not there yet" case a no-op without
        # swallowing every other error the way a bare except did.
        self.cursor.execute('drop table if exists Offsets;')

    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; the import is assumed to run only when the table is rebuilt.
    if not self.isIndexed():
        schema = """
        CREATE TABLE Offsets (
            id INTEGER,
            accession TEXT,
            offset INTEGER
        );
        create index idx_offsets_id on Offsets (id);
        create index idx_offsets_accession on Offsets (accession);
        """
        self.connection.executescript(schema)

        tmpFile = tempfile.NamedTemporaryFile()
        try:
            nSeqs = 0  # stays 0 when the generator yields nothing
            for i, (accession, offset) in enumerate(self._byteOffsetGenerator()):
                tmpFile.write(
                    "%i%s%s%s%i\n" % (i, separator, accession, separator, offset))
                if i % 1000 == 0:
                    progressMessage("Sequences: %s", i + 1)
                nSeqs = i + 1
            progressMessage("Sequences: %s\n", nSeqs)
            # The sqlite3 process reads the file by name, so flush first.
            tmpFile.flush()

            # Argument list instead of a shell string: file names containing
            # spaces or quotes can no longer break (or inject into) the command.
            subprocess.check_call([
                'sqlite3', '-separator', separator, self.idxFilename,
                '.import "%s" Offsets' % tmpFile.name,
            ])
        finally:
            # Closing a NamedTemporaryFile also deletes it.
            tmpFile.close()
#         contigData[chrom].add_interval(contig)
#     except KeyError:
#         contigData[chrom] = Intersecter()
#         contigData[chrom].add_interval(contig)

# Mark every contig lying within 500 bp of an annotated gene.
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source -- confirm against the original script.
print('Parse genes')
iFilename = '/Users/papenfuss/databases/platypus/ensembl/Release50/mart_names_locations.txt'
annotated = set()
with open(iFilename) as iFile:
    headers = iFile.readline()
    for i,line in enumerate(iFile):
        if (i % 1000)==0:
            progressMessage('# genes %s', i)
        tokens = line.strip().split('\t')
        geneId = tokens[0]
        transId = tokens[1]
        name = tokens[3]
        chrom = tokens[5]
        start = int(tokens[6])
        end = int(tokens[7])
        strand = {'1': '+', '-1': '-'}[tokens[8]]
        # Only a chromosome with no contigs is an expected miss; any other
        # failure in the interval search should surface instead of being
        # swallowed by a bare except.
        try:
            chromContigs = contigData[chrom]
        except KeyError:
            pass
        else:
            for contig in chromContigs.find(start-500, end+500):
                annotated.add(contig.value[0])
iFilename = os.path.join(iDir, 'mart_names_locations.txt') dbFilename = os.path.join(iDir, 'mart_names_locations.sqlite') # dbFilename = ':memory:' session = initDatabase(dbFilename, iFilename) # 2a. Parse read alignment file and has results # 2b. Write out unannotated reads flankSize = 1000 maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt' data = {} unannFile = open('unannotated.txt', 'w') multFile = open('multiple.txt', 'w') multFile2 = open('multiple_gene.txt', 'w') for i,m in enumerate(MaqViewFile(maqFilename)): if (i % 1000)==0: progressMessage('# maq alns %s', i) q = session.query(Gene).filter(Gene.chrom==m.chrom) \ .filter(Gene.start<m.start+flankSize) \ .filter(Gene.end>m.start+32-flankSize).all() if len(q)==0: print >> unannFile, m continue elif len(q)>1: x = set([r.geneId for r in q]) if len(x)>1: print >> multFile, m print >> multFile2, "%s\t%s" % (m.name, ','.join(x)) continue try:
dbFilename = os.path.join(iDir, "mart_names_locations.sqlite")
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)

# 2a. Parse read alignment file and hash results
# 2b. Write out unannotated reads
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source, and this window stops inside the loop body.
flankSize = 1000
maqFilename = "/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt"
data = {}
unannFile = open("unannotated.txt", "w")
multFile = open("multiple.txt", "w")
multFile2 = open("multiple_gene.txt", "w")
for i, m in enumerate(MaqViewFile(maqFilename)):
    if not i % 1000:
        progressMessage("# maq alns %s", i)

    # Genes on the read's chromosome whose span, padded by flankSize,
    # overlaps the read (the 32 added to m.start is taken from the original).
    q = session.query(Gene)
    q = q.filter(Gene.chrom == m.chrom)
    q = q.filter(Gene.start < m.start + flankSize)
    q = q.filter(Gene.end > m.start + 32 - flankSize)
    q = q.all()

    if not q:
        # No gene nearby: record the read as unannotated.
        print >> unannFile, m
        continue
    if len(q) > 1:
        # Several rows may still belong to a single gene id.
        x = set(r.geneId for r in q)
        if len(x) > 1:
            print >> multFile, m
hsps_table = createTable(tableName, metadata, h.attributes, h.converters,
    indexedAttributes=['subjectId', 'sStart', 'sEnd'])
mapper(HSP, hsps_table)

# Start a session & initialize database
session = createSession(dsn, metadata)

# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source; the in-loop commit is assumed to run once per 5000 records.
i = 0  # keeps the final progress message valid when the input is empty
if case in [1,2]:
    # Devil 454 reads
    # Context manager so the input file is closed even if a row fails.
    with open(iFilename) as iFile:
        for i,line in enumerate(iFile):
            tokens = line.strip().split('\t')
            h = HSP(tokens[0:-2])  # the last two columns are not passed on
            h.convertBlockToGenomeCoords()
            session.save(h)
            if (i % 5000)==0:
                progressMessage("# HSPs %s", i, n)
                session.commit()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
elif case==3:
    # Platypus 454 reads
    for i,h in enumerate(BlastFile(iFilename)):
        # Keep the second '|'-separated field of the subject id.
        h.subjectId = h.subjectId.split('|')[1]
        h.convertBlockToGenomeCoords()
        session.save(h)
        if (i % 5000)==0:
            progressMessage("# HSPs %s", i, n)
            session.commit()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
sys.exit(__doc__) iFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview_filtered.txt' # sys.argv[1] oFilename = '/Users/papenfuss/platy/venom/gbrowse/solexa250.gff' # sys.argv[2] windowSize = 250 iFile = open(iFilename) headers = iFile.readline().strip().split('\t') oFile = open(oFilename, 'w') chrom = None lastChrom = None countDict = {} for i, line in enumerate(iFile): if (i % 1000) == 0: progressMessage('# reads %s', i, 28000000) tokens = line.strip().split('\t') d = dict(zip(headers, tokens)) chrom = "%s" % d['chrom'] start = int(d['start']) # print chrom, start, lastChrom, (chrom!=lastChrom and len(countDict)!=0) if chrom != lastChrom and len(countDict) != 0: countData = countDict.items() countData.sort(key=lambda x: x[0]) for (_chrom, _wStart), _counts in countData: g = Feature(reference=_chrom, source='solexa250', type='tlevel', start=_wStart, end=_wStart + windowSize - 1,
#!/usr/bin/env python

"""
loadSolexa.py

Dump MAQ mapview alignments (mQ_cutoff=40) to a '|'-separated text file,
appending a running index to each row, then import that file into the
PlatySolexa table of alignedReads.db with the sqlite3 command-line tool.

Author: Tony Papenfuss
Date: Tue Jun 24 14:27:34 EST 2008

"""

import os, sys
from maq import *
from useful import progressMessage

oFilename = 'tmp/PlatySolexa.txt'
# The dump is only regenerated when it does not exist yet.
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source; the sqlite3 import below is assumed to run unconditionally.
if not os.path.exists(oFilename):
    dataDir = '/Users/papenfuss/databases/platypus/venom/solexa/'
    i = 0  # keeps the final progress message valid when no reads are yielded
    # Context manager so the dump is closed even if a read fails to parse.
    with open(oFilename, 'w') as oFile:
        for i, read in enumerate(
                MaqViewFile(os.path.join(dataDir, 'mapview.txt'), mQ_cutoff=40)):
            if (i % 1000) == 0:
                progressMessage("# maq %s", i, 28395347)
            tokens = str(read).split('\t')
            tokens.append(i)  # running index becomes the last column
            print >> oFile, "|".join([str(x) for x in tokens])
    progressMessage("# maq %s\n", i, 28395347)

os.system(
    """sqlite3 alignedReads.db '.import "tmp/PlatySolexa.txt" PlatySolexa'""")
import os, sys
from useful import progressMessage

# Filter a tab-separated maq mapview file (argv[1]) down to the hits with
# mapping quality >= mQ_cutoff and write selected columns, under a header
# row, to argv[2].
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source; progress is assumed to be reported for every 1000th input line.
iFilename = sys.argv[1]
oFilename = sys.argv[2]
mQ_cutoff = 40
nSeqs = 280000000  # total passed to progressMessage

headers = ['name','chrom','start','strand','mQ','numTied','score','numZeroMismatches']
format = '\t'.join(['%s','%s','%i','%s','%i','%i','%i','%i'])

# Context managers so both files are closed even if a line fails to parse
# (the input file used to be opened inline and never closed).
with open(oFilename, 'w') as oFile:
    print >> oFile, '\t'.join(headers)
    with open(iFilename) as iFile:
        for i,line in enumerate(iFile):
            tokens = line.strip().split('\t')
            mQ = int(tokens[7])
            if mQ>=mQ_cutoff:
                name = tokens[0]
                chrom = tokens[1]
                start = int(tokens[2])
                strand = tokens[3]
                numTied = int(tokens[10])
                score = int(tokens[11])
                numZeroMismatches = int(tokens[12])
                print >> oFile, format % (name,chrom,start,strand,mQ,numTied,score,numZeroMismatches)
            if (i % 1000)==0:
                progressMessage('# maq hits %s', i, nSeqs)
h.converters, indexedAttributes=['subjectId', 'sStart', 'sEnd']) mapper(HSP, hsps_table) # Start a session & initialize database session = createSession(dsn, metadata) if case in [1, 2]: # Devil 454 reads for i, line in enumerate(open(iFilename)): tokens = line.strip().split('\t') h = HSP(tokens[0:-2]) h.convertBlockToGenomeCoords() session.save(h) if (i % 5000) == 0: progressMessage("# HSPs %s", i, n) session.commit() progressMessage("# HSPs %s\n", i, n) session.commit() elif case == 3: # Platypus 454 reads for i, h in enumerate(BlastFile(iFilename)): h.subjectId = h.subjectId.split('|')[1] h.convertBlockToGenomeCoords() session.save(h) if (i % 5000) == 0: progressMessage("# HSPs %s", i, n) session.commit() progressMessage("# HSPs %s\n", i, n) session.commit()
"""
loadSolexa.py

Dump MAQ mapview alignments (mQ_cutoff=40) to a '|'-separated text file,
appending a running index to each row, then import that file into the
PlatySolexa table of alignedReads.db with the sqlite3 command-line tool.

Author: Tony Papenfuss
Date: Tue Jun 24 14:27:34 EST 2008

"""

import os, sys
from maq import *
from useful import progressMessage


oFilename = 'tmp/PlatySolexa.txt'
# The dump is only regenerated when it does not exist yet.
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source; the sqlite3 import below is assumed to run unconditionally.
if not os.path.exists(oFilename):
    dataDir = '/Users/papenfuss/databases/platypus/venom/solexa/'
    i = 0  # keeps the final progress message valid when no reads are yielded
    # Context manager so the dump is closed even if a read fails to parse.
    with open(oFilename, 'w') as oFile:
        for i,read in enumerate(MaqViewFile(os.path.join(dataDir, 'mapview.txt'), mQ_cutoff=40)):
            if (i % 1000)==0:
                progressMessage("# maq %s", i, 28395347)
            tokens = str(read).split('\t')
            tokens.append(i)  # running index becomes the last column
            print >> oFile, "|".join([str(x) for x in tokens])
    progressMessage("# maq %s\n", i, 28395347)

os.system("""sqlite3 alignedReads.db '.import "tmp/PlatySolexa.txt" PlatySolexa'""")
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)

# 2a. Parse read alignment file and hash results
# 2b. Write out unannotated reads
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source, and this window stops inside the loop body.
flankSize = 1000
maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt'
data = {}
unannFile = open('test_unannotated.txt', 'w')
multFile = open('test_multiple.txt', 'w')
multFile2 = open('test_multiple_gene.txt', 'w')
for i,m in enumerate(MaqViewFile(maqFilename)):
    # Test run: stop after the first 10000 alignments.
    if i==10000:
        break
    if not i % 1000:
        progressMessage('# maq alns %s', i)

    # Genes on the read's chromosome whose span, padded by flankSize,
    # overlaps the read (the 32 added to m.start is taken from the original).
    q = session.query(Gene)
    q = q.filter(Gene.chrom==m.chrom)
    q = q.filter(Gene.start<m.start+flankSize)
    q = q.filter(Gene.end>m.start+32-flankSize)
    q = q.all()

    if not q:
        # No gene nearby: record the read as unannotated.
        print >> unannFile, m
        continue
    if len(q)>1:
        # Several rows may still belong to a single gene id; only reads
        # hitting more than one distinct gene are set aside.
        x = set(r.geneId for r in q)
        if len(x)>1:
            print >> multFile, m
            print >> multFile2, "%s\t%s" % (m.name, ','.join(x))
            continue
# Per-chromosome tiling of read counts in windows of tileSize bases.
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source, and this window stops inside the inner loop body.
tileSize = 35
chrSizeFilename = '/Users/papenfuss/databases/chromSizes/ornAna5.txt'
chrSizes = loadChrSizes(chrSizeFilename)

iFile = open(iFilename)
headers = iFile.readline().strip().split('\t')
oFile = open(oFilename, 'w')
format = "%s\t%i\t%i"

chrom = None
lastChrom = None
countDict = {}
for i,line in enumerate(iFile):
    if not i % 1000:
        progressMessage('# reads %s', i, 28000000)

    tokens = line.strip().split('\t')
    d = dict(zip(headers, tokens))

    chrom = d['chrom']
    if chrom=='MT':
        continue
    # Names containing 'Ultra' or 'Contig' are kept as they are; every other
    # name gets a 'chr' prefix.
    if not ('Ultra' in chrom or 'Contig' in chrom):
        chrom = 'chr%s' % chrom
    start = int(d['start'])

    # Chromosome changed and counts are pending: walk the previous
    # chromosome tile by tile.
    if countDict and chrom!=lastChrom:
        print(chrom)
        for _wStart in xrange(1, chrSizes[lastChrom], tileSize):
            counts = countDict.get((lastChrom, _wStart), 0)
dbFilename = os.path.join(iDir, 'mart_names_locations.sqlite')
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)

# 2a. Parse read alignment file and hash results
# 2b. Write out unannotated reads
# NOTE(review): block nesting was reconstructed from whitespace-collapsed
# source, and this window stops inside the loop body.
flankSize = 1000
maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt'
data = {}
unannFile = open('test_unannotated.txt', 'w')
multFile = open('test_multiple.txt', 'w')
multFile2 = open('test_multiple_gene.txt', 'w')
for i, m in enumerate(MaqViewFile(maqFilename)):
    # Test run: stop after the first 10000 alignments.
    if i == 10000:
        break
    if not i % 1000:
        progressMessage('# maq alns %s', i)

    # Genes on the read's chromosome whose span, padded by flankSize,
    # overlaps the read (the 32 added to m.start is taken from the original).
    q = session.query(Gene)
    q = q.filter(Gene.chrom==m.chrom)
    q = q.filter(Gene.start<m.start+flankSize)
    q = q.filter(Gene.end>m.start+32-flankSize)
    q = q.all()

    if not q:
        # No gene nearby: record the read as unannotated.
        print >> unannFile, m
        continue
    if len(q) > 1:
        # Several rows may still belong to a single gene id; only reads
        # hitting more than one distinct gene are set aside.
        x = set(r.geneId for r in q)
        if len(x) > 1:
            print >> multFile, m
            print >> multFile2, "%s\t%s" % (m.name, ','.join(x))
            continue