def get_annot_db(self, table, primaryKey='name',
                 sliceAttrDict=dict(id='chrom', start='chromStart',
                                    stop='chromEnd')):
    '''Return an AnnotationDB for an arbitrary UCSC annotation table.

    table is the UCSC table name, e.g. snp130.  If the table does not use
    the standard name / start / end columns, pass the actual column names
    through primaryKey and sliceAttrDict.

    The resulting db is stored on this object under the attribute named
    by table, and that attribute is returned directly on later calls.
    '''
    try:
        cached_db = getattr(self, table)
    except AttributeError:
        pass  # nothing cached under this name yet; build it below
    else:
        return cached_db

    qualified_table = self.ucsc_db + '.' + table
    row_table = sqlgraph.SQLTable(qualified_table,
                                  primaryKey=primaryKey,
                                  serverInfo=self.ucsc_server,
                                  itemClass=UCSCSeqIntervalRow)
    annot_db = annotation.AnnotationDB(row_table, self.genome_seq,
                                       checkFirstID=False,
                                       sliceAttrDict=sliceAttrDict)
    # remember the db on the attribute named after the table
    setattr(self, table, annot_db)
    return annot_db
def test_slice_descr(self):
    """Check str() of a translation annotation and of a slice's sequence.

    Annotates the interval 0..12 of ``self.FLIM``, expects the annotation
    to print as 'FLIM', and expects the ``sequence`` attribute of its
    [1:3] slice to print as 'CTAATT'.
    """
    slice_columns = {'id': 0, 'start': 1, 'stop': 2}
    translations = annotation.AnnotationDB(
        {}, self.db,
        itemClass=annotation.TranslationAnnot,
        itemSliceClass=annotation.TranslationAnnotSlice,
        sliceAttrDict=slice_columns)

    flim_interval = (self.FLIM.id, 0, 12)
    peptide = translations.new_annotation('bar', flim_interval)
    assert str(peptide) == 'FLIM'

    residues = peptide[1:3]
    assert str(residues.sequence) == 'CTAATT'
def makeResourceFromBed(fileLines, genome, docstring='Temp Resource From BED',
                        dataPath='memory'):
    '''Generate a sqlite table, annotDB, and NLMSA from the given bed lines.

    Returns the tuple (dataTable, annotDB, annotMap).  The docstring
    argument is accepted but not used by this function.
    '''
    bedDict = makeDictFromBed(readBedLines(fileLines))
    tableName = os.path.basename(dataPath)
    # SQLite spells its in-memory database ':memory:'
    if dataPath == 'memory':
        sqlDataPath = ':memory:'
    else:
        sqlDataPath = dataPath
    dataTable = convertDictToSQLite(bedDict, tableName, sqlDataPath)
    # defaultSliceAttrs holds Python source text; it is evaluated here
    sliceAttrs = eval(defaultSliceAttrs)
    annotDB = annotation.AnnotationDB(dataTable, genome,
                                      sliceAttrDict=sliceAttrs)
    annotMap = makeNLMSA([annotDB], dataPath)
    return dataTable, annotDB, annotMap
def bed2pygr(dbprefix, referencefile, bedfile, indir):
    """Build an on-disk pairwise NLMSA from a BED file.

    dbprefix: path prefix of the NLMSA; also used for the '.genome' file.
    referencefile: sequence file opened with seqdb.SequenceFileDB.
    bedfile: BED input handed to load_bed().
    indir: accepted but not used by this function.

    Besides the NLMSA, writes '<dbprefix>.genome' containing the base name
    of referencefile with its last extension removed.  Returns None.
    """
    collision_counter = defaultdict(int)
    chrdb = seqdb.SequenceFileDB(referencefile)
    annodb = annotation.AnnotationDB({}, chrdb)
    al = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)
    load_bed(al, annodb, bedfile, collision_counter)
    al.build(saveSeqDict=True)

    genomeprefix = os.path.basename(referencefile).rsplit('.', 1)[0]
    # Close the file explicitly so the name is flushed to disk right away
    # instead of whenever the anonymous file object gets collected.
    with open(dbprefix + '.genome', 'w') as genome_file:
        genome_file.write(genomeprefix + '\n')
def test_translation_db(self):
    """Annotations added to a translation db can be fetched back by key.

    Adds 'foo' over ``self.M`` (0..3) and 'bar' over ``self.FLIM`` (0..12),
    looking each one up by key right after it is added and comparing its
    str() with the expected text.
    """
    slice_columns = {'id': 0, 'start': 1, 'stop': 2}
    translations = annotation.AnnotationDB(
        {}, self.db,
        itemClass=annotation.TranslationAnnot,
        itemSliceClass=annotation.TranslationAnnotSlice,
        sliceAttrDict=slice_columns)

    cases = (
        ('foo', self.M, 3, 'M'),
        ('bar', self.FLIM, 12, 'FLIM'),
    )
    for key, seq, stop, expected in cases:
        translations.new_annotation(key, (seq.id, 0, stop))
        fetched = translations[key]
        assert str(fetched) == expected
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''construct annotation DB for gene CDS intervals.
    NB: this assumes each gene consists of ONE interval.
    This cannot be used for multi-exon genes!

    gbfile: path of the GenBank file to parse.
    fastafile: path of the matching FASTA file.  When omitted it is
      derived from gbfile: same directory, file name cut at its first
      '.', plus '.fna'.
    featureType: only features whose type equals this are annotated.
    geneQualifier: qualifier whose first value becomes the annotation ID.

    Returns (annodb, al, genome): the AnnotationDB, an in-memory pairwise
    NLMSA holding every annotation, and the SequenceFileDB.
    '''
    import os  # local import: only needed for the default FASTA path

    try:
        gbparse = SeqIO.parse(gbfile, 'genbank')
    except TypeError:  # SeqIO changed its interface?
        # Retry with an open handle; parse fully before it is closed.
        with open(gbfile) as ifile:
            gbseqs = list(SeqIO.parse(ifile, 'genbank'))
    else:
        gbseqs = list(gbparse)

    if fastafile is None:
        # Cut only the file name at its first dot.  Splitting the whole
        # path would truncate at dots in the directory part, e.g.
        # './x.gbk' would become '.fna'.
        dirname, basename = os.path.split(gbfile)
        fastafile = os.path.join(dirname, basename.split('.')[0] + '.fna')

    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome)  # handle NCBI ID blobs properly
    annodb = annotation.AnnotationDB({}, genome,
                                     sliceAttrDict=dict(id=0, start=1, stop=2,
                                                        orientation=3))
    unlabeled = 0  # counts features that had to be given a made-up name
    for record in gbseqs:
        # find the right seq and get its actual ID
        seqID = genomeIndex[record.id].id
        for feature in record.features:
            if feature.type != featureType:
                continue
            try:
                name = feature.qualifiers[geneQualifier][0]
            except KeyError:  # keep the annotation even if label missing
                warnings.warn('Missing gene qualifier "%s" on %s annotation'
                              % (geneQualifier, featureType))
                name = 'unlabeled_%s_%d' % (featureType, unlabeled)
                unlabeled += 1
            annodb.new_annotation(name,
                                  (seqID, feature.location.start.position,
                                   feature.location.end.position,
                                   feature.strand))

    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for annot in annodb.itervalues():
        al.addAnnotation(annot)
    al.build()
    return annodb, al, genome
def test_negative_frames(self):
    """Reverse-orientation translation annotations report text and frame.

    Each case annotates an interval of ``self.FLIM`` with orientation -1
    and checks the annotation's str() and its ``frame`` attribute.
    """
    slice_columns = {'id': 0, 'start': 1, 'stop': 2, 'orientation': 3}
    translations = annotation.AnnotationDB(
        {}, self.db,
        itemClass=annotation.TranslationAnnot,
        itemSliceClass=annotation.TranslationAnnotSlice,
        sliceAttrDict=slice_columns)

    # (key, start, stop, expected str(), expected frame)
    cases = (
        ('f1', 0, 12, 'HN*K', -2),
        ('f2', 1, 10, '*LE', -1),
        ('f3', 2, 11, 'IIR', -3),
    )
    for key, start, stop, expected_text, expected_frame in cases:
        annot = translations.new_annotation(
            key, (self.FLIM.id, start, stop, -1))
        assert str(annot) == expected_text
        assert annot.frame == expected_frame
def test_positive_frames(self):
    """Forward translation annotations report the expected text and frame.

    Each case annotates an interval of ``self.FLIM`` (no orientation
    given) and checks the annotation's str() and its ``frame`` attribute.
    """
    slice_columns = {'id': 0, 'start': 1, 'stop': 2}
    translations = annotation.AnnotationDB(
        {}, self.db,
        itemClass=annotation.TranslationAnnot,
        itemSliceClass=annotation.TranslationAnnotSlice,
        sliceAttrDict=slice_columns)

    # (key, start, stop, expected str(), expected frame)
    cases = (
        ('f1', 0, 12, 'FLIM', +1),
        ('f2', 1, 10, 'F*L', +2),
        ('f3', 2, 11, 'SNY', +3),
    )
    for key, start, stop, expected_text, expected_frame in cases:
        annot = translations.new_annotation(key, (self.FLIM.id, start, stop))
        assert str(annot) == expected_text
        assert annot.frame == expected_frame
def read_exon_annots(genome, genesFile='knownGene.txt'):
    '''Read a multi-exon transcript set; build the exon annotation db and
    the exon-to-gene mapping.

    Returns (annodb, al, exonGene, totalSize, geneLengths), where exonGene
    maps each integer annotation ID to its gene ID and totalSize is the
    sum of the values in geneLengths.
    '''
    exonDict, genes, trLen = read_known_genes(genesFile)
    geneLengths = get_gene_maxlengths(genes, trLen)
    totalSize = sum(geneLengths.values())

    exon_columns = dict(id=0, orientation=1, start=2, stop=3)
    annodb = annotation.AnnotationDB({}, genome, sliceAttrDict=exon_columns)
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True,
                           maxlen=1000000000)

    exonGene = {}
    # annotation IDs are consecutive integers in iteration order
    for exon_id, (interval, gene_id) in enumerate(exonDict.iteritems()):
        exon_annot = annodb.new_annotation(exon_id, interval)
        exonGene[exon_id] = gene_id
        al.addAnnotation(exon_annot)
    al.build()
    return annodb, al, exonGene, totalSize, geneLengths
def __init__(self, O):
    """Set up genome sequence and ensGene transcript annotations for O.

    O must provide ``genome_name``, the attribute path below
    ``worldbase.Bio.Seq.Genome`` to look under.  This constructor sets
    ``O.genome_build`` to the last name listed by dir() for that entry.

    Sets ``self.species`` (O itself), ``self.chromosome`` (the worldbase
    resource for that build, requested with download=True) and
    ``self.annodb`` (an AnnotationDB over the UCSC ``<build>.ensGene``
    table keyed by 'name', sliced by chrom / txStart / txEnd).
    Prints every sequence ID with its length as a side effect.
    """
    self.species = O
    # NOTE(review): results are discarded; presumably these calls make
    # worldbase load this part of its resource tree -- confirm.
    dir(worldbase.Bio.Seq)
    dir(worldbase.Bio.Seq.Genome)

    # Walk the attribute path with getattr instead of eval()-ing a
    # concatenated string: same lookup, but text in genome_name can no
    # longer run as arbitrary code.  Dotted names are still supported.
    species_node = worldbase.Bio.Seq.Genome
    for part in O.genome_name.split('.'):
        species_node = getattr(species_node, part)
    Genome = dir(species_node)[-1]  # dir() is sorted; take the last name
    O.genome_build = Genome

    serverInfo = sqlgraph.DBServerInfo(host='genome-mysql.cse.ucsc.edu',
                                       user='******')
    txInfo = sqlgraph.SQLTable(O.genome_build + '.ensGene',
                               serverInfo=serverInfo,
                               itemClass=UCSCSeqIntervalRow,
                               primaryKey='name')
    self.chromosome = getattr(species_node, Genome)(download=True)
    for i in self.chromosome:
        print('%s %s' % (i, len(self.chromosome[i])))
    self.annodb = annotation.AnnotationDB(
        txInfo, self.chromosome,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'))
def bedToNLMSA(bedlines, genome,
               field_locations=dict(id=0, start=1, stop=2, name=3, score=4,
                                    orientation=-1)):
    """Build an AnnotationDB and an in-memory NLMSA from BED lines.

    bedlines: iterable of tab-separated BED lines.
    genome: sequence database the annotations refer to.
    field_locations: maps attribute names to column indexes.  The computed
      orientation is appended to every row as its last column, which is
      what the default orientation=-1 points at.

    Blank lines are ignored.  A row whose annotation raises KeyError is
    reported on stdout and skipped.  Returns (annotDB, nlmsa).
    """
    annotDB = annotation.AnnotationDB(None, genome, verbose=False,
                                      sliceAttrDict=field_locations)
    nlmsa = cnestedlist.NLMSA('tmp_bed', mode='memory', pairwiseMode=True,
                              bidirectional=False)
    index = 0
    for line in bedlines:
        stripped = line.strip()
        if not stripped:  # also skips lines holding only a newline
            continue
        fields = stripped.split('\t')
        # NOTE(review): any strand value other than '+', including '.',
        # is treated as reverse -- confirm that is intended.
        orientation = 1 if len(fields) < 6 or fields[5] == '+' else -1
        try:
            curAnnot = annotDB.new_annotation(index, fields + [orientation])
            nlmsa.addAnnotation(curAnnot)
            index += 1
        except KeyError as e:
            # Report the row's own ID column.  The previous handler used
            # an undefined name here and died with NameError instead of
            # skipping the row.
            print(('Skipping row without matching chromosome: %s,' +
                   'message: %s') % (fields[field_locations['id']], e))
    nlmsa.build()
    return annotDB, nlmsa
def __init__(self, ucsc_genome_name, ens_species=None,
             ucsc_serverInfo=None, ens_serverInfo=None,
             ens_db=None, trackVersion='hgFixed.trackVersion'):
    '''Construct interfaces to UCSC/Ensembl annotation databases.

    ucsc_genome_name must be a worldbase ID specifying a UCSC genome;
    its last dot-separated component is used as the UCSC database name.
    ens_species should be the Ensembl database name (generally
    the name of the species).  If not specified, we will try to
    autodetect it based on ucsc_genome_name.
    ens_db, if given, is used as the Ensembl database name directly and
    no autodetection takes place.
    The interface uses the standard UCSC and Ensembl mysql servers by
    default, unless you provide serverInfo argument(s).  Each serverInfo
    argument may be either a server-info object or a string, which is
    treated as a worldbase ID.
    trackVersion must be the fully qualified MySQL table name
    of the trackVersion table containing information about the
    Ensembl version that each genome dataset connects to.'''
    # Connect to both servers and prepare database names.
    # NOTE(review): isinstance(..., str) does not match unicode objects on
    # Python 2, so a unicode worldbase ID would be used as-is.
    if ucsc_serverInfo is not None:
        if isinstance(ucsc_serverInfo, str):  # treat as worldbase ID
            self.ucsc_server = worldbase(ucsc_serverInfo)
        else:
            self.ucsc_server = ucsc_serverInfo
    else:
        self.ucsc_server = sqlgraph.DBServerInfo(
            host='genome-mysql.cse.ucsc.edu', user='******')
    if ens_serverInfo is not None:
        if isinstance(ens_serverInfo, str):  # treat as worldbase ID
            self.ens_server = worldbase(ens_serverInfo)
        else:
            self.ens_server = ens_serverInfo
    else:
        self.ens_server = sqlgraph.DBServerInfo(
            host='ensembldb.ensembl.org', port=5306, user='******')
    self.ucsc_db = ucsc_genome_name.split('.')[-1]
    if ens_db is None:  # auto-set ensembl database name
        self.ens_db = self.get_ensembl_db_name(ens_species,
                                               trackVersion)
    else:
        self.ens_db = ens_db
    # Connect to all the necessary tables.
    # ensGene keyed by 'name': used below as the transcript slice table.
    self.ucsc_ensGene_trans = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name', itemClass=UCSCSeqIntervalRow)
    # Same table keyed by 'name2' (non-unique): used below as the gene
    # slice table.
    self.ucsc_ensGene_gene = sqlgraph.SQLTable(
        '%s.ensGene' % self.ucsc_db, serverInfo=self.ucsc_server,
        primaryKey='name2', allowNonUniqueID=True,
        itemClass=UCSCSeqIntervalRow,
        attrAlias=dict(minTxStart='min(txStart)',
                       maxTxEnd='max(txEnd)'))
    self.ucsc_ensGtp_gene = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                              serverInfo=self.ucsc_server,
                                              primaryKey='gene',
                                              allowNonUniqueID=True)
    self.prot_db = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                     serverInfo=self.ucsc_server,
                                     primaryKey='protein',
                                     itemClass=EnsemblProteinRow)
    # Give the protein table a back-reference to this object.
    self.prot_db.gRes = self
    self.ucsc_ensPep = sqlgraph.SQLTable(
        '%s.ensPep' % self.ucsc_db, serverInfo=self.ucsc_server,
        itemClass=sqlgraph.ProteinSQLSequenceCached,
        itemSliceClass=seqdb.SeqDBSlice)
    self.ens_exon_stable_id = sqlgraph.SQLTable('%s.exon_stable_id' %
                                                self.ens_db,
                                                serverInfo=self.ens_server,
                                                primaryKey='stable_id')
    self.ens_transcript_stable_id = sqlgraph.SQLTable(
        '%s.transcript_stable_id' % self.ens_db,
        serverInfo=self.ens_server, primaryKey='stable_id')
    # We will need this too.
    self.genome_seq = worldbase(ucsc_genome_name)
    # Finally, initialise all UCSC-Ensembl databases.
    self.trans_db = annotation.AnnotationDB(
        self.ucsc_ensGene_trans, self.genome_seq, checkFirstID=False,
        sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'),
        itemClass=EnsemblTranscriptAnnotationSeq)
    self.gene_db = annotation.AnnotationDB(self.ucsc_ensGene_gene,
                                           self.genome_seq,
                                           checkFirstID=False,
                                           sliceAttrDict=dict(
                                               id='chrom',
                                               start='txStart',
                                               stop='txEnd'))
    # The exon slice source is built from this object itself, so it is
    # created only after the attributes above have been assigned.
    # NOTE(review): which of them it actually reads is not visible here.
    exon_slicedb = EnsemblExonOnDemandSliceDB(self)
    self.exon_db = annotation.AnnotationDB(exon_slicedb,
                                           self.genome_seq,
                                           checkFirstID=False,
                                           sliceAttrDict=dict(
                                               id=0, start=1, stop=2,
                                               orientation=3))
    # Mappings.
    # protein ID -> transcript, with the inverse transcript -> protein,
    # both through the UCSC ensGtp table.
    self.protein_transcript_id_map = sqlgraph.MapView(
        self.prot_db, self.trans_db,
        'select transcript from %s.ensGtp \
where protein=%%s' % self.ucsc_db,
        inverseSQL='select protein \
from %s.ensGtp where transcript=%%s' % self.ucsc_db,
        serverInfo=self.ucsc_server)
    # gene -> transcripts, with the inverse transcript -> gene.
    self.transcripts_in_genes_map = sqlgraph.GraphView(
        self.gene_db, self.trans_db,
        "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
        inverseSQL="select gene from %s.ensGtp where transcript=%%s" %
        self.ucsc_db, serverInfo=self.ucsc_server)
    # exon stable ID -> transcript stable IDs, queried on the Ensembl
    # server; sources are exon annotations.
    self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
        self.exon_db, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    # Same query, but sources are rows of the exon_stable_id table.
    self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
        self.ens_exon_stable_id, self.trans_db, """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    # transcript stable ID -> exon stable IDs ordered by et.rank; targets
    # are exon annotations.
    self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
        self.trans_db, self.exon_db, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
        serverInfo=self.ens_server)
    # Same query, but targets are rows of the exon_stable_id table.
    self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
        self.trans_db, self.ens_exon_stable_id, """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db,
              self.ens_db, self.ens_db), serverInfo=self.ens_server)
    # Expose the transcript -> exon-stable-id map on the transcript db.
    self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
def main():
    """ Load the given csv file into an sqlite table, saving an annotationDB
    and an NLMSA version of the original file
    """
    # The docstring above is appended to the usage message, so further
    # documentation lives in comments instead.
    #
    # Flow: parse options -> open the tab-separated input -> load it into
    # a table -> wrap it in an AnnotationDB and NLMSA -> register both in
    # worldbase (as save_resource and save_resource + '_db') and commit.
    # Exits with status -1 if no input file, or either of the
    # save_resource / table_name options, is missing.
    parser = optparse.OptionParser("%prog [options] infile.csv\n"+main.__doc__)
    parser.add_option("--datapath", '-p', dest="datapath", type="string",
                      default='/home/shared/pygrdata/annotations/HUMAN/hg18',
                      help="""Sets the datafile path. Default=%default""")
    parser.add_option("--table_name", '-t', dest="table_name", type="string",
                      help="""The resource table's name and data stem, e.g.,
                      refGene => datapath/refGene.sqlite """)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      default='hg18',
                      help="""The pygr resource for the genome, default=%default""")
    parser.add_option("--save_resource", '-r', dest="save_resource",
                      type="string",
                      help="""Where to save the created annotationDB and NLMSA.
                      eg, Bio.Annotation.HUMAN.hg18.MotifMap.M0001""")
    parser.add_option("--bind_attribute", '-b', dest="bind_attribute",
                      type="string",
                      help="""The attribute to access annotationDB from genome
                      region, eg, 'officialGenes' would be accessible via
                      triCas3['ChLG2'][100:200].officialGenes Default is not
                      to bind an attribute to genome""")
    parser.add_option("--slice_attrs", '-s', dest="slice_attrs", type="string",
                      default='dict(id="chromosome", start="start", '
                              'stop="stop", orientation="orientation")',
                      help="""dictionary providing aliases in csv file for id, start, stop, etc.
default=%default'""")
    parser.add_option("--bed_format", dest="bed_format", action='store_true',
                      help="""csv file is in BED file format, without headers.""")

    opts, args = parser.parse_args()
    if len(args) < 1:
        parser.print_help()
        print('Please specify at least one csv file to read')
        sys.exit(-1)
    if None in [opts.save_resource, opts.table_name]:
        parser.print_help()
        print('Required options: save_resource, table_name')
        sys.exit(-1)

    # Keep the real file handle in its own variable: in BED mode the row
    # source is wrapped in a filter, and the handle must still be closed.
    infile = open(args[0])
    try:
        if not opts.bed_format:
            reader = csv.DictReader(infile, delimiter='\t')
        else:
            fileIn = itertools.ifilter(bedCommentFilter, infile)
            reader = csv.DictReader(fileIn, delimiter='\t',
                                    fieldnames=['chromosome', 'start', 'stop'],
                                    restkey='junkData')
        fieldnames = reader.fieldnames
        print(fieldnames)

        print('# Loading genome %s' % opts.genome_resource)
        genome = getGenome(opts.genome_resource)
        # SQL interprets . as membership
        opts.table_name = opts.table_name.replace('.', '_')
        # NOTE(review): tablePath is only printed; it is not passed to
        # convertBedToSQLite -- confirm the table is written there.
        tablePath = os.path.join(opts.datapath, opts.table_name + '.sqlite')
        print('# Creating sqlite table for %s at %s' % (opts.table_name,
                                                        tablePath))
        dataTable = convertBedToSQLite(reader, opts.table_name,
                                       fieldNames=fieldnames)

        print('# Making AnnotationDB and NLMSA...')
        # SECURITY NOTE: slice_attrs comes from the command line and is
        # eval()-ed as Python code.  Left as is because the default value
        # is a dict(...) call, which ast.literal_eval cannot parse; do not
        # expose this option to untrusted callers.
        annotDB = annotation.AnnotationDB(dataTable, genome,
                                          annotationType=opts.table_name+':',
                                          sliceAttrDict=eval(opts.slice_attrs))
        annotDB.__doc__ = 'AnnotationDB for %s on %s' % (opts.table_name,
                                                         opts.genome_resource)
        msaName = os.path.join(opts.datapath, opts.table_name + '_')
        annotMap = makeNLMSA([annotDB], dataPath=msaName)

        print('# Saving results to worldbase as %s and %s...' %
              (opts.save_resource, opts.save_resource+'_db'))
        worldbase.add_resource(opts.save_resource, annotMap)
        worldbase.add_resource(opts.save_resource+'_db', annotDB)
        worldbase.commit()
    finally:
        infile.close()