def astuple(self, encoding=None):
    """
    Return a tuple suitable for import into a database.

    Attributes field and extra field jsonified into strings. The order of
    fields is such that they can be supplied as arguments for the query
    defined in :attr:`gffutils.constants._INSERT`.

    If `encoding` is not None, then convert byte-string fields to unicode
    using the provided encoding. Fields that are already text are passed
    through unchanged, so the call no longer fails on values that have no
    ``decode`` method.

    Parameters
    ----------
    encoding : str or None
        Codec name used to decode byte-string fields. A falsy value
        disables decoding entirely.

    Returns
    -------
    Tuple
    """
    def _as_text(value):
        # Only byte strings carry a meaningful ``decode``; text strings
        # (and any non-string value) are returned untouched.
        if encoding and isinstance(value, bytes):
            return value.decode(encoding)
        return value

    return (
        _as_text(self.id),
        _as_text(self.seqid),
        _as_text(self.source),
        _as_text(self.featuretype),
        # start and end are never decoded.
        self.start,
        self.end,
        _as_text(self.score),
        _as_text(self.strand),
        _as_text(self.frame),
        _as_text(helpers._jsonify(self.attributes)),
        _as_text(helpers._jsonify(self.extra)),
        self.calc_bin(),
    )
def test_unjsonify():
    """
    Check that parsed attributes survive a jsonify/unjsonify round trip.
    """
    # The second item returned by the parser (the dialect) is not needed.
    parsed, _dialect = parser._split_keyvals('transcript_id "mRNA1"')
    expected_attributes = {'transcript_id': ['mRNA1']}
    assert parsed == expected_attributes, parsed

    serialized = helpers._jsonify(parsed)
    expected_json = '{"transcript_id":["mRNA1"]}'
    assert serialized == expected_json, serialized

    restored = helpers._unjsonify(serialized, isattributes=True)
    assert restored == parsed
def _finalize(self):
    """
    Last-minute work to perform once the file has been parsed and imported.

    Stores the directives, the meta row (version and jsonified dialect) and
    the autoincrement counters, (re)builds the query indexes, runs ANALYZE
    and commits. If you need to add anything to the meta table, this is the
    place to do it.
    """
    cursor = self.conn.cursor()

    directive_rows = ((directive,) for directive in self.iterator.directives)
    cursor.executemany(
        '''
        INSERT INTO directives VALUES (?)
        ''', directive_rows)

    meta_row = {
        'version': version.version,
        'dialect': helpers._jsonify(self.iterator.dialect),
    }
    cursor.execute(
        '''
        INSERT INTO meta (version, dialect)
        VALUES (:version, :dialect)''', meta_row)

    cursor.executemany(
        '''
        INSERT OR REPLACE INTO autoincrements VALUES (?, ?)
        ''', list(self._autoincrements.items()))

    # Original author's note: these indexes are *well* worth the effort and
    # extra storage -- over 500x speedup on code like this:
    #
    #   genes = []
    #   for i in db.features_of_type('snoRNA'):
    #       for k in db.parents(i, level=1, featuretype='gene'):
    #           genes.append(k.id)
    #
    # Each entry is (log message, index name, indexed table and columns).
    index_specs = (
        ("Creating relations(parent) index",
         'relationsparent', 'relations (parent)'),
        ("Creating relations(child) index",
         'relationschild', 'relations (child)'),
        ("Creating features(featuretype) index",
         'featuretype', 'features (featuretype)'),
        ("Creating features (seqid, start, end) index",
         'seqidstartend', 'features (seqid, start, end)'),
        ("Creating features (seqid, start, end, strand) index",
         'seqidstartendstrand', 'features (seqid, start, end, strand)'),
    )
    for announcement, index_name, target in index_specs:
        logger.info(announcement)
        cursor.execute('DROP INDEX IF EXISTS %s' % index_name)
        cursor.execute('CREATE INDEX %s ON %s' % (index_name, target))

    # Original author's note: speeds computation 1000x in some cases.
    logger.info("Running ANALYSE features")
    cursor.execute('ANALYZE features')

    self.conn.commit()

    self.warnings = self.iterator.warnings
def astuple(self, encoding=None):
    """
    Return a tuple suitable for import into a database, with attributes
    field and extra field jsonified into strings.

    If `encoding` is not None, then convert byte-string fields to unicode
    using the provided encoding. Values that are already text are returned
    as-is rather than failing for lack of a ``decode`` method.

    Parameters
    ----------
    encoding : str or None
        Codec name used to decode byte-string fields. A falsy value
        disables decoding entirely.

    Returns
    -------
    Tuple
        (id, seqid, source, featuretype, start, end, score, strand, frame,
        jsonified attributes, jsonified extra, bin)
    """
    def _decoded(value):
        # Decode only genuine byte strings; everything else passes through.
        if encoding and isinstance(value, bytes):
            return value.decode(encoding)
        return value

    string_fields_head = (
        self.id, self.seqid, self.source, self.featuretype,
    )
    string_fields_tail = (
        self.score,
        self.strand,
        self.frame,
        helpers._jsonify(self.attributes),
        helpers._jsonify(self.extra),
    )
    return (
        tuple(_decoded(value) for value in string_fields_head)
        # start and end are never decoded.
        + (self.start, self.end)
        + tuple(_decoded(value) for value in string_fields_tail)
        + (self.bin,)
    )
def _update_relations(self):
    """
    Infer transcript and gene extents from their subfeatures and import
    the inferred features into the database.

    Does nothing unless ``self.infer_gene_extent`` is set. The inferred
    features are first written to a temporary file (its path is stored in
    ``self._tmpfile``), then read back and inserted; the file is removed
    once the changes have been committed.

    The temporary file is a unique one created by :mod:`tempfile`. A
    previous hard-coded override of the path (``/tmp/gffutils``) has been
    removed: it orphaned the real temporary file and made concurrent runs
    write to the same location.
    """
    if not self.infer_gene_extent:
        return

    # TODO: do any indexes speed this up?

    c = self.conn.cursor()
    c2 = self.conn.cursor()

    logger.info("Creating relations(parent) index")
    c.execute('DROP INDEX IF EXISTS relationsparent')
    c.execute('CREATE INDEX relationsparent ON relations (parent)')
    logger.info("Creating relations(child) index")
    c.execute('DROP INDEX IF EXISTS relationschild')
    c.execute('CREATE INDEX relationschild ON relations (child)')

    logger.info('Inferring gene and transcript extents, '
                'and writing to tempfile')

    def _extent(parent_id):
        """Return (start, end, strand, seqid) spanning a parent's subfeatures."""
        c2.execute(
            '''
            SELECT MIN(start), MAX(end), strand, seqid
            FROM features
            JOIN relations ON
            features.id = relations.child
            WHERE parent = ? AND featuretype == ?
            ''', (parent_id, self.subfeature))
        return c2.fetchone()

    def _write_row(handle, fields):
        """Write one tab-delimited record to the temporary file."""
        handle.write('\t'.join(map(str, fields)) + '\n')

    n_features = 0
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as fout:
        tmp = fout.name
        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting
        # the distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now
        # consider each transcript a *child* -- so that relations.parent
        # (on the first line of the query) will be the first-level parent
        # of the transcript (the gene).
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only
        # one is added.  To ensure this, the results are ordered by the
        # gene ID.
        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a second cursor for
        # the per-feature queries) to infer the extent of transcripts and
        # genes.
        last_gene_id = None
        for transcript_id, gene_id in c:
            # transcript extent
            transcript_start, transcript_end, strand, seqid = \
                _extent(transcript_id)
            transcript_attributes = {
                self.transcript_key: [transcript_id],
                self.gene_key: [gene_id]
            }
            transcript_bin = bins.bins(
                transcript_start, transcript_end, one=True)

            # Write out to file; we'll be reading it back in shortly. Omit
            # score, frame, source, and extra since they will always have
            # the same default values (".", ".", "gffutils_derived", and []
            # respectively)
            _write_row(fout, [
                transcript_id,
                seqid,
                transcript_start,
                transcript_end,
                strand,
                'transcript',
                transcript_bin,
                helpers._jsonify(transcript_attributes)
            ])
            n_features += 1

            # Infer gene extent, but only if we haven't done so already.
            if gene_id != last_gene_id:
                gene_start, gene_end, strand, seqid = _extent(gene_id)
                gene_attributes = {self.gene_key: [gene_id]}
                gene_bin = bins.bins(gene_start, gene_end, one=True)
                _write_row(fout, [
                    gene_id,
                    seqid,
                    gene_start,
                    gene_end,
                    strand,
                    'gene',
                    gene_bin,
                    helpers._jsonify(gene_attributes)
                ])
                last_gene_id = gene_id
                n_features += 1

    def derived_feature_generator():
        """
        Generator of items from the file that was just created...
        """
        keys = ['parent', 'seqid', 'start', 'end', 'strand',
                'featuretype', 'bin', 'attributes']
        # The context manager guarantees the handle is closed before the
        # file is unlinked below.
        with open(tmp) as handle:
            for line in handle:
                d = dict(list(zip(keys, line.strip().split('\t'))))
                d.pop('parent')
                d['score'] = '.'
                d['source'] = 'gffutils_derived'
                d['frame'] = '.'
                d['extra'] = []
                d['attributes'] = helpers._unjsonify(d['attributes'])
                f = feature.Feature(**d)
                f.id = self._id_handler(f)
                yield f

    # Drop the indexes so the inserts are faster
    c.execute('DROP INDEX IF EXISTS relationsparent')
    c.execute('DROP INDEX IF EXISTS relationschild')

    # Insert the just-inferred transcripts and genes.  TODO: should we
    # *always* use "merge" here for the merge_strategy?
    logger.info("Importing inferred features into db")
    last_perc = None
    for i, f in enumerate(derived_feature_generator()):
        # n_features is non-zero whenever this loop body runs, because one
        # line was written per counted feature.
        perc = int(i / float(n_features) * 100)
        if perc != last_perc:
            sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
            sys.stderr.flush()
        last_perc = perc
        try:
            self._insert(f, c)
        except sqlite3.IntegrityError:
            # The ID already exists: merge attributes into the stored row.
            fixed, final_strategy = self._do_merge(f, 'merge')
            c.execute(
                '''
                UPDATE features SET attributes = ?
                WHERE id = ?
                ''', (helpers._jsonify(fixed.attributes),
                      fixed.id))

    logger.info("Committing changes")
    self.conn.commit()
    os.unlink(tmp)
def _populate_from_lines(self, lines):
    """
    Insert every feature from `lines` plus its first-order relations.

    Each feature gets its ID from ``self._id_handler``. When the insert
    collides with an existing ID, ``self._do_merge`` decides how to
    proceed. Relations are derived from the transcript and gene keys found
    in the feature's attributes.

    Raises
    ------
    ValueError
        If `lines` yields no features at all.
    """
    progress_msg = (
        "Populating features table and first-order relations: %d "
        "features\r"
    )
    cursor = self.conn.cursor()

    # Stays None only if the iterable turns out to be empty.
    last_index = None
    for last_index, feat in enumerate(lines):

        # Progress report every 1000 features
        if self.verbose and last_index % 1000 == 0:
            sys.stderr.write(progress_msg % last_index)
            sys.stderr.flush()

        feat.id = self._id_handler(feat)

        # Store the feature; fall back to the merge machinery on collision.
        try:
            self._insert(feat, cursor)
        except sqlite3.IntegrityError:
            fixed, final_strategy = self._do_merge(feat, self.merge_strategy)
            if final_strategy == 'create_unique':
                self._insert(feat, cursor)
            elif final_strategy in ('merge', 'replace'):
                cursor.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

                # Any additional fields being merged are updated too.
                if self.force_merge_fields:
                    assignments = ', '.join(
                        '%s = ?' % field
                        for field in self.force_merge_fields)
                    params = [getattr(fixed, field)
                              for field in self.force_merge_fields]
                    params.append(fixed.id)
                    cursor.execute(
                        '''
                        UPDATE features SET %s
                        WHERE id = ?
                        ''' % assignments, params)

        # For an on-spec GTF file,
        # self.transcript_key = "transcript_id"
        # self.gene_key = "gene_id"
        new_relations = []
        transcript = None
        if self.transcript_key in feat.attributes:
            transcript = feat.attributes[self.transcript_key][0]
            new_relations.append((transcript, feat.id, 1))

        if self.gene_key in feat.attributes:
            gene_values = feat.attributes[self.gene_key]
            if len(gene_values) > 0:
                gene = gene_values[0]
                new_relations.append((gene, feat.id, 2))
                if transcript is not None:
                    new_relations.append((gene, transcript, 1))

        # OR IGNORE: the same relation shows up over and over (e.g. the
        # transcript-gene pair on nearly every GTF line) but is stored once.
        cursor.executemany(
            '''
            INSERT OR IGNORE INTO relations (parent, child, level)
            VALUES (?, ?, ?)
            ''', new_relations
        )

    if last_index is None:
        raise ValueError("No lines parsed -- was an empty file provided?")

    logger.info('Committing changes')
    self.conn.commit()
    if self.verbose:
        logger.info(progress_msg % last_index)
def _populate_from_lines(self, lines):
    """
    Insert every feature from `lines` and its "Parent" relations.

    Indexes are dropped first via ``self._drop_indexes``. Each feature gets
    its ID from ``self._id_handler``; ID collisions are resolved through
    ``self._do_merge``. One level-1 relation is stored per value of the
    feature's ``Parent`` attribute.

    Raises
    ------
    ValueError
        If `lines` yields no features at all.
    """
    cursor = self.conn.cursor()
    self._drop_indexes()
    logger.info("Populating features")
    progress_msg = ("Populating features table and first-order relations: "
                    "%d features\r")

    # Original author's benchmark note: c.executemany() was not as much of
    # an improvement as expected. Compared to doing each insert separately,
    # a list of dicts was ~15% slower and a list of tuples ~8% faster.

    # Stays None only if the iterable turns out to be empty.
    last_index = None
    for last_index, feat in enumerate(lines):

        # Progress report every 1000 features
        if self.verbose and last_index % 1000 == 0:
            sys.stderr.write(progress_msg % last_index)
            sys.stderr.flush()

        # TODO: handle ID creation here...should be combined with the
        # INSERT below (that is, don't IGNORE below but catch the error and
        # re-try with a new ID).  However, is this doable with an
        # execute-many?
        feat.id = self._id_handler(feat)

        try:
            self._insert(feat, cursor)
        except sqlite3.IntegrityError:
            fixed, final_strategy = self._do_merge(feat, self.merge_strategy)
            if final_strategy == 'create_unique':
                self._insert(feat, cursor)
            elif final_strategy in ('merge', 'replace'):
                cursor.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

                # Any additional fields being merged are updated too.
                if self.force_merge_fields:
                    assignments = ', '.join(
                        '%s = ?' % field
                        for field in self.force_merge_fields)
                    params = tuple(
                        [getattr(fixed, field)
                         for field in self.force_merge_fields]
                        + [fixed.id])
                    cursor.execute(
                        '''
                        UPDATE features SET %s
                        WHERE id = ?
                        ''' % assignments, params)

        if 'Parent' not in feat.attributes:
            continue
        for parent_id in feat.attributes['Parent']:
            cursor.execute(
                '''
                INSERT OR IGNORE INTO relations VALUES
                (?, ?, 1)
                ''', (parent_id, feat.id))

    if last_index is None:
        raise ValueError("No lines parsed -- was an empty file provided?")

    self.conn.commit()
    if self.verbose:
        logger.info(progress_msg % last_index)
def _populate_from_lines(self, lines):
    """
    Insert every feature from `lines` plus its first-order relations.

    While scanning the first features, a warning is issued if a
    ``transcript`` or ``gene`` feature is encountered and the matching
    ``disable_infer_*`` option is not set. ID collisions are resolved via
    ``self._do_merge``; relations come from the transcript and gene keys in
    each feature's attributes.

    Raises
    ------
    ValueError
        If `lines` yields no features at all.
    """
    progress_msg = (
        "Populating features table and first-order relations: %d "
        "features\r"
    )
    cursor = self.conn.cursor()

    # Features are only inspected for the gene/transcript warning while
    # fewer than this many have been seen.
    check_limit = 1000

    n_seen = 0
    for idx, feat in enumerate(lines):

        # See issues #48 and #20.
        if n_seen < check_limit:
            if (
                feat.featuretype == 'transcript' and
                not self.disable_infer_transcripts
            ):
                warnings.warn(
                    "It appears you have a transcript feature in your GTF "
                    "file. You may want to use the "
                    "`disable_infer_transcripts` "
                    "option to speed up database creation")
            elif (
                feat.featuretype == 'gene' and
                not self.disable_infer_genes
            ):
                warnings.warn(
                    "It appears you have a gene feature in your GTF "
                    "file. You may want to use the "
                    "`disable_infer_genes` "
                    "option to speed up database creation")

        n_seen = idx + 1

        # Progress report every 1000 features
        if self.verbose and idx % 1000 == 0:
            sys.stderr.write(progress_msg % idx)
            sys.stderr.flush()

        feat.id = self._id_handler(feat)

        # Store the feature; fall back to the merge machinery on collision.
        try:
            self._insert(feat, cursor)
        except sqlite3.IntegrityError:
            fixed, final_strategy = self._do_merge(feat, self.merge_strategy)
            if final_strategy == 'replace':
                self._replace(feat, cursor)
            elif final_strategy == 'create_unique':
                self._insert(feat, cursor)
            elif final_strategy == 'merge':
                cursor.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

                # Any additional fields being merged are updated too.
                if self.force_merge_fields:
                    assignments = ', '.join(
                        '%s = ?' % field
                        for field in self.force_merge_fields)
                    params = [getattr(fixed, field)
                              for field in self.force_merge_fields]
                    params.append(fixed.id)
                    cursor.execute(
                        '''
                        UPDATE features SET %s
                        WHERE id = ?
                        ''' % assignments, params)

        # For an on-spec GTF file,
        # self.transcript_key = "transcript_id"
        # self.gene_key = "gene_id"
        new_relations = []
        transcript = None
        if self.transcript_key in feat.attributes:
            transcript = feat.attributes[self.transcript_key][0]
            new_relations.append((transcript, feat.id, 1))

        if self.gene_key in feat.attributes:
            gene_values = feat.attributes[self.gene_key]
            if len(gene_values) > 0:
                gene = gene_values[0]
                new_relations.append((gene, feat.id, 2))
                if transcript is not None:
                    new_relations.append((gene, transcript, 1))

        # OR IGNORE: the same relation shows up over and over (e.g. the
        # transcript-gene pair on nearly every GTF line) but is stored once.
        cursor.executemany(
            '''
            INSERT OR IGNORE INTO relations (parent, child, level)
            VALUES (?, ?, ?)
            ''', new_relations
        )

    if n_seen == 0:
        raise ValueError("No lines parsed -- was an empty file provided?")

    logger.info('Committing changes')
    self.conn.commit()
    if self.verbose:
        logger.info(progress_msg % idx)
def _populate_from_lines(self, lines):
    """
    Insert every feature from `lines` plus its first-order relations.

    Each feature gets its ID from ``self._id_handler``. When the insert
    collides with an existing ID, ``self._do_merge`` decides how to
    proceed. Relations are derived from the transcript and gene keys found
    in the feature's attributes.

    An empty `lines` iterable is tolerated: nothing is inserted, the
    (empty) transaction is committed, and the final progress message is
    skipped. Previously the verbose summary referenced the loop index even
    when the loop had never run, raising UnboundLocalError.

    Parameters
    ----------
    lines : iterable
        Feature objects to import.
    """
    msg = "Populating features table and first-order relations: %d " "features\r"

    c = self.conn.cursor()

    # Index of the most recent feature; stays None for empty input.
    last_index = None
    for i, f in enumerate(lines):
        last_index = i

        # Percent complete
        if self.verbose:
            if i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

        f.id = self._id_handler(f)

        # Insert the feature itself...
        try:
            self._insert(f, c)
        except sqlite3.IntegrityError:
            fixed, final_strategy = self._do_merge(f, self.merge_strategy)
            if final_strategy in ["merge", "replace"]:
                c.execute(
                    """
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    """,
                    (helpers._jsonify(fixed.attributes), fixed.id),
                )

                # For any additional fields we're merging, update those as
                # well.
                if self.force_merge_fields:
                    _set_clause = ", ".join(
                        ["%s = ?" % field for field in self.force_merge_fields]
                    )
                    values = [
                        getattr(fixed, field) for field in self.force_merge_fields
                    ] + [fixed.id]
                    c.execute(
                        """
                        UPDATE features SET %s
                        WHERE id = ?
                        """
                        % _set_clause,
                        values,
                    )

            elif final_strategy == "create_unique":
                self._insert(f, c)

        # For an on-spec GTF file,
        # self.transcript_key = "transcript_id"
        # self.gene_key = "gene_id"
        relations = []
        parent = None
        grandparent = None
        if self.transcript_key in f.attributes:
            parent = f.attributes[self.transcript_key][0]
            relations.append((parent, f.id, 1))

        if self.gene_key in f.attributes:
            grandparent = f.attributes[self.gene_key]
            if len(grandparent) > 0:
                grandparent = grandparent[0]
                relations.append((grandparent, f.id, 2))
                if parent is not None:
                    relations.append((grandparent, parent, 1))

        # Note the IGNORE, so relationships defined many times in the file
        # (e.g., the transcript-gene relation on pretty much every line in
        # a GTF) will only be included once.
        c.executemany(
            """
            INSERT OR IGNORE INTO relations (parent, child, level)
            VALUES (?, ?, ?)
            """,
            relations,
        )

    logger.info("Committing changes")
    self.conn.commit()
    # Only report a final count when at least one feature was processed.
    if self.verbose and last_index is not None:
        sys.stderr.write((msg % last_index) + "\n")
def _populate_from_lines(self, lines):
    """
    Insert every feature from `lines` and its "Parent" relations.

    Indexes are dropped first via ``self._drop_indexes``. Each feature gets
    its ID from ``self._id_handler``; ID collisions are resolved through
    ``self._do_merge``. One level-1 relation is stored per value of the
    feature's ``Parent`` attribute.

    An empty `lines` iterable is tolerated: nothing is inserted, the
    (empty) transaction is committed, and the final progress message is
    skipped. Previously the verbose summary referenced the loop index even
    when the loop had never run, raising UnboundLocalError.

    Parameters
    ----------
    lines : iterable
        Feature objects to import.
    """
    c = self.conn.cursor()
    self._drop_indexes()
    logger.info("Populating features")
    msg = "Populating features table and first-order relations: " "%d features\r"

    # c.executemany() was not as much of an improvement as I had expected.
    #
    # Compared to a benchmark of doing each insert separately:
    # executemany using a list of dicts to iterate over is ~15% slower
    # executemany using a list of tuples to iterate over is ~8% faster

    # Index of the most recent feature; stays None for empty input.
    last_index = None
    for i, f in enumerate(lines):
        last_index = i

        # Percent complete
        if self.verbose:
            if i % 1000 == 0:
                logger.info(msg % i)

        # TODO: handle ID creation here...should be combined with the
        # INSERT below (that is, don't IGNORE below but catch the error and
        # re-try with a new ID).  However, is this doable with an
        # execute-many?
        f.id = self._id_handler(f)

        try:
            self._insert(f, c)
        except sqlite3.IntegrityError:
            fixed, final_strategy = self._do_merge(f, self.merge_strategy)
            if final_strategy in ["merge", "replace"]:
                c.execute(
                    """
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    """,
                    (helpers._jsonify(fixed.attributes), fixed.id),
                )

                # For any additional fields we're merging, update those as
                # well.
                if self.force_merge_fields:
                    _set_clause = ", ".join(
                        ["%s = ?" % field for field in self.force_merge_fields]
                    )
                    values = [
                        getattr(fixed, field) for field in self.force_merge_fields
                    ] + [fixed.id]
                    c.execute(
                        """
                        UPDATE features SET %s
                        WHERE id = ?
                        """
                        % _set_clause,
                        tuple(values),
                    )

            elif final_strategy == "create_unique":
                self._insert(f, c)

        if "Parent" in f.attributes:
            for parent in f.attributes["Parent"]:
                c.execute(
                    """
                    INSERT OR IGNORE INTO relations VALUES
                    (?, ?, 1)
                    """,
                    (parent, f.id),
                )

    self.conn.commit()
    # Only report a final count when at least one feature was processed.
    if self.verbose and last_index is not None:
        logger.info(msg % last_index)
def _update_relations(self):
    """
    Infer transcript and/or gene extents from their subfeatures and import
    the inferred features into the database.

    Returns immediately when both ``self.disable_infer_genes`` and
    ``self.disable_infer_transcripts`` are set. Otherwise the inferred
    features are written to a temporary file (path stored in
    ``self._tmpfile``), read back, and inserted. The temporary file is
    removed after the commit unless ``self._keep_tempfiles`` is truthy; if
    ``self._keep_tempfiles`` is a string it is used as the file's suffix.

    Both the write and the read-back of the temporary file go through
    context managers, so the handles are closed even if inference fails
    part-way and are always closed before the file is unlinked.
    """
    if self.disable_infer_genes and self.disable_infer_transcripts:
        return

    # TODO: do any indexes speed this up?

    c = self.conn.cursor()
    c2 = self.conn.cursor()

    logger.info("Creating relations(parent) index")
    c.execute('DROP INDEX IF EXISTS relationsparent')
    c.execute('CREATE INDEX relationsparent ON relations (parent)')
    logger.info("Creating relations(child) index")
    c.execute('DROP INDEX IF EXISTS relationschild')
    c.execute('CREATE INDEX relationschild ON relations (child)')

    # At least one kind of inference is enabled here (see the guard above).
    if not (self.disable_infer_genes or self.disable_infer_transcripts):
        msg = 'gene and transcript'
    elif self.disable_infer_transcripts:
        msg = 'gene'
    else:
        msg = 'transcript'
    logger.info('Inferring %s extents '
                'and writing to tempfile' % msg)

    if isinstance(self._keep_tempfiles, six.string_types):
        suffix = self._keep_tempfiles
    else:
        suffix = '.gffutils'

    def _extent(parent_id):
        """Return (start, end, strand, seqid) spanning a parent's subfeatures."""
        c2.execute(
            '''
            SELECT MIN(start), MAX(end), strand, seqid
            FROM features
            JOIN relations ON
            features.id = relations.child
            WHERE parent = ? AND featuretype == ?
            ''', (parent_id, self.subfeature))
        return c2.fetchone()

    def _write_row(handle, fields):
        """Write one tab-delimited record to the temporary file."""
        handle.write('\t'.join(map(str, fields)) + '\n')

    n_features = 0
    with tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix=suffix) as fout:
        tmp = fout.name
        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting
        # the distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now
        # consider each transcript a *child* -- so that relations.parent
        # (on the first line of the query) will be the first-level parent
        # of the transcript (the gene).
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only
        # one is added.  To ensure this, the results are ordered by the
        # gene ID.
        #
        # By the way, we do this even if we're only looking for transcripts
        # or only looking for genes.
        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature, ))

        # Now we iterate through those results (using a second cursor for
        # the per-feature queries) to infer the extent of transcripts
        # and/or genes.
        last_gene_id = None
        for transcript_id, gene_id in c:

            if not self.disable_infer_transcripts:
                # transcript extent
                transcript_start, transcript_end, strand, seqid = \
                    _extent(transcript_id)
                transcript_attributes = {
                    self.transcript_key: [transcript_id],
                    self.gene_key: [gene_id]
                }
                transcript_bin = bins.bins(
                    transcript_start, transcript_end, one=True)

                # Write out to file; we'll be reading it back in shortly.
                # Omit score, frame, source, and extra since they will
                # always have the same default values (".", ".",
                # "gffutils_derived", and [] respectively)
                _write_row(fout, [
                    transcript_id,
                    seqid,
                    transcript_start,
                    transcript_end,
                    strand,
                    'transcript',
                    transcript_bin,
                    helpers._jsonify(transcript_attributes)
                ])
                n_features += 1

            if not self.disable_infer_genes:
                # Infer gene extent, but only if we haven't done so already
                if gene_id != last_gene_id:
                    gene_start, gene_end, strand, seqid = _extent(gene_id)
                    gene_attributes = {self.gene_key: [gene_id]}
                    gene_bin = bins.bins(gene_start, gene_end, one=True)
                    _write_row(fout, [
                        gene_id,
                        seqid,
                        gene_start,
                        gene_end,
                        strand,
                        'gene',
                        gene_bin,
                        helpers._jsonify(gene_attributes)
                    ])
                    last_gene_id = gene_id
                    n_features += 1

    def derived_feature_generator():
        """
        Generator of items from the file that was just created...
        """
        keys = [
            'parent', 'seqid', 'start', 'end', 'strand', 'featuretype',
            'bin', 'attributes'
        ]
        # The context manager guarantees the handle is closed before the
        # file is (optionally) unlinked below.
        with open(tmp) as handle:
            for line in handle:
                d = dict(list(zip(keys, line.strip().split('\t'))))
                d.pop('parent')
                d['score'] = '.'
                d['source'] = 'gffutils_derived'
                d['frame'] = '.'
                d['extra'] = []
                d['attributes'] = helpers._unjsonify(d['attributes'])
                f = feature.Feature(**d)
                f.id = self._id_handler(f)
                yield f

    # Drop the indexes so the inserts are faster
    c.execute('DROP INDEX IF EXISTS relationsparent')
    c.execute('DROP INDEX IF EXISTS relationschild')

    # Insert the just-inferred transcripts and genes.  TODO: should we
    # *always* use "merge" here for the merge_strategy?
    logger.info("Importing inferred features into db")
    last_perc = None
    for i, f in enumerate(derived_feature_generator()):
        # n_features is non-zero whenever this loop body runs, because one
        # line was written per counted feature.
        perc = int(i / float(n_features) * 100)
        if perc != last_perc:
            sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
            sys.stderr.flush()
        last_perc = perc
        try:
            self._insert(f, c)
        except sqlite3.IntegrityError:
            # The ID already exists: merge attributes into the stored row.
            fixed, final_strategy = self._do_merge(f, 'merge')
            c.execute(
                '''
                UPDATE features SET attributes = ?
                WHERE id = ?
                ''', (helpers._jsonify(fixed.attributes),
                      fixed.id))

    logger.info("Committing changes")
    self.conn.commit()
    if not self._keep_tempfiles:
        os.unlink(tmp)
def _populate_from_lines(self, lines):
    """
    Import the features in `lines` together with their first-order
    relations.

    Early in the scan a warning is emitted when a ``transcript`` or
    ``gene`` feature shows up while the corresponding ``disable_infer_*``
    option is off. Duplicate IDs are handed to ``self._do_merge``, and the
    returned strategy selects between merging, replacing, or inserting
    under a new unique ID.

    Raises
    ------
    ValueError
        If `lines` yields no features at all.
    """
    message = ("Populating features table and first-order relations: %d "
               "features\r")

    def _relations_of(feature_obj):
        """Build (parent, child, level) rows from transcript/gene keys."""
        # For an on-spec GTF file,
        # self.transcript_key = "transcript_id"
        # self.gene_key = "gene_id"
        rows = []
        transcript = None
        if self.transcript_key in feature_obj.attributes:
            transcript = feature_obj.attributes[self.transcript_key][0]
            rows.append((transcript, feature_obj.id, 1))
        if self.gene_key in feature_obj.attributes:
            genes = feature_obj.attributes[self.gene_key]
            if len(genes) > 0:
                gene = genes[0]
                rows.append((gene, feature_obj.id, 2))
                if transcript is not None:
                    rows.append((gene, transcript, 1))
        return rows

    db_cursor = self.conn.cursor()

    # Stop looking for gene/transcript features once this many features
    # have gone by.
    warn_window = 1000

    count = 0
    for position, item in enumerate(lines):

        # See issues #48 and #20.
        if count < warn_window:
            if (item.featuretype == 'transcript'
                    and not self.disable_infer_transcripts):
                warnings.warn(
                    "It appears you have a transcript feature in your GTF "
                    "file. You may want to use the "
                    "`disable_infer_transcripts` "
                    "option to speed up database creation")
            elif (item.featuretype == 'gene'
                    and not self.disable_infer_genes):
                warnings.warn(
                    "It appears you have a gene feature in your GTF "
                    "file. You may want to use the "
                    "`disable_infer_genes` "
                    "option to speed up database creation")

        count = position + 1

        # Progress report every 1000 features
        if self.verbose and position % 1000 == 0:
            sys.stderr.write(message % position)
            sys.stderr.flush()

        item.id = self._id_handler(item)

        # Try a plain insert first; a duplicate ID triggers the merge logic.
        try:
            self._insert(item, db_cursor)
        except sqlite3.IntegrityError:
            merged, strategy = self._do_merge(item, self.merge_strategy)
            if strategy == 'merge':
                db_cursor.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(merged.attributes), merged.id))

                # Additional merged fields get written back as well.
                if self.force_merge_fields:
                    set_clause = ', '.join(
                        '%s = ?' % name for name in self.force_merge_fields)
                    bound = [getattr(merged, name)
                             for name in self.force_merge_fields]
                    bound.append(merged.id)
                    db_cursor.execute(
                        '''
                        UPDATE features SET %s
                        WHERE id = ?
                        ''' % set_clause, bound)
            elif strategy == 'replace':
                self._replace(item, db_cursor)
            elif strategy == 'create_unique':
                self._insert(item, db_cursor)

        # IGNORE keeps repeated relations (such as the transcript-gene pair
        # present on almost every GTF line) from being stored twice.
        db_cursor.executemany(
            '''
            INSERT OR IGNORE INTO relations (parent, child, level)
            VALUES (?, ?, ?)
            ''', _relations_of(item))

    if count == 0:
        raise ValueError("No lines parsed -- was an empty file provided?")

    logger.info('Committing changes')
    self.conn.commit()
    if self.verbose:
        logger.info(message % position)
def _populate_from_lines(self, lines):
    """
    Import the features in `lines` together with their "Parent" relations.

    Existing indexes are dropped up front through ``self._drop_indexes``.
    Duplicate IDs are handed to ``self._do_merge``, and the returned
    strategy selects between merging, replacing, or inserting under a new
    unique ID. Every value of a feature's ``Parent`` attribute yields one
    level-1 relation row.

    Raises
    ------
    ValueError
        If `lines` yields no features at all.
    """
    db_cursor = self.conn.cursor()
    self._drop_indexes()
    logger.info("Populating features")
    message = ("Populating features table and first-order relations: "
               "%d features\r")

    # Original author's benchmark note: c.executemany() was not as much of
    # an improvement as expected. Compared to doing each insert separately,
    # a list of dicts was ~15% slower and a list of tuples ~8% faster.

    # Stays None only if the iterable turns out to be empty.
    position = None
    for position, item in enumerate(lines):

        # Progress report every 1000 features
        if self.verbose and position % 1000 == 0:
            sys.stderr.write(message % position)
            sys.stderr.flush()

        # TODO: handle ID creation here...should be combined with the
        # INSERT below (that is, don't IGNORE below but catch the error and
        # re-try with a new ID).  However, is this doable with an
        # execute-many?
        item.id = self._id_handler(item)

        # Try a plain insert first; a duplicate ID triggers the merge logic.
        try:
            self._insert(item, db_cursor)
        except sqlite3.IntegrityError:
            merged, strategy = self._do_merge(item, self.merge_strategy)
            if strategy == 'merge':
                db_cursor.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(merged.attributes), merged.id))

                # Additional merged fields get written back as well.
                if self.force_merge_fields:
                    set_clause = ', '.join(
                        '%s = ?' % name for name in self.force_merge_fields)
                    bound = tuple(
                        [getattr(merged, name)
                         for name in self.force_merge_fields]
                        + [merged.id])
                    db_cursor.execute(
                        '''
                        UPDATE features SET %s
                        WHERE id = ?
                        ''' % set_clause, bound)
            elif strategy == 'replace':
                self._replace(item, db_cursor)
            elif strategy == 'create_unique':
                self._insert(item, db_cursor)

        if 'Parent' not in item.attributes:
            continue
        for parent_id in item.attributes['Parent']:
            db_cursor.execute(
                '''
                INSERT OR IGNORE INTO relations VALUES
                (?, ?, 1)
                ''', (parent_id, item.id))

    if position is None:
        raise ValueError("No lines parsed -- was an empty file provided?")

    self.conn.commit()
    if self.verbose:
        logger.info(message % position)