def index(filename, k):
    '''Build a k-mer lookup for every record of a FASTA file and store it.

    Each record yielded by ``FastaIterator.parse`` is handed to
    ``dna2int.update_lookup`` together with one lookup table shared by all
    records, the running offset (the summed length of all records seen so
    far) and ``k``.  When the whole file has been read, the lookup, the
    list of ``(record id, sequence length)`` pairs and ``k`` are passed to
    ``store_index``.

    Progress and timing information is printed to standard output.

    Parameters:
        filename -- path of the FASTA file to index.
        k        -- k-mer length, forwarded unchanged to the helpers.

    Returns:
        None.  The result is whatever ``store_index`` writes under the
        name ``<filename without its last extension>.mfe_index``.
    '''
    start = time.time()
    print('indexing %s' % filename)

    # NOTE(review): a filename without any '.' yields the bare name
    # '.mfe_index'.  Left as is because other code may derive the index
    # name the same way -- confirm before changing.
    dbname = '.'.join(filename.split('.')[:-1]) + '.mfe_index'

    kmer_lookup = collections.defaultdict(list)
    contig_lengths = []
    total_offset = 0

    # The context manager guarantees the input handle is released even if
    # parsing or the lookup update raises.
    with open(filename) as handle:
        for record in FastaIterator.parse(handle):
            print(record.id)
            start_time = time.time()

            fasta_seq = record.seq
            seq_length = len(fasta_seq)

            dna2int.update_lookup(kmer_lookup, fasta_seq, total_offset, k)
            contig_lengths.append((record.id, seq_length))
            # The next record starts where this one ended.
            total_offset += seq_length

            print('%i bp took %.2f seconds'
                  % (seq_length, time.time() - start_time))

    store_index(dbname, kmer_lookup, contig_lengths, k)

    print("Time used: %s" % str(time.time() - start))
    print('Done.')
def _merge_record_hits(merged, record_hits, record_id):
    '''Fold one record's per-k-mer position strings into the running table.

    Every non-empty entry of ``record_hits`` is added to the entry with the
    same index in ``merged`` as ``<record_id>:<positions>``; an entry that
    already holds text gets the new piece appended after a ``;``.
    ``merged`` is modified in place.
    '''
    for mer_id, positions in enumerate(record_hits):
        if not positions:
            continue
        entry = '%s:%s' % (record_id, positions)
        if merged[mer_id]:
            merged[mer_id] += ';' + entry
        else:
            merged[mer_id] = entry


def index(filename, k):
    '''Index the k-mers of a FASTA file into a sqlite database.

    The database is named ``<filename without its last extension>.sqlite3.db``.
    Its ``pos`` table is dropped and recreated with the columns
    ``mer_id``, ``plus`` and ``minus``.

    For every record, each k-mer window is converted by ``DNA2int_2`` into
    a pair ``(plus_mer_id, minus_mer_id)``.  For a window starting at
    0-based offset ``i`` the value ``i + k - 1`` is recorded under the plus
    id and ``i`` under the minus id.  Positions of one record are joined
    with ``,``; records are stored as ``<record id>:<positions>`` and
    joined with ``;``.  Windows for which ``DNA2int_2`` raises (for example
    because of an unrecognized base such as ``N``) are skipped.

    After each record, if ``get_memory_percent()`` reports more than 50,
    the accumulated tables are handed to ``insert_db`` (first time) or
    ``update_db`` (afterwards) and reset.  Anything still held in memory
    at the end is written the same way.

    Parameters:
        filename -- path of the FASTA file to index.
        k        -- k-mer length; the tables hold ``4**k`` entries.

    Returns:
        None.  Progress and timing are printed to standard output.
    '''
    start = time()
    mer_count = 4**k
    # NOTE(review): a filename without any '.' yields the bare name
    # '.sqlite3.db'.  Left as is because other code may derive the database
    # name the same way -- confirm before changing.
    dbname = '.'.join(filename.split('.')[:-1]) + '.sqlite3.db'

    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        cur.executescript('''
            drop table if exists pos;
            create table pos(
                mer_id integer primary key,
                plus text,
                minus text
            );''')

        plus = [''] * mer_count
        minus = [''] * mer_count
        is_empty = False
        is_db_new = True

        # The context manager releases the input handle even on failure.
        with open(filename) as handle:
            for record in FastaIterator.parse(handle):
                is_empty = False
                print(record.id)
                fasta_seq = record.seq

                plus_mer_list = [''] * mer_count
                minus_mer_list = [''] * mer_count

                # A sequence of length n holds n - k + 1 windows of length
                # k.  The previous loop stopped one short and never indexed
                # the final window of a record.
                kmer_total = len(fasta_seq) - k + 1
                for i in xrange(kmer_total):
                    try:
                        plus_mer_id, minus_mer_id = DNA2int_2(fasta_seq[i:i + k])
                    except Exception:
                        # Skip windows with an unrecognized base, such as
                        # 'N'.  Catching Exception rather than everything
                        # keeps Ctrl-C and SystemExit working.
                        continue

                    plus_pos = str(i + k - 1)
                    if plus_mer_list[plus_mer_id]:
                        plus_mer_list[plus_mer_id] += ',' + plus_pos
                    else:
                        plus_mer_list[plus_mer_id] = plus_pos

                    minus_pos = str(i)
                    if minus_mer_list[minus_mer_id]:
                        minus_mer_list[minus_mer_id] += ',' + minus_pos
                    else:
                        minus_mer_list[minus_mer_id] = minus_pos

                    done = i + 1
                    if not done % 100000:
                        # 100.0 forces float division; plain int division
                        # truncated the percentage under Python 2.
                        print("%s: %.2f%%, %s" % (
                            record.id,
                            100.0 * done / kmer_total,
                            str(datetime.timedelta(seconds=(time() - start)))))

                _merge_record_hits(plus, plus_mer_list, record.id)
                _merge_record_hits(minus, minus_mer_list, record.id)

                memory_percent = get_memory_percent()
                if memory_percent > 50:
                    if is_db_new:
                        insert_db(conn, mer_count, plus, minus)
                        is_db_new = False
                    else:
                        update_db(conn, mer_count, plus, minus)
                    # Empty the containers so memory can be reclaimed.
                    plus = [''] * mer_count
                    minus = [''] * mer_count
                    is_empty = True
                    print('Empty plus and minus due to the memory: %s.'
                          % memory_percent)

        # Write whatever has accumulated since the last flush.
        if not is_empty:
            if is_db_new:
                insert_db(conn, mer_count, plus, minus)
            else:
                update_db(conn, mer_count, plus, minus)

        # Harmless if the helpers already committed; protects the data if
        # they did not, since closing discards an open transaction.
        conn.commit()
    finally:
        conn.close()

    print("Time used: %s" % str(datetime.timedelta(seconds=(time() - start))))
    print('Done.')