def _filter_bad_species(cursor, read2tax, bad_species, tax_tree):
    questionable_reads = []
    for read, tax in read2tax.iteritems():
        if tax in bad_species:
            questionable_reads.append(read)
    with timeit('EXTRACTING BAD SPECIES READ INFO'):
        cursor.execute(
            'SELECT * FROM alignment WHERE read_id IN (%s)' %
            (','.join(['?'] * len(questionable_reads))),
            questionable_reads)
        data = cursor.fetchall()
        read2aln = defaultdict(list)
        for d in data:
            read2aln[d[0]].append(d[1:])
    new_read2tax = {}
    with timeit('ASSIGNING NEW OTU TO BAD SPECIES READS'):
        reads = set(map(lambda d: d[0], data))
        for read in reads:
            alns = read2aln[read]
            # alns = map(lambda d: d[1:], filter(lambda d: d[0] == read, data))
            new_read2tax[read] = assign_OTU_from_scores(
                alns, tax_tree, bad_species)
    return new_read2tax

def parse_tab_delimited(input_file, db_type, at_once=1e5, detailed=False,
                        read_alns=None, target_seqs=None,
                        filter_low_scoring=True, annotate=True,
                        entry_cnt=None):
    # Mutable default arguments ({} / defaultdict) are shared between calls
    # in Python; create fresh containers per call instead.
    if read_alns is None:
        read_alns = defaultdict(list)
    if target_seqs is None:
        target_seqs = {}
    if entry_cnt is None:
        with timeit('Line count'):
            with open(input_file) as f:
                line_count = sum(1 for line in f)
    else:
        line_count = entry_cnt
    rolls = int(line_count / at_once + 1)
    with timeit('Pure file parsing'):
        with open(input_file, 'r') as fin:
            for i in xrange(rolls):
                with oneline_timeit('Step %d/%d' % (i, rolls)):
                    new_reads = _parse_n_lines(fin, at_once, read_alns,
                                               target_seqs, db_type)
                    if filter_low_scoring:
                        _filter_low_scoring(read_alns, 100., 0.01, new_reads)
    if not annotate:
        return read_alns, target_seqs
    else:
        return annotate_targets(read_alns, target_seqs)

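# A minimal usage sketch (the file name and the 'cds' db_type below are
# hypothetical, not taken from this codebase):
#
#     read_alns, target_seqs = parse_tab_delimited(
#         'alignments.blast.tab', 'cds', annotate=False)
#     print 'Parsed alignments for %d reads' % len(read_alns)
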
def fill_blast_aln_table(db, cursor, data_access, aln_file):
    gis = set()
    sql = ('INSERT INTO %s_tmp(read_id, version, nucl_gi, tax_id, score, '
           'start, end, strand) VALUES (?,?,?,?,?,?,?,?)' % _ALN_TABLE_NAME_)
    # First pass: collect all GIs so the tax IDs can be fetched in one batch.
    with timeit('PARSING BLAST'):
        for read_id, aln_data in BLASTParser().parse_file(aln_file):
            gis.add(aln_data.gi)
    with timeit('LOADING TAXIDS'):
        tax_ids = data_access.get_taxids(list(gis), format=dict)
    # Second pass: insert the alignments with their resolved tax IDs.
    with timeit('INSERTING INTO DATABASE'):
        for read_id, aln_data in BLASTParser().parse_file(aln_file):
            tax_id = tax_ids.get(aln_data.gi, -1)
            cursor.execute(sql, (read_id, aln_data.nucleotide_accession,
                                 aln_data.gi, tax_id, aln_data.score,
                                 aln_data.start, aln_data.stop,
                                 aln_data.strand))
    # Rewrite the temporary table into the final one, ordered by read_id.
    with timeit('REORDERING DATA'):
        cursor.execute('CREATE TABLE alignment(read_id TEXT, version TEXT, '
                       'nucl_gi INTEGER, tax_id INTEGER, score REAL, '
                       'start INTEGER, end INTEGER, strand TEXT)')
        cursor.execute('INSERT INTO alignment (read_id, version, nucl_gi, '
                       'tax_id, score, start, end, strand) '
                       'SELECT read_id, version, nucl_gi, tax_id, score, '
                       'start, end, strand FROM alignment_tmp '
                       'ORDER BY read_id')
        cursor.execute('DROP TABLE alignment_tmp')
    db.commit()
    db.close()

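# A minimal usage sketch, assuming an sqlite3 connection and the DataAccess
# helper used elsewhere in this codebase (file names are hypothetical):
#
#     db = sqlite3.connect('alignments.db')
#     fill_blast_aln_table(db, db.cursor(), DataAccess(), 'sample.blast')
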
def parse_cds_sam(input_file, db_type, binary=False, annotate=True,
                  read_alns=None, target_seqs=None, entry_cnt=None,
                  detailed=False):
    # Mutable default arguments are shared between calls in Python;
    # create fresh containers per call instead.
    if read_alns is None:
        read_alns = defaultdict(list)
    if target_seqs is None:
        target_seqs = {}
    with timeit('Pysam load of the %s file (%s)' %
                (input_file.split(os.path.sep)[-1],
                 get_appropriate_file_size(input_file))):
        if binary:
            samfile = pysam.Samfile(input_file, 'rb')
        else:
            samfile = pysam.Samfile(input_file, 'r')
    if entry_cnt is not None:
        # max() guards against a zero divisor when entry_cnt < 100.
        _1perc_reads = max(1, entry_cnt / 100)
    data_access = DataAccess()
    unmapped = set()
    lengths = np.array(samfile.lengths)
    with timeit('Sequence iteration'):
        for i, ar in enumerate(samfile.fetch()):
            if entry_cnt is not None:
                if i % _1perc_reads == 0:
                    print '%d %%' % (i / _1perc_reads)
            else:
                if i % 10000 == 0:
                    print 'Processed %d aligned reads.' % i
            if ar.is_unmapped:
                unmapped.add(ar.qname)
                continue
            read_id = ar.qname.split()[0]
            tstart, tend = ar.aend - ar.alen, ar.aend
            qstart, qend = ar.qstart, ar.qend
            if ar.tid == -1:
                continue
            target_name = samfile.getrname(ar.tid)
            target_len = lengths[ar.tid]
            target_seq = create_sequence(target_name, target_len, db_type)
            if target_seq.get_id() not in target_seqs:
                target_seqs[target_seq.get_id()] = target_seq
            aln = SamAlignment(read_id, target_seq.get_id(), qstart, qend,
                               tstart, tend, ar.mapq)
            if not ar.is_secondary:
                aln.is_best = True
            read_alns[read_id].append(aln)
    # Close the file before returning; the original close() call sat after
    # the return statements and was never reached.
    samfile.close()
    if not annotate:
        return read_alns, target_seqs
    else:
        return annotate_targets(read_alns, target_seqs)

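# A minimal usage sketch (the .bam path and 'cds' db_type are hypothetical):
#
#     read_alns, target_seqs = parse_cds_sam('sample.bam', 'cds',
#                                            binary=True, annotate=False)
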
def analyze():
    parser = get_aln_option_parser()
    namespace = parser.parse_args()
    db = sqlite3.connect(namespace.db_file)
    cursor = db.cursor()
    empty_reads_file = os.path.sep.join(
        [namespace.output_dir, 'no_alignment_reads.fa'])
    basic_stats_file = os.path.sep.join(
        [namespace.output_dir, 'basic_alignment_stats.txt'])
    phylum_composition_file = os.path.sep.join(
        [namespace.output_dir, 'phylum_composition.csv'])
    no_otu_reads_file = os.path.sep.join(
        [namespace.output_dir, 'no_OTU_assignment_reads.fa'])
    otu_csv = os.path.sep.join([namespace.output_dir, 'OTU_assignment.csv'])
    otu_json = os.path.sep.join([namespace.output_dir, 'OTU_assignment.json'])
    with timeit('LOADING TAX TREE'):
        tt = TaxTree()
    read2tax, low_count_taxa = analysis.perform_OTU_analysis_on_db(
        db, cursor, tt, empty_reads_file, basic_stats_file,
        phylum_composition_file)
    taxa = set(read2tax.values())
    num_alns = len(read2tax)
    no_otu_reads = filter(lambda r: read2tax[r] == -1, read2tax.keys())
    save_reads_to_fasta(db, cursor, no_otu_reads, no_otu_reads_file)
    export_OTU_to_csv(read2tax, tt, otu_csv)
    export_OTU_to_json(read2tax, tt, otu_json)
    db.close()

def annotate_targets(read_alns, target_seqs):
    with timeit('Removing reads with 0 alignments'):
        zero_aln_reads = filter(lambda r: len(read_alns[r]) == 0, read_alns)
        for read in zero_aln_reads:
            read_alns.pop(read)
        print 'Removed %d reads' % len(zero_aln_reads)
    # Annotate the best alignment for all the reads.
    with timeit('Adding alignments to target sequences'):
        total_reads = len(read_alns)
        step = max(1, total_reads / 100)
        print 'Number of reads to annotate:', total_reads
        progressbar(0, start=True)
        for i, (read, alns) in enumerate(read_alns.iteritems()):
            if i % step == 0:
                progressbar(i / step)
            alns.sort(reverse=True)
            max_score = alns[0].bitscore
            # Only the top 10% of a read's alignments (at least one) are
            # candidates, and only those tying the maximum bitscore are
            # flagged as best.
            for aln in alns[:max(1, int(0.1 * len(alns)))]:
                if aln.bitscore == max_score:
                    aln.is_best = True
                else:
                    aln.is_best = False
        progressbar(100, end=True)
    with timeit('Functional transcript annotation'):
        for read, alns in read_alns.iteritems():
            for aln in alns:
                target = target_seqs[aln.target_id]
                if aln.is_best:
                    target.num_of_ba_reads += 1
                target.num_of_aln_reads += 1
                target.add_alignment(aln)
    with timeit('Removing 0 alignments target seqs'):
        zero_aln_targets = filter(
            lambda t: len(target_seqs[t].get_alignments()) == 0,
            target_seqs.iterkeys())
        for target in zero_aln_targets:
            target_seqs.pop(target)
        print 'Removed %d targets.' % len(zero_aln_targets)
    for target in target_seqs.itervalues():
        target.join_regions()
        target.join_ba_regions()
    return read_alns, target_seqs

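# annotate_targets() is normally reached through the parse_* functions with
# annotate=True; a direct call would look like this sketch (inputs produced
# by a prior parse with annotate=False; the file name is hypothetical):
#
#     read_alns, target_seqs = parse_cds_sam('sample.sam', 'cds',
#                                            annotate=False)
#     read_alns, target_seqs = annotate_targets(read_alns, target_seqs)
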
def generate_consensus():
    parser = get_consensus_parser()
    args = parser.parse_args()
    if args.seq_file is not None:
        generate_seq, seq = True, args.seq_file
    else:
        generate_seq, seq = False, None
    with utils.timeit('Loading tax tree'):
        tt = tax.TaxTree()
    with utils.timeit('Parsing aln file %s <%s>' %
                      (args.aln_file,
                       utils.get_appropriate_file_size(args.aln_file))):
        read_alns, target_seqs = din.parse(args.aln_file, args.db_type,
                                           detailed=True)
    with utils.timeit('Retrieving tax data'):
        utils.retrieve_tax_ids(target_seqs, args.db_type)
    with utils.timeit('Creating consensuses'):
        with open(args.output_vcf, 'w') as fout:
            filters = consensus_tools.get_filters()
            infos = consensus_tools.get_infos()
            fout.write(vcf_header(filters, infos, args.ref_db))
            fout.write('\n')
            # NOTE: this header (and the records below) omit the QUAL column
            # of the standard VCF layout.
            fout.write('#CHROM\tPOS\tID\tREF\tALT\tFILTER\tINFO\n')
            for ts in target_seqs.itervalues():
                snps = 0
                # The per-variant dict is named var_infos so it does not
                # shadow the INFO definitions fetched above.
                for loc, cov, e, fres, var_infos in \
                        consensus_tools.itervariants(
                            ts.alignment, args.output_vcf,
                            generate_seq, seq):
                    snps += 1
                    filter_output = 'PASS' if fres == [] else ';'.join(fres)
                    info_output = ';'.join(
                        '%s=%s' % (k, v)
                        for k, v in var_infos.iteritems())
                    fout.write('%s\t%d\t.\t%s\t%s\t%s\t%s\n' %
                               (ts.get_id(), loc, e.target, e.query,
                                filter_output, info_output))
                print '-' * 80
                print 'Organism: ', tt.get_org_name(ts.tax_id)
                print 'Sequence length: ', ts.alignment.length
                print 'Total coverage: ', ts.alignment.get_coverage()
                print 'Fold: ', ts.alignment.get_fold()
                print 'Confirmed SNPs: ', snps
                print

def __init__(self, seq_dir, freq_tool):
    super(SpeciesCorpus, self).__init__()
    self.seq_dir = seq_dir
    self.freq_tool = freq_tool
    # name2tax.txt lines have the form '<tax_id>|<name>'; build the reverse
    # mapping, name -> integer tax ID.
    with open(os.path.sep.join([seq_dir, 'name2tax.txt'])) as fin:
        self.name2tax = dict(map(lambda d: (d[1], int(d[0])),
                                 [l.strip().split('|') for l in fin]))
    self.index2tax = {}
    with timeit('Initializing dictionary'):
        self.dictionary = corpora.Dictionary(
            (data for data in iter_over_species_files(self.seq_dir)),
            freq_tool=freq_tool)

def lsi_corpus():
    parser = get_freq_analysis_parser()
    namespace = parser.parse_args()
    seq_dir = namespace.seq_dir
    freq_analysis_tool = namespace.freq_analysis_tool
    with timeit('Calculating species corpus'):
        sc = SpeciesCorpus(seq_dir, freq_analysis_tool)
        corpora.MmCorpus.serialize('/home/abulovic/tmp/bact_corpus.mm', sc)
        sc.dictionary.save('/home/abulovic/tmp/bact_triplets.dict')

def parse():
    parser = get_parse_option_parser()
    namespace = parser.parse_args()
    if namespace.type != 'cds':
        raise NotImplementedError('Genome parsing will be up in a jiffy.')
    file_type = get_file_type(namespace.input_file)
    with timeit('Parsing alignment file'):
        if file_type == 'blast':
            read_alns, transcripts = parse_cds_megablast(
                namespace.input_file, namespace.output_dir)
        elif file_type == 'sam':
            read_alns, transcripts = parse_cds_sam(
                namespace.input_file, namespace.output_dir, binary=False)
        elif file_type == 'bam':
            read_alns, transcripts = parse_cds_sam(
                namespace.input_file, namespace.output_dir, binary=True)
    with timeit('Loading tax tree'):
        tax_tree = TaxTree()
    spec2trans = get_species_transcript_distribution(transcripts, tax_tree)
    for spec, trans in spec2trans.iteritems():
        # Compute the >90%-coverage transcripts once instead of twice.
        well_covered = filter(lambda t: t.total_coverage > 0.9, trans)
        if len(well_covered) > 0:
            if spec <= 0:
                continue
            print tax_tree.nodes[spec].organism_name,
            print len(trans)
            print ' '.join(map(lambda t: '(%.3f, %.3f)' %
                               (t.total_coverage, t.coverage_fold),
                               well_covered))
            print
    get_read_aln_distribution(
        read_alns, os.path.sep.join([namespace.output_dir, 'plot.png']))
    get_spec_transcript_distribution(
        spec2trans, os.path.sep.join([namespace.output_dir, 'plot2.png']))

def annotate_targets(read_alns, target_seqs):
    with timeit('Semantic target sequence annotation'):
        for read, alns in read_alns.iteritems():
            for aln in alns:
                target = target_seqs[aln.target_id]
                if aln.is_best:
                    target.num_of_ba_reads += 1
                target.num_of_aln_reads += 1
                target.add_alignment(aln)
    for target in target_seqs.itervalues():
        target.join_regions()
        target.join_ba_regions()
    return read_alns, target_seqs

def fill_read_table(db, cursor, fasta1, fasta2, format, pair_end, store_seq):
    if pair_end:
        with nested(open(fasta1), open(fasta2)) as (fin1, fin2):
            records1 = SeqIO.parse(fin1, format)
            records2 = SeqIO.parse(fin2, format)
            for rec1, rec2 in izip(records1, records2):
                if store_seq:
                    cursor.execute(
                        'INSERT INTO %s(read_id, sequence1, sequence2) '
                        'VALUES (?,?,?)' % _READ_TABLE_NAME_,
                        (rec1.id, str(rec1.seq), str(rec2.seq)))
                else:
                    cursor.execute(
                        'INSERT INTO %s(read_id) VALUES (?)' %
                        _READ_TABLE_NAME_, (rec1.id,))
    else:
        with timeit('STORING SEQUENCES'):
            for rec in iter_input_records(fasta1, format):
                if store_seq:
                    cursor.execute(
                        'INSERT INTO %s(read_id, sequence) VALUES (?,?)' %
                        _READ_TABLE_NAME_, (rec.id, str(rec.seq)))
                else:
                    cursor.execute(
                        'INSERT INTO %s(read_id) VALUES (?)' %
                        _READ_TABLE_NAME_, (rec.id,))
    db.commit()
    db.close()

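# A minimal single-end usage sketch (paths are hypothetical; pass the second
# FASTA and pair_end=True for paired reads):
#
#     db = sqlite3.connect('reads.db')
#     fill_read_table(db, db.cursor(), 'reads.fa', None, 'fasta',
#                     pair_end=False, store_seq=True)
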
def greedy():
    parser = get_OTU_assign_option_parser()
    args = parser.parse_args()
    if args.read_count != 0:
        total_read_count = args.read_count
    all_reads = utils.reads_from_fasta(args.original_fasta)
    with Report(args.output_dir, 'microbe') as report:
        file_type = utils.get_file_type(args.input_file)
        with report.timeit('Parsing %s <%s>' %
                           (args.input_file,
                            utils.get_appropriate_file_size(args.input_file))):
            if file_type == 'blast':
                count_entries = blast.get_entry_cnt_tab
                parse_func = blast.parse_tab_delimited
            elif file_type in ('sam', 'bam'):
                count_entries = sam.get_entry_cnt_sam
                parse_func = sam.parse_cds_sam
            elif file_type == 'xml':
                count_entries = blast.get_entry_cnt_xml
                parse_func = blast.parse_xml
            else:
                raise ValueError('%s alignment format not supported!' %
                                 file_type)
            with utils.timeit('Retrieving entry count'):
                entry_cnt = count_entries(args.input_file)
            report.mark('\tTotal entries: %d' % entry_cnt)
            with utils.timeit('File parsing'):
                read_alns, target_seqs = parse_func(
                    args.input_file, args.db_type,
                    entry_cnt=entry_cnt, detailed=True)
            report.mark('\tTarget sequences: %d' % len(target_seqs))
        with report.timeit('GI2TAX database querying'):
            utils.retrieve_tax_ids(target_seqs, args.db_type)
        with report.timeit('Loading tax tree'):
            tt = TaxTree()
            report.mark('Loaded %d nodes.' % len(tt.nodes))
        # profiling.get_read_overlap(target_seqs, read_alns, tt, all_reads)
        # profiling.sequential_read_set_analysis(read_alns, target_seqs, tt)
        coverage_limit = 0.6
        fold_limit = 1.
        with report.timeit('Greedy transcript assignment'):
            report.mark('Coverage threshold: %.2f' % coverage_limit)
            report.mark('Fold threshold: %.2f' % fold_limit)
            report.mark('#Transcripts (before read assignment) : %d' %
                        len(target_seqs))
            prefilt_transcripts = otu.greedy_transcript_assign(target_seqs,
                                                               read_alns)
            report.mark('#Transcripts (after read assignment)  : %d' %
                        len(prefilt_transcripts))
            # if args.db_type == 'cds':
            final_transcripts = otu.filter_by_coverage_fold(
                prefilt_transcripts, coverage_limit, fold_limit)
            # else:
            #     final_transcripts = prefilt_transcripts
            report.mark('#Transcripts (after cov-fold filtering): %d' %
                        len(final_transcripts))
        total_reads = len(read_alns)
        with report.timeit('Species assignment stats'):
            s2t_nofilt = utils.get_species_transcript_distribution(
                target_seqs, tt)
            s2t_greedy = utils.get_species_transcript_distribution(
                final_transcripts, tt)
            report.mark('#Species (pre-read-assignment) : %d' %
                        len(s2t_nofilt))
            report.mark('#Species (post-read-assignment): %d' %
                        len(s2t_greedy))
            report.rank_distribution(s2t_nofilt, tt, 'nofilt')
            report.rank_distribution(s2t_greedy, tt, 'greedy')
            report.tax_tree(s2t_nofilt, tt, 'nofilt')
            report.tax_tree(s2t_greedy, tt, 'greedy')
        if args.db_type == 'cds':
            new_s2t = otu.remove_orthologue_strains(s2t_greedy)
        else:
            new_s2t = s2t_greedy
        if args.db_type == 'cds':
            with report.timeit('Transcript stats'):
                report.transcript_stats(s2t_nofilt, tt, 'nofilt')
                report.transcript_stats(new_s2t, tt, 'greedy', assigned=True)
            report.tax2reads(new_s2t, tt, 'greedy', 'json')
        if args.db_type == 'cds':
            with report.timeit('Outputting gene expression'):
                report.gene_expression(s2t_nofilt, tt,
                                       'gene_expression_nofilt',
                                       assigned=False)
                report.gene_expression(new_s2t, tt, 'gene_expression',
                                       assigned=True)
        report.summary(read_alns, s2t_nofilt, new_s2t, args.original_fasta)