def _offline_counter(args):
    """Offline counting from a SAM/BAM file.

    Counts reads per reference with read_counter.count_reads, optionally
    augments the table with per-reference sequence features (length, GC
    content, k-mer frequencies) taken from the reference fasta, then writes
    the sorted table as TSV and/or a pickle.

    :param args: parsed argparse namespace; uses bam, f, a, Q, k, z, t, p.
    :returns: None (side effects: writes args.t / args.p when given).
    """
    # Offline counting from SAM/BAM file:
    counts = read_counter.count_reads(
        args.bam.name, in_format=args.f, min_aln_qual=args.a,
        verbose=not args.Q)
    counts = OrderedDict(six.iteritems(counts))

    # Bug fix: parse the comma-separated word sizes only when -k was given.
    # The original called args.k.split(",") unconditionally, raising
    # AttributeError when args.k is None, even though all downstream word
    # handling is guarded by "args.k is not None".
    calc_words = ([int(k) for k in args.k.split(",")]
                  if args.k is not None else [])

    data = OrderedDict()

    # Calculate sequence properties when a reference fasta (-z) is supplied:
    if args.z is not None:
        lengths, gc_contents = {}, {}
        word_freqs = defaultdict(lambda: defaultdict(dict))
        ref_iter = seq_util.read_seq_records(args.z)
        if not args.Q:
            sys.stderr.write("Calculating sequence features:\n")
            ref_iter = tqdm.tqdm(ref_iter)
        for ref in ref_iter:
            # Augment counts dictionary with missing reference entries:
            if ref.id not in counts:
                counts[ref.id] = 0
            lengths[ref.id] = len(ref)
            gc_contents[ref.id] = seq_util.gc_content(str(ref.seq))
            # calc_words is empty when -k was not given, so this is a no-op
            # in that case (same effect as the original's explicit guard):
            for word_size in calc_words:
                bf = seq_util.word_composition(ref.seq, word_size)
                for word, count in six.iteritems(bf):
                    # Normalise raw word counts by reference length:
                    word_freqs[word_size][ref.id][word] = \
                        float(count) / len(ref)
        data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)]
        data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)]

    data['Reference'] = list(counts.keys())
    data['Count'] = list(counts.values())

    # Add one column per word with its per-reference frequencies:
    if args.k is not None and args.z:
        for ks in calc_words:
            # NOTE(review): assumes at least one reference was seen for this
            # word size; an empty word_freqs[ks] would raise StopIteration.
            for word in next(iter(word_freqs[ks].values())).keys():
                data[word] = [word_freqs[ks][ref][word]
                              for ref in counts.keys()]

    data_frame = pd.DataFrame(data)
    data_frame = data_frame.sort_values(
        ['Count', 'Reference'], ascending=False)

    if args.t is not None:
        data_frame.to_csv(args.t, sep='\t', index=False)
    if args.p is not None:
        # Note: pickles the raw data dict, not the DataFrame (as before).
        misc.pickle_dump(data, args.p)
def _online_counter(args):
    """Online counting from a SAM stream on standard input.

    Consumes alignments from stdin via read_counter.count_reads_realtime and,
    each time the counter yields (every args.F reads), rewrites the TSV
    (args.t) and/or pickle (args.p) outputs so they track the stream.

    :param args: parsed argparse namespace; uses f, a, Q, F, t, p.
    :returns: None (side effects: repeatedly overwrites args.t / args.p).
    """
    # Stream counts from stdin; the iterator yields a counts mapping
    # every args.F processed reads:
    realtime_counts = read_counter.count_reads_realtime(
        alignment_file='-', in_format=args.f, min_aln_qual=args.a,
        verbose=not args.Q, yield_freq=args.F)

    for current in realtime_counts:
        frame = pd.DataFrame(OrderedDict([
            ('Reference', list(current.keys())),
            ('Count', list(current.values())),
        ]))
        frame = frame.sort_values(['Count', 'Reference'], ascending=False)
        # Overwrite outputs on every yield so they reflect the latest state:
        if args.t is not None:
            frame.to_csv(args.t, sep='\t', index=False)
        if args.p is not None:
            misc.pickle_dump(current, args.p)
type=str, help="Save pickled results in this file.", default=None) parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', type=argparse.FileType('r'), default=sys.stdin) if __name__ == '__main__': args = parser.parse_args() in_format = args.f input_iterator = seq_util.read_seq_records(args.input_fastx, format=in_format) total_bases = 0 for record in input_iterator: total_bases += len(record) results = {'total_bases': total_bases} print("Total bases\t{}".format(total_bases)) if args.s is not None: results['genome_size'] = args.s results['coverage'] = float(total_bases) / args.s print("Genome size\t{}".format(results['genome_size'])) print("Coverage\t{}".format(results['coverage'])) if args.p is not None: misc.pickle_dump(results, args.p)
verbose=verbose) read_stats = err_read_stats['read_stats'] error_stats = err_read_stats['events'] base_stats = err_read_stats['base_stats'] indel_stats = err_read_stats['indel_dists'] read_qual_qc(read_stats, plotter, args.i) base_stats_qc(base_stats, plotter) error_stat_qc(error_stats, plotter, context_sizes, ommit_diagonal=True) indel_dist_qc(indel_stats, plotter) pileup_stats = None if not args.x: pileup_stats = bam_stats.pileup_stats(args.bam, region=args.c, verbose=verbose) ref_qual_qc(pileup_stats, plotter, verbose) plotter.close() # Dump results of parsing into output pickle: rd = { 'tag': tag, 'read_stats': read_stats, 'error_stats': error_stats, 'indel_stats': indel_stats, 'pileup_stats': pileup_stats, 'base_stats': base_stats } misc.pickle_dump(rd, args.p)
plt.tight_layout() plotter.pages.savefig() stats.plot(kind='barh', subplots=True, legend=False, sharex=False) plt.tight_layout() plotter.pages.savefig() match.plot(kind='barh', subplots=True, legend=False) plt.tight_layout() plotter.pages.savefig() miss.plot(kind='barh', subplots=True, legend=False, sharex=False) plt.tight_layout() plotter.pages.savefig() novel.plot(kind='barh', subplots=True, legend=False, sharex=False) plt.tight_layout() plotter.pages.savefig() plotter.close() if args.p is not None: p = { 'total': total, 'stats': stats, 'match': match, 'miss': miss, 'novel': novel } misc.pickle_dump(p, args.p)
verbose=not args.Q) read_stats['tag'] = tag base_stats = read_stats['base_stats'] precision_stats = read_stats['read_stats'] base_stats_qc(base_stats, plotter) modes = read_precision_qc(precision_stats, plotter) plotter.close() global_stats = OrderedDict([ ('Accuracy', [read_stats['base_stats']['accuracy']]), ('AccuracyMode', modes['accuracy_mode']), ('Identity', [read_stats['base_stats']['identity']]), ('IdentityMode', modes['identity_mode']), ('Mapped', [read_stats['mapped']]), ('Unmapped', [read_stats['unmapped']]), ('Tag', [read_stats['tag']]), ]) global_stats = pd.DataFrame(global_stats) if args.g is not None: global_stats.to_csv(args.g, sep="\t", index=False) if args.l is not None: read_df = pd.DataFrame(precision_stats) read_df.to_csv(args.l, sep="\t", index=False) if args.p is not None: misc.pickle_dump(read_stats, args.p)
{'PerQuerySim': stats['PerQueryBaseSim']}, title="Distribution of percent bases with matched alignment", xlab="Percent bases with matched alignment", ylab="Count", legend=False) plotter.plot_histograms( {'PerQuerySimClipped': stats['PerQueryBaseSimClipped']}, title= "Distribution of percent bases with matched alignment (with clipping)", xlab="Percent bases with matched alignment", ylab="Count", legend=False) plotter.close() if args.p is not None: misc.pickle_dump(dict(stats), args.p) if args.t is not None: data_map = stats.copy() del data_map['PerQueryBaseSim'] del data_map['PerQueryBaseSimClipped'] for bam in data_map['BamFiles']: del data_map[bam] del data_map['BamFiles'] data_map = OrderedDict( (key, [value]) for key, value in six.iteritems(data_map)) data_frame = pd.DataFrame(data_map) data_frame.to_csv(args.t, sep="\t", index=False)