def __print_scaffold_stats(si_dict, s_dict, cg_dict, cov_dict, tax_dict, agp_obj):
    """Build and print/write the per-scaffold detail table.

    si_dict  -- per-contig info dict (length, gc, ...)
    s_dict   -- scaffold name -> list of member contigs
    cg_dict  -- contig GC lookup used by __get_scaffold_gc
    cov_dict -- coverage lookup used by __get_scaffold_coverage
    tax_dict -- BLAST taxonomy lookup used by __get_scaffold_blast_hits
    agp_obj  -- AGP accessor supplying scaffold lengths

    Output goes to <options.s_table>.scaffold_detail when set, stdout otherwise.
    """
    title = "Detailed Scaffold Stats"
    headers = ["Scaffold", "Num Contigs", "Length", "GC",
               "Coverage(F/J/LR)", "BLAST Hit", "BLAST Covered", "Circular"]
    data = []
    for scaffold in sorted(s_dict.iterkeys()):
        contigs = s_dict[scaffold]
        hit, hit_len = __get_scaffold_blast_hits(contigs, tax_dict, si_dict)
        data.append([
            scaffold,
            len(contigs),
            agp_obj.get_scaffold_length(scaffold),
            __get_scaffold_gc(contigs, si_dict, cg_dict),
            __get_scaffold_coverage(contigs, cov_dict),
            hit,
            hit_len,
            "NA",  # circularity is not determined here
        ])
    table = SimpleTable(headers, data, title)
    output = options.s_table + ".scaffold_detail" if options.s_table else None
    table.print_output(output, options.html)
def reportStats(list_of_classes, pair_count, output):
    """Write a table of read-pair classification counts and percentages.

    list_of_classes -- 8-tuple of counts in the order: unaligned, multimap,
                       cross_chrom, valid_internal, invalid_ln,
                       valid_circular, invalid_orientation, other
    pair_count      -- total number of input pairs (denominator; must be > 0)
    output          -- destination passed through to SimpleTable.print_output

    Improvements over the original: the unused totals (total_valid,
    total_invalid, total) and the commented-out debug print were removed,
    and the repeated percentage formatting was factored into a helper.
    """
    title = "Scaffold Accuracy Stats"
    headers = ['Stat', 'Total', 'Pct']

    (unaligned, multimap, cross_chrom, valid_internal, invalid_ln,
     valid_circular, invalid_orientation, other) = list_of_classes

    def pct(count):
        # Percentage of all input pairs, fixed at 3 decimal places.
        return "%.3f" % ((float(count) / pair_count) * 100)

    data = [
        ["Total Input Pairs", pair_count, "100.0"],
        ["Multiply Mapped", multimap, pct(multimap)],
        ["Unaligned", unaligned, pct(unaligned)],
        ["Cross Chromosome", cross_chrom, pct(cross_chrom)],
        ["Invalid Orientation", invalid_orientation, pct(invalid_orientation)],
        ["Invalid Length", invalid_ln, pct(invalid_ln)],
        ["Valid", valid_internal, pct(valid_internal)],
        ["Valid Circular", valid_circular, pct(valid_circular)],
        ["Other", other, pct(other)],
    ]

    st = SimpleTable(headers, data, title)
    st.print_output(output, 1)
def __print_contig_stats(si_dict, cov_dict, tax_dict):
    """Build and print/write the per-contig detail table.

    For every contig in si_dict the row holds: contig id, parent scaffold
    (or "NA"), length, GC, coverage string, and the BLAST hit/coverage
    strings from __get_blast_strings.  Output goes to
    <options.c_table>.contig_detail when set, stdout otherwise.
    """
    title = "Detailed Contig Stats"
    headers = ["Contig", "Scaffold", "Length", "GC",
               "Coverage(F/J/LR)", "BLAST Hit", "BLAST Covered"]
    data = []
    for contig in sorted(si_dict.iterkeys()):
        info = si_dict[contig]
        hit, covered = __get_blast_strings(contig, tax_dict, info['length'])
        data.append([
            contig,
            info.get('scaffold', "NA"),
            info['length'],
            "%.2f" % info['gc'],
            __get_coverage_string(contig, cov_dict),
            hit,
            covered,
        ])
    table = SimpleTable(headers, data, title)
    output = options.c_table + ".contig_detail" if options.c_table else None
    table.print_output(output, options.html)
def main(): data = [] if len(args) != 1: parser.error("Must supply assembly list file.") sys.exit(1) try: # open file or fail f = open(args[0]) for lines in f.readlines(): data.append(lines.rstrip('\n').split(',')) f.close() # load assemblies and load stats module assemblies = __load_assemblies(data) s = AssemblyStatsUtil(assemblies) #print s.get_stats() title = "Basic Assembly Stats" headers = s.get_assembly_names() data = s.get_stats() st = SimpleTable(headers, data, title) output = None if options.output: output = options.output + ".assembly_stats_comparison" st.print_output(output, options.html) except IOError as (errno, strerror): print "I/O Error({0}): {1}".format(errno, strerror) return -1
def main(): nodes_db, names_db, blast_db = check_dbs(options.nodes_db,options.names_db,options.db) data = __get_blast_data(args[0]) lengths = __get_lengths(args[1]) tax_obj = Taxonomy(nodes_db=nodes_db, names_db=names_db, blast_db=blast_db) gi_nums = __get_gis_from_blast_line(data) blast_hit,hit_lengths = __get_longest_covered_hit(data) gi_tax_dict = tax_obj.get_gi_tax_lookup(gi_nums) title = "Taxonomic Classification of BLAST Hits" headers = ["QueryId","QueryLen","QueryHitLen","PctCovered","TaxonomicString"] t_data = __get_coverage_string(lengths, blast_hit, tax_obj, gi_tax_dict, gi_nums) st = SimpleTable(headers,t_data,title) output = None if options.output: output = options.output + ".blast_hit_taxonomy" st.print_output(output,options.html) if options.hm_data: try: output = open(options.hm_data,'w') except IOError as (errno,strerror): print "I/O Error({0}): {1}".format(errno,strerror) return -1 output.write(__get_heatmap_data(gi_tax_dict, tax_obj, lengths, data, hit_lengths)) output.close()
def __get_details_info(base, extension, title, headers, data_function, coord_obj=None):
    """Render one detail table.

    base/extension -- output file is base+extension; stdout when base is falsy
    data_function  -- either a callable taking coord_obj (when coord_obj is
                      truthy) or the table data itself
    """
    destination = base + extension if base else None
    rows = data_function(coord_obj) if coord_obj else data_function
    SimpleTable(headers, rows, title).print_output(destination, options.html)
def main():
    """Print/write simple BAM QC stats; returns 0 on success.

    Output goes to <options.output>.simple_bam_stats when set, stdout otherwise.
    """
    qc_pass = __check_qc_options(options.qc_pass, options.qc_fail)
    title, headers, rows = __get_table_data(qc_pass)
    table = SimpleTable(headers, rows, title)
    destination = options.output + ".simple_bam_stats" if options.output else None
    table.print_output(destination, options.html)
    return 0
def main():
    """Print/write BAM coverage stats (sequence or physical); returns 0.

    Output goes to <options.output>.bam_<kind>_stats when options.output is
    set, stdout otherwise; <kind> is "phys_cvg" when options.phys_cvg is set,
    else "seq_cvg".

    Fix: the local previously named `type` shadowed the builtin; renamed.
    """
    title, headers, data = __get_cvg_data(options.phys_cvg)
    st = SimpleTable(headers, data, title)
    output = None
    if options.output:
        cvg_type = "phys_cvg" if options.phys_cvg else "seq_cvg"
        output = options.output + ".bam_" + cvg_type + "_stats"
    st.print_output(output, options.html)
    return 0
def main():
    """Compute basic stats for a single assembly and optionally plot
    cumulative contig/scaffold curves.  Returns 0.

    args[0] is the assembly FASTA; naming, size thresholds and chart
    selection come from the module-level `options`.
    """
    name = options.name if options.name else "No_Name"
    assembler = options.assembler if options.assembler else "None_Given"
    asm = Assembly(fasta=args[0], name=name, assembler=assembler,
                   minGapSize=options.minGap, minConSize=options.minContig,
                   minScaffSize=options.minScaffold, agp=options.agp,
                   min_output_gap_size=options.min_output_gap_size)

    stats_util = AssemblyStatsUtil([asm])
    table = SimpleTable(stats_util.get_assembly_names(),
                        stats_util.get_stats(),
                        "Basic Assembly Stats")
    destination = options.output + ".basic_assembly_stats" if options.output else None
    table.print_output(destination, options.html)

    # Gather whichever cumulative curves were requested; one chart per key.
    cumulative_data = {}
    if options.cumulative_c:
        cumulative_data["Cumulative Contigs"] = asm.get_cumulative_contigs()
    if options.cumulative_s:
        cumulative_data["Cumulative Scaffolds"] = asm.get_cumulative_scaffolds()
    if cumulative_data:
        __plot_data(asm, len(cumulative_data), cumulative_data, options.chart_output)
    return 0
def __print_contig_stats(si_dict=None, cov_dict=None, tax_dict=None, most_common_org=None):
    """Build and print/write the extended per-contig detail table.

    Per contig: scaffold, length, GC, coverage, longest-covered and
    best-scoring BLAST hit strings, the most-common-organism flag and any
    sequence annotations from tax_dict.  Output goes to
    <options.c_table>.contig_detail when set, stdout otherwise.

    Fix: the unpacked hit lengths were previously bound to `len`, shadowing
    the builtin; renamed to hit_len/best_len.
    """
    #Contig scaffold, length, gc, cov (frg/jump) blast_hit blast_cvg ambig
    title = "Detailed Contig Stats"
    headers = ["Contig", "Scaffold", "Length", "GC", "Coverage(F/J/LR)",
               "BLAST Hit", "BLAST Covered", "Best BLAST Score", "Best Covered"]
    if most_common_org:
        headers += ["Most Common (" + most_common_org + ")", "SequenceAnnotations"]
    else:
        headers += ["Most Common ( None )", "SequenceAnnotations"]
    data = []
    for c in sorted(si_dict.iterkeys()):
        tmp = [c]
        if 'scaffold' in si_dict[c]:
            tmp.append(si_dict[c]['scaffold'])
        else:
            tmp.append("NA")
        tmp.append(si_dict[c]['length'])
        tmp.append("%.2f" % si_dict[c]['gc'])
        tmp.append(__get_coverage_string(c, cov_dict))
        name, hit_len = __get_blast_strings(c, tax_dict, si_dict[c]['length'],
                                            'genus', 'hit len')
        tmp += [name, hit_len]
        name, best_len = __get_blast_strings(c, tax_dict, si_dict[c]['length'],
                                             'best', 'best len')
        tmp += [name, best_len]
        if most_common_org and tax_dict:
            tmp += [tax_dict[c]['common'], tax_dict[c]['tags']]
        elif tax_dict:
            tmp += ["NA", tax_dict[c]['tags']]
        else:
            tmp += ["NA", "NA"]
        data.append(tmp)
    st = SimpleTable(headers, data, title)
    output = None
    if options.c_table:
        output = options.c_table + ".contig_detail"
    st.print_output(output, options.html)
def print_metrics(data, labels, m_output):
    """Write an 'Insert Size Metrics' table, one row per Picard metric.

    data     -- sequence of metric objects exposing get_metrics_headers()
                and get_metric_value(name) (the latter returns an iterable)
    labels   -- column labels, consumed by __get_header_line
    m_output -- destination passed to SimpleTable.print_output

    Fix: the inner loop previously did `for j in data: ... data[j]...`,
    indexing the list with its own elements (a TypeError for non-int
    entries); it now iterates the entries directly.
    """
    headers = __get_header_line(data, labels)
    title = 'Insert Size Metrics'
    table_data = []
    metrics = data[0].get_metrics_headers() if len(data) > 0 else []
    ordered_metrics = ['PAIR_ORIENTATION', 'READ_PAIRS', 'MEAN_INSERT_SIZE',
                       'STANDARD_DEVIATION', 'MEDIAN_INSERT_SIZE',
                       'MEDIAN_ABSOLUTE_DEVIATION', 'MIN_INSERT_SIZE',
                       'MAX_INSERT_SIZE', 'WIDTH_OF_50_PERCENT',
                       'WIDTH_OF_90_PERCENT', 'SAMPLE', 'LIBRARY', 'READ_GROUP']
    if len(metrics):
        for metric in ordered_metrics:
            tmp = [metric]
            for entry in data:
                for value in entry.get_metric_value(metric):
                    tmp.append(value)
            table_data.append(tmp)
    st = SimpleTable(headers, table_data, title)
    st.print_output(m_output, options.html)
def main():
    """Classify BLAST hits taxonomically (longest-covered, best-scoring and
    most-common-genus views) and write summary tables plus optional
    annotated-BLAST and heat-map outputs.

    Uses module-level `options`/`args` (optparse) and helpers defined
    elsewhere in this file.  Returns -1 if the heat-map file cannot be
    opened.
    """
    nodes_db, names_db, blast_db = check_dbs(options.nodes_db, options.names_db, options.db)
    blast_data = _get_blast_data(args[0])  # dictionary of list of BlastLines
    gi_list = list(_get_gi_set(blast_data))
    lengths = _get_lengths(args[1])  # lengths will be a list of ContigLength objects
    tax_obj = Taxonomy(nodes_db=nodes_db, names_db=names_db,
                       blast_db=blast_db, gi_list=gi_list)
    # Optionally rewrite the BLAST input with taxonomy annotations attached.
    if not options.no_annotate:
        _make_annotated_blast_file(args[0], lengths, blast_data, tax_obj)
    contig_gi_hit_lengths_dict = _get_gi_hits_length(blast_data)
    gi_tax_dict = tax_obj.get_gi_tax_lookup(gi_list)
    gi_name_lookup = tax_obj.get_gi_tax_name_lookup_by_level(
        gi_tax_dict, 'genus')
    #print contig_gi_hit_lengths_dict
    # most_common_genus[0] is the genus name (indexed as such below).
    most_common_genus = _get_most_common_genus(blast_data, gi_name_lookup)
    #print most_common_genus
    blast_hit, hit_lengths = _get_longest_covered_hit(
        blast_data, contig_gi_hit_lengths_dict)
    #print blast_hit, hit_lengths
    blast_score = _get_best_hit_by_score(blast_data)
    #print blast_score
    most_common_genus_hits = _get_most_common_genus_hits(
        blast_data, gi_name_lookup, most_common_genus[0],
        contig_gi_hit_lengths_dict)
    #print most_common_genus_hits
    non_genomic_hits = _find_non_genomic_types(blast_data, tax_obj)
    title = "Taxonomic Classification of BLAST Hits"
    headers = [
        "QueryId", "QueryLen", "LongestHitLen", "LongestPctCovered",
        "LongestHitTaxonomy", "BestScoringHitLen", "BestScoringPctCovered",
        "BestScoringTaxonomy", "MostCommonOrg (" + most_common_genus[0] + ")",
        "SequenceAnnotations"
    ]
    t_data = _get_coverage_string(lengths, blast_hit, tax_obj, gi_tax_dict,
                                  blast_score, most_common_genus_hits,
                                  non_genomic_hits)
    st = SimpleTable(headers, t_data, title)
    output = None
    if options.output:
        output = options.output + ".blast_hit_taxonomy"
    st.print_output(output, options.html)
    # Optional heat-map data file; open failures are reported, not raised.
    if options.hm_data:
        try:
            output = open(options.hm_data, 'w')
        except IOError as (errno, strerror):
            print "I/O Error({0}): {1}".format(errno, strerror)
            return -1
        output.write(
            _get_heatmap_data(gi_tax_dict,
                              tax_obj, lengths, blast_data, hit_lengths))
        output.close()
# NOTE(review): fragment — `counts` and `total` are defined before this
# chunk, and the final add_row call continues past its end; code is
# reproduced as-is with comments only.
distinct = 0
by_count = {}  # copy-number -> how many distinct k-mers occur that many times
for n in counts.itervalues():
    by_count[n] = by_count.get(n, 0) + 1
    distinct += 1
    total += n
cn = by_count.keys()
cn.sort()  # Python 2: keys() returns a list; sort copy numbers ascending
mers = "%d-mers" % options.k
print '#', total, 'total', mers
print '#', distinct, 'distinct', mers
table = SimpleTable(
    ['CopyNumber', 'Count', 'BasePct', 'BaseTotal', 'KmerPct', 'KmerTotal'],
    [], 'Kmer Copy Number (k=' + str(options.k) + ')')
btotal = 0.0  # running cumulative base percentage
ktotal = 0.0  # running cumulative k-mer percentage
for n in cn:
    c = by_count[n]
    bpct = n * c * 100.0 / total
    btotal += bpct
    kpct = c * 100.0 / distinct
    ktotal += kpct
    # NOTE(review): the row continues beyond this chunk boundary.
    table.add_row([
        n, c, "%.2f%%" % bpct, "%.2f%%" % btotal, "%.2f%%" % kpct,
# NOTE(review): fragment — the enclosing loop header (binding `a`, `n`) and
# the initialization of covered/over/novel counters lie before this chunk;
# indentation of the leading `if` was reconstructed (assumed nested — the
# two forms are equivalent for non-negative counts). TODO confirm.
if a:
    distinct_covered += 1
    if a > n:
        over += a - n  # bases covered beyond the reference count

def format_row(label, n, d):
    # One table row: label, count, denominator, and percent of denominator.
    return [label, n, d, "%.2f%%" % (n * 100.0 / d)]

headers = ['Category', 'Count', 'RefCount', 'Pct']
data = []
data.append(format_row('Covered', covered, total))
data.append(format_row('DistinctCovered', distinct_covered, distinct))
data.append(format_row('OverCovered', over, total))
# K-mers present in the assembly but absent from the reference are "novel".
for k, n in assembly_counts.iteritems():
    if k not in reference_counts:
        novel += n
        novel_distinct += 1
data.append(format_row('Novel', novel, total))
data.append(format_row('NovelDistinct', novel_distinct, distinct))
table = SimpleTable(headers, data,
                    'Kmer Coverage (k=' + str(options.k) + ')')
if options.o:
    table.print_output(options.o + ".kmer_coverage", options.html)
else:
    print table.to_table()
def __print_rna_analysis(molecules):
    """Print/write the rRNA analysis detail table and, with
    options.classify, a per-gene taxonomy summary.

    molecules -- dict mapping molecule/gene name -> list of hit objects
    exposing get_molecule/get_genus/get_contig/get_start/get_end/
    get_direction/get_length (and get_taxonomy when classifying).
    Output names derive from options.output.
    """
    # print out the info
    title = 'rRNA Analysis'
    rna_table = []
    rna_table_headers = ['Gene','Genus','Query ID','Query Start','Query End','Hit Direction','Length']
    if options.classify:
        rna_table_headers.append('Taxonomy')
    rna_count = 0      # index of the row currently being filled
    summary_dict = {}  # gene -> {lineage -> hit count}
    gene_count = {}    # gene -> total hits counted for that gene
    for m in sorted(molecules.iterkeys()):
        for i in molecules[m]:
            # NOTE(review): a row is appended for every hit, even when the
            # gene filter below does not match, leaving near-empty rows for
            # non-matching hits — confirm this is intended.
            rna_table.append([])
            gene = i.get_molecule()
            lineage = i.get_genus()
            if options.gene == "all":
                rna_table[rna_count] += [gene, i.get_genus(), i.get_contig(), i.get_start(), i.get_end(), i.get_direction(), i.get_length()]
                if options.classify:
                    # First sighting of this gene: create its summary buckets.
                    if gene not in summary_dict:
                        summary_dict[gene] = {}
                        gene_count[gene] = 0
                    if lineage not in summary_dict[gene]:
                        summary_dict[gene][lineage] = 0
                    summary_dict[gene][lineage] += 1
                    gene_count[gene] += 1
            else:
                if m == options.gene:
                    rna_table[rna_count] += [i.get_molecule(), i.get_genus(), i.get_contig(), i.get_start(), i.get_end(), i.get_direction(), i.get_length()]
                    if options.classify:
                        if gene not in summary_dict:
                            summary_dict[gene] = {}
                            gene_count[gene] = 0
                        if lineage not in summary_dict[gene]:
                            summary_dict[gene][lineage] = 0
                        summary_dict[gene][lineage] += 1
                        gene_count[gene] += 1
            # NOTE(review): placement reconstructed from collapsed source —
            # taxonomy is appended and the row index advanced for every hit,
            # matching the optional 'Taxonomy' header column. TODO confirm.
            if options.classify:
                rna_table[rna_count] += [i.get_taxonomy()]
            rna_count += 1
    st = SimpleTable(rna_table_headers,rna_table,title)
    output = None
    if options.output:
        output = options.output + ".rna_analysis_details"
    st.print_output(output,options.html)
    if options.classify:
        headers, data, title = __summarize_results(summary_dict,gene_count)
        output = None
        if options.output:
            output = options.output + ".rna_analysis_summary"
        st = SimpleTable(headers,data,title)
        st.print_output(output,options.html)
def __write_output_data(agp_file=None, flank_kmer=None, asm_kmer=None, contig_seqs=None, asm_counts=None, cg_ss_counts=None, ue_ss_counts=None):
    """Analyze the sequence flanking captured gaps (CG) and at uncaptured
    scaffold ends (UE): GC, k-mer complexity/distinctness, copy number and
    simple-sequence content.  Writes two tables (gap analysis, simple
    sequence) and a set of histograms named from options.header /
    options.t_header.

    NOTE(review): relies on module-level `options` (e, l, header, t_header,
    html) and helpers analyze_seq / get_simple_sequences / chart_tools.
    """
    #captured gap stats
    cg_seqs = []  # NOTE(review): never populated below
    cg_sizes = []
    cg_dist = []
    cg_cn = []
    cg_gc = []
    #uncaptured end stats
    ue_seqs = []  # NOTE(review): never populated below
    ue_dist = []
    ue_cn = []
    ue_gc = []
    print "Pulling gap end sequence..."
    agp = AgpFile(agp_file)
    scaffolds = agp.get_agp_scaffolds()
    for scaffold in scaffolds:
        print "Scaffold:", scaffold
        for record in agp.get_agp_file_record(scaffold):
            if (not agp.is_gap(scaffold, record)):
                ctg_seq = contig_seqs[agp.get_contig_id(scaffold, record)]
                # Contigs shorter than the flank window cannot be analyzed.
                if (len(ctg_seq) < options.e):
                    print "WARNING:", agp.get_contig_id(
                        scaffold, record), " is less than ", options.e, ". Ignoring."
                    continue
                # First record: its leading options.e bases form an
                # uncaptured scaffold end.
                if (record == 1):
                    #print "\tFirst contig..."
                    #print "SEQ: ", contig_seqs[agp.get_contig_id(scaffold,record)][:options.e]
                    (gc, dist, copy) = analyze_seq(
                        contig_seqs[agp.get_contig_id(scaffold, record)][:options.e],
                        flank_kmer, asm_kmer, asm_counts)
                    ue_dist.append(dist)
                    ue_cn.append(copy)
                    ue_gc.append(gc)
                    ue_ss_counts = get_simple_sequences(
                        ue_ss_counts,
                        contig_seqs[agp.get_contig_id(scaffold, record)][:options.e])
                    #print "UE", ue_ss_counts
                # Last record: its trailing options.e bases form the other
                # uncaptured scaffold end.
                if (record == len(agp.get_agp_file_record(scaffold))):
                    #print "\tLast contig..."
                    #print "SEQ: ", contig_seqs[agp.get_contig_id(scaffold,record)][-options.e:]
                    (gc, dist, copy) = analyze_seq(
                        contig_seqs[agp.get_contig_id(scaffold, record)][-options.e:],
                        flank_kmer, asm_kmer, asm_counts)
                    ue_dist.append(dist)
                    ue_cn.append(copy)
                    ue_gc.append(gc)
                    ue_ss_counts = get_simple_sequences(
                        ue_ss_counts,
                        contig_seqs[agp.get_contig_id(scaffold, record)][-options.e:])
                    #print "UE", ue_ss_counts
            # Gap record: analyze both flanking contig ends and average them.
            if (agp.is_gap(scaffold, record)):
                #print "Gap..."
                left_ctg_record = record - 1
                right_ctg_record = record + 1
                if (len(contig_seqs[agp.get_contig_id(
                        scaffold, left_ctg_record)]) < options.e) or (len(
                            contig_seqs[agp.get_contig_id(
                                scaffold, right_ctg_record)]) < options.e):
                    print "Warning: This gap is flanked by a contig less than", options.e, "long. Skipping analysis."
                    continue
                left_seq = contig_seqs[agp.get_contig_id(
                    scaffold, left_ctg_record)][-options.e:]
                #left_seq = contig_seqs[agp.get_contig_id(scaffold,left_ctg_record)][:options.e]
                #print "LEFT: ", left_seq
                (left_gc, left_dist, left_copy) = analyze_seq(left_seq, flank_kmer,
                                                              asm_kmer, asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, left_seq)
                #print "CG", cg_ss_counts
                #print left_seq
                #print left_gc,left_dist,left_copy
                right_seq = contig_seqs[agp.get_contig_id(
                    scaffold, right_ctg_record)][:options.e]
                #right_seq = contig_seqs[agp.get_contig_id(scaffold,right_ctg_record)][-options.e:]
                #print "RIGHT SEQ: ", right_seq
                (right_gc, right_dist, right_copy) = analyze_seq(right_seq, flank_kmer,
                                                                 asm_kmer, asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, right_seq)
                #print "CG", cg_ss_counts
                #print right_gc,right_dist,right_copy
                cg_sizes.append(agp.get_feature_length(scaffold, record))
                # Average the two flank measurements for this gap.
                cg_dist.append((left_dist + right_dist) / 2)
                cg_gc.append((left_gc + right_gc) / 2)
                cg_cn.append((left_copy + right_copy) / 2)
    headers = ["Metric", "Uncaptured Ends", "Captured Gaps"]
    table = SimpleTable(headers, [], "Gap Analysis Metrics")
    ss_table = SimpleTable([
        "Sequence", "Uncaptured Ends Bases", "Uncaptured Ends Percent",
        "Captured Gap Bases", "Captured Gap Percent"
    ], [], "Gap Simple Sequence Analysis")
    if (len(cg_sizes) > 0):
        chart_tools.gen_histogram(cg_sizes, "Gap Sizes", "Number of Gaps",
                                  "Histogram of Captured Gap Sizes",
                                  options.header + ".cg_sizes")
        chart_tools.gen_histogram(cg_dist, "Gap Flank Complexity", "Number of Gaps",
                                  "Histogram of Gap Flank Complexity",
                                  options.header + ".cg_distinctness")
        chart_tools.gen_histogram(cg_cn, "Gap Flank Copy Number", "Number of Gaps",
                                  "Histogram of Gap Flank Copy Number",
                                  options.header + ".cg_copy_number")
        chart_tools.gen_histogram(cg_gc, "Gap Flank GC", "Number of Gaps",
                                  "Histogram of Gap Flank GC",
                                  options.header + ".cg_gc")
        table.add_row(["Number", len(ue_dist), len(cg_dist)])
        table.add_row([
            "Average Complexity",
            "%.0f" % (sum(ue_dist) / len(ue_dist)),
            "%.0f" % (sum(cg_dist) / len(cg_dist))
        ])
        table.add_row([
            "Less than " + str(options.l) + "% Complex",
            len(filter(lambda x: x < options.l, ue_dist)),
            len(filter(lambda x: x < options.l, cg_dist))
        ])
        table.add_row([
            "Average GC",
            "%.0f" % (sum(ue_gc) / len(ue_gc)),
            "%.0f" % (sum(cg_gc) / len(cg_gc))
        ])
        table.add_row([
            "Less than 30% GC",
            len(filter(lambda x: x < 30, ue_gc)),
            len(filter(lambda x: x < 30, cg_gc))
        ])
        table.add_row([
            "Greater than 70% GC",
            len(filter(lambda x: x > 70, ue_gc)),
            len(filter(lambda x: x > 70, cg_gc))
        ])
        table.add_row([
            "Average Copy Number",
            "%.0f" % (sum(ue_cn) / len(ue_cn)),
            "%.0f" % (sum(cg_cn) / len(cg_cn))
        ])
        ss_table.add_row([
            "End Bases", len(ue_dist) * options.e, "",
            len(cg_dist) * options.e, ""
        ])
        ss_table.add_row([
            "Total SS", sum(ue_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(ue_ss_counts.itervalues()) * 100) /
                        (len(ue_dist) * options.e)),
            sum(cg_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(cg_ss_counts.itervalues()) * 100) /
                        (len(cg_dist) * options.e))
        ])
        for n in cg_ss_counts:
            ss_table.add_row([
                n, ue_ss_counts[n],
                "%.2f%%" % (float(ue_ss_counts[n] * 100) /
                            (len(ue_dist) * options.e)),
                cg_ss_counts[n],
                "%.2f%%" % (float(cg_ss_counts[n] * 100) /
                            (len(cg_dist) * options.e))
            ])
    else:
        # No captured gaps: report UE metrics only, CG columns as N/A.
        print "No captured gaps."
        table.add_row(["Number", len(ue_dist), len(cg_dist)])
        table.add_row([
            "Average Uniqueness",
            "%.0f" % (sum(ue_dist) / len(ue_dist)), "N/A"
        ])
        table.add_row([
            "Less than " + str(options.l) + "% distinct",
            "%.2f" % (len(filter(lambda x: x < options.l, ue_dist))),
            "%.2f" % (len(filter(lambda x: x < options.l, cg_dist)))
        ])
        table.add_row(
            ["Average GC", "%.0f" % (sum(ue_gc) / len(ue_gc)), "N/A"])
        table.add_row([
            "Less than 30% GC",
            len(filter(lambda x: x < 30, ue_gc)),
            len(filter(lambda x: x < 30, cg_gc))
        ])
        table.add_row([
            "Greater than 70% GC",
            len(filter(lambda x: x > 70, ue_gc)),
            len(filter(lambda x: x > 70, cg_gc))
        ])
        table.add_row(
            ["Average Copy Number", "%.0f" % (sum(ue_cn) / len(ue_cn)), "N/A"])
        ss_table.add_row(["End Bases", len(ue_dist) * options.e, "", "NA", ""])
        ss_table.add_row([
            "Total SS", sum(ue_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(ue_ss_counts.itervalues()) * 100) /
                        (len(ue_dist) * options.e)),
            "NA", "NA"
        ])
        for n in cg_ss_counts:
            ss_table.add_row([
                n, ue_ss_counts[n],
                "%.2f%%" % (float(ue_ss_counts[n] * 100) /
                            (len(ue_dist) * options.e)),
                "NA", "NA"
            ])
    # Uncaptured-end histograms and final table output.
    # NOTE(review): indentation was lost in the collapsed source; these are
    # placed at function level (they apply whether or not captured gaps were
    # found) — TODO confirm against the original file.
    chart_tools.gen_histogram(ue_dist, "End Distinctness",
                              "Number of Uncaptured Ends",
                              "Histogram of End Complexity",
                              options.header + ".ue_distinctness")
    chart_tools.gen_histogram(ue_cn, "End Copy Number",
                              "Number of Uncaptured Ends",
                              "Histogram of End Copy Number",
                              options.header + ".ue_copy_number")
    chart_tools.gen_histogram(ue_gc, "End GC",
                              "Number of Uncaptured Ends",
                              "Histogram of End GC",
                              options.header + ".ue_gc")
    table.print_output(options.t_header + ".gap_analysis", options.html)
    ss_table.print_output(options.t_header + ".gap_ss_analysis", options.html)