cn.sort() mers = "%d-mers" % options.k print '#', total, 'total', mers print '#', distinct, 'distinct', mers table = SimpleTable( ['CopyNumber', 'Count', 'BasePct', 'BaseTotal', 'KmerPct', 'KmerTotal'], [], 'Kmer Copy Number (k=' + str(options.k) + ')') btotal = 0.0 ktotal = 0.0 for n in cn: c = by_count[n] bpct = n * c * 100.0 / total btotal += bpct kpct = c * 100.0 / distinct ktotal += kpct table.add_row([ n, c, "%.2f%%" % bpct, "%.2f%%" % btotal, "%.2f%%" % kpct, "%.2f%%" % ktotal ]) if options.o: table.print_output(options.o + ".kmer_copy_number", options.html) else: print table.to_table()
def _gap_stats_mean(values, missing="N/A"):
    """Return the mean of *values* formatted with "%.0f", or *missing* if empty."""
    if not values:
        return missing
    # float() keeps Python 2 from truncating the average of integer inputs.
    return "%.0f" % (float(sum(values)) / len(values))


def _gap_stats_count(values, predicate):
    """Return how many items of *values* satisfy *predicate*."""
    return sum(1 for value in values if predicate(value))


def _gap_stats_percent(bases, total_bases, missing="NA"):
    """Return *bases* as a "%.2f%%" share of *total_bases*, or *missing* if zero."""
    if not total_bases:
        return missing
    return "%.2f%%" % (float(bases * 100) / total_bases)


def __write_output_data(agp_file=None,
                        flank_kmer=None,
                        asm_kmer=None,
                        contig_seqs=None,
                        asm_counts=None,
                        cg_ss_counts=None,
                        ue_ss_counts=None):
    """Collect gap-flank and scaffold-end statistics and write the reports.

    For every scaffold in the AGP file, the first ``options.e`` bases of
    record 1 and the last ``options.e`` bases of the final record are analysed
    as "uncaptured ends"; for every gap record, the trailing bases of the
    preceding contig and the leading bases of the following contig are analysed
    as "captured gap" flanks.  Contigs shorter than ``options.e`` (and gaps
    flanked by such contigs) are skipped with a warning.

    Results are written as two tables (``options.t_header`` +
    ".gap_analysis" / ".gap_ss_analysis") and as histograms named from
    ``options.header``.

    Args:
        agp_file: Passed to ``AgpFile`` to load the scaffold layout.
        flank_kmer, asm_kmer, asm_counts: Forwarded unchanged to
            ``analyze_seq`` for each flank sequence.
        contig_seqs: Mapping from contig id to a sliceable sequence.
        cg_ss_counts: Running simple-sequence counts for captured-gap flanks;
            threaded through ``get_simple_sequences``.
        ue_ss_counts: Same, for uncaptured ends.

    NOTE(review): reads the module-level ``options`` object (``e``, ``l``,
    ``header``, ``t_header``, ``html``) - presumably parsed command-line
    options; confirm against the caller.  Record ids are assumed to be
    1-based consecutive integers, as implied by the ``record == 1`` and
    ``record +/- 1`` arithmetic.
    """
    end_len = options.e

    # captured gap stats
    cg_sizes = []
    cg_dist = []
    cg_cn = []
    cg_gc = []

    # uncaptured end stats
    ue_dist = []
    ue_cn = []
    ue_gc = []

    # Single-argument print(...) emits the same text under the Python 2 print
    # statement used by the rest of the file and also parses under Python 3.
    print("Pulling gap end sequence...")
    agp = AgpFile(agp_file)
    for scaffold in agp.get_agp_scaffolds():
        print("Scaffold: %s" % scaffold)
        records = agp.get_agp_file_record(scaffold)
        last_record = len(records)
        for record in records:
            if agp.is_gap(scaffold, record):
                left_ctg = contig_seqs[agp.get_contig_id(scaffold, record - 1)]
                flank_too_short = len(left_ctg) < end_len
                if not flank_too_short:
                    # Looked up lazily so a short left flank skips the gap
                    # without touching the right-hand record.
                    right_ctg = contig_seqs[agp.get_contig_id(
                        scaffold, record + 1)]
                    flank_too_short = len(right_ctg) < end_len
                if flank_too_short:
                    print("Warning: This gap is flanked by a contig less than "
                          "%s long. Skipping analysis." % end_len)
                    continue

                left_seq = left_ctg[-end_len:]
                (left_gc, left_dist, left_copy) = analyze_seq(
                    left_seq, flank_kmer, asm_kmer, asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, left_seq)

                right_seq = right_ctg[:end_len]
                (right_gc, right_dist, right_copy) = analyze_seq(
                    right_seq, flank_kmer, asm_kmer, asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, right_seq)

                cg_sizes.append(agp.get_feature_length(scaffold, record))
                # 2.0 avoids integer truncation of the two-flank average.
                cg_dist.append((left_dist + right_dist) / 2.0)
                cg_gc.append((left_gc + right_gc) / 2.0)
                cg_cn.append((left_copy + right_copy) / 2.0)
            else:
                contig_id = agp.get_contig_id(scaffold, record)
                ctg_seq = contig_seqs[contig_id]
                if len(ctg_seq) < end_len:
                    print("WARNING: %s  is less than  %s . Ignoring." %
                          (contig_id, end_len))
                    continue

                # A single-contig scaffold contributes both of its ends.
                end_seqs = []
                if record == 1:
                    end_seqs.append(ctg_seq[:end_len])
                if record == last_record:
                    end_seqs.append(ctg_seq[-end_len:])
                for end_seq in end_seqs:
                    (gc, dist, copy_num) = analyze_seq(
                        end_seq, flank_kmer, asm_kmer, asm_counts)
                    ue_dist.append(dist)
                    ue_cn.append(copy_num)
                    ue_gc.append(gc)
                    ue_ss_counts = get_simple_sequences(ue_ss_counts, end_seq)

    table = SimpleTable(["Metric", "Uncaptured Ends", "Captured Gaps"], [],
                        "Gap Analysis Metrics")
    ss_table = SimpleTable([
        "Sequence", "Uncaptured Ends Bases", "Uncaptured Ends Percent",
        "Captured Gap Bases", "Captured Gap Percent"
    ], [], "Gap Simple Sequence Analysis")

    have_gaps = len(cg_sizes) > 0
    if have_gaps:
        chart_tools.gen_histogram(cg_sizes, "Gap Sizes", "Number of Gaps",
                                  "Histogram of Captured Gap Sizes",
                                  options.header + ".cg_sizes")
        chart_tools.gen_histogram(cg_dist, "Gap Flank Complexity",
                                  "Number of Gaps",
                                  "Histogram of Gap Flank Complexity",
                                  options.header + ".cg_distinctness")
        chart_tools.gen_histogram(cg_cn, "Gap Flank Copy Number",
                                  "Number of Gaps",
                                  "Histogram of Gap Flank Copy Number",
                                  options.header + ".cg_copy_number")
        chart_tools.gen_histogram(cg_gc, "Gap Flank GC", "Number of Gaps",
                                  "Histogram of Gap Flank GC",
                                  options.header + ".cg_gc")
        average_label = "Average Complexity"
        low_label = "Less than " + str(options.l) + "% Complex"
    else:
        print("No captured gaps.")
        # Row labels differ between the two cases in the existing reports;
        # they are kept as-is so downstream consumers see the same names.
        average_label = "Average Uniqueness"
        low_label = "Less than " + str(options.l) + "% distinct"

    def is_low_complexity(value):
        return value < options.l

    def is_low_gc(value):
        return value < 30

    def is_high_gc(value):
        return value > 70

    # _gap_stats_mean yields "N/A" for an empty list, which covers both the
    # "no captured gaps" column and the "no uncaptured ends" column without
    # dividing by zero.
    table.add_row(["Number", len(ue_dist), len(cg_dist)])
    table.add_row(
        [average_label,
         _gap_stats_mean(ue_dist),
         _gap_stats_mean(cg_dist)])
    table.add_row([
        low_label,
        _gap_stats_count(ue_dist, is_low_complexity),
        _gap_stats_count(cg_dist, is_low_complexity)
    ])
    table.add_row(
        ["Average GC",
         _gap_stats_mean(ue_gc),
         _gap_stats_mean(cg_gc)])
    table.add_row([
        "Less than 30% GC",
        _gap_stats_count(ue_gc, is_low_gc),
        _gap_stats_count(cg_gc, is_low_gc)
    ])
    table.add_row([
        "Greater than 70% GC",
        _gap_stats_count(ue_gc, is_high_gc),
        _gap_stats_count(cg_gc, is_high_gc)
    ])
    table.add_row([
        "Average Copy Number",
        _gap_stats_mean(ue_cn),
        _gap_stats_mean(cg_cn)
    ])

    # Tolerate counters that were never populated (still None).
    ue_counts = ue_ss_counts if ue_ss_counts is not None else {}
    cg_counts = cg_ss_counts if cg_ss_counts is not None else {}
    ue_bases = len(ue_dist) * end_len
    cg_bases = len(cg_dist) * end_len

    def captured_cells(bases):
        # Captured-gap columns are reported as "NA" when no gap was analysed.
        if not have_gaps:
            return ["NA", "NA"]
        return [bases, _gap_stats_percent(bases, cg_bases)]

    ss_table.add_row([
        "End Bases", ue_bases, "", cg_bases if have_gaps else "NA", ""
    ])

    ue_total = sum(ue_counts.values())
    ss_table.add_row(
        ["Total SS", ue_total,
         _gap_stats_percent(ue_total, ue_bases)] +
        captured_cells(sum(cg_counts.values())))

    # Report every sequence seen in either counter: captured-gap keys first
    # (original order), then any that only occur in uncaptured ends.
    ss_names = list(cg_counts)
    ss_names.extend(name for name in ue_counts if name not in cg_counts)
    for name in ss_names:
        ue_n = ue_counts.get(name, 0)
        ss_table.add_row(
            [name, ue_n, _gap_stats_percent(ue_n, ue_bases)] +
            captured_cells(cg_counts.get(name, 0)))

    if ue_dist:
        chart_tools.gen_histogram(ue_dist, "End Distinctness",
                                  "Number of Uncaptured Ends",
                                  "Histogram of End Complexity",
                                  options.header + ".ue_distinctness")
        chart_tools.gen_histogram(ue_cn, "End Copy Number",
                                  "Number of Uncaptured Ends",
                                  "Histogram of End Copy Number",
                                  options.header + ".ue_copy_number")
        chart_tools.gen_histogram(ue_gc, "End GC",
                                  "Number of Uncaptured Ends",
                                  "Histogram of End GC",
                                  options.header + ".ue_gc")
    else:
        # Mirrors the captured-gap guard: nothing to plot.
        print("No uncaptured ends.")

    table.print_output(options.t_header + ".gap_analysis", options.html)
    ss_table.print_output(options.t_header + ".gap_ss_analysis", options.html)