#!/usr/bin/env python
"""Print the N50 of the contig lengths recorded in FASTA header lines.

Every header line is expected to have exactly three whitespace-separated
fields after the leading '>' (``>NAME LENGTH EXTRA``); the second field is
parsed as the contig length.  A header with a different number of fields
raises ValueError.
"""
import argparse
import os

from libkuleshov.stats import n50

##############################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--contigs', required=True,
                    help='FASTA file whose headers carry the contig lengths')
args = parser.parse_args()

##############################################################################

# Read the file directly rather than shelling out to `cat FILE | grep '>'`:
# no unquoted path is handed to a shell and no extra processes are spawned.
lengths = []
with open(args.contigs) as contigs:
    for line in contigs:
        # Only header lines carry a length; sequence lines are skipped.
        if not line.startswith('>'):
            continue
        # line[1:] drops the leading '>' before splitting into fields.
        _name, length, _unk = line[1:].strip().split()
        lengths.append(int(length))

# Single-argument print(...) produces the same output on Python 2 and 3.
print('N50: %s' % (n50(lengths),))
# and ctg_lengths[o[0]] > 700 and ctg_lengths[o[1]] > 700}

# Keep only the edges whose count is at least 3, then intersect them with the
# reference set of overlaps.
overlaps = {e for e, c in edge_counts.items() if c >= 3}
validated_overlaps = overlaps & true_overlaps
print('%d %d %d' % (len(overlaps), len(validated_overlaps),
                    len(true_overlaps)))

# validated_overlaps = true_overlaps

# compute connected components
G = nx.Graph()
G.add_nodes_from(fasta.references)
for i, ovl1 in enumerate(validated_overlaps):
    # Progress report every 1000 overlaps.
    if i % 1000 == 0:
        print('%d/%d' % (i, len(validated_overlaps)))

    ol = list(ovl1)
    if len(ol) > 2:
        # Raised explicitly (instead of `assert False`) so the check still
        # runs under `python -O` and reports the offending overlap.
        raise AssertionError(
            'overlap %r has more than two members' % (ol,))
    elif len(ol) < 2:
        # Fewer than two members: no edge to add.
        # NOTE(review): presumably a contig paired with itself -- confirm.
        continue

    G.add_edge(ol[0], ol[1])

connected_components = nx.connected_components(G)
cc_list = list(connected_components)

# Total contig length of each connected component.
component_lengths = [
    sum(ctg_lengths[ctg] for ctg in component) for component in cc_list
]

print(sum(component_lengths))
print(n50(component_lengths))
"""Print the N50 and the total length of the contigs in a FASTA file.

Usage: pass the FASTA path as the first command-line argument.

The lengths are computed by a shell pipeline (tr, sed, perl), so those tools
must be available.  Headers are matched by the pattern ``>ctg<digits>``; any
other header text is not removed and would be counted as sequence.
"""
import os
import sys

try:
    from shlex import quote  # Python 3
except ImportError:
    from pipes import quote  # Python 2

from libkuleshov.stats import n50

if len(sys.argv) < 2:
    sys.exit('usage: %s CONTIGS_FASTA' % sys.argv[0])

# Pipeline steps:
#   tr   - delete every newline so the whole file becomes one line
#   sed  - replace each '>ctg<digits>' header with a newline
#   perl - print the length of each resulting line
PIPELINE_TEMPLATE = (
    "cat %s | tr -d '\\n' | sed -re 's/>ctg([0-9])+/\\n/g' "
    "| perl -nle 'print length'"
)

# Build the command once (it is both echoed and executed) and quote the path
# so names containing spaces or shell metacharacters are passed through
# intact.  Ordinary paths are left unchanged by quote().
command = PIPELINE_TEMPLATE % quote(sys.argv[1])
print(command)

# The context manager closes the pipe once all lengths have been read.
with os.popen(command) as fasta_length_pipe:
    fasta_lengths = [int(line) for line in fasta_length_pipe]

total_len = sum(fasta_lengths)

print('N50: %d (%d total)' % (n50(fasta_lengths), total_len))
assert False elif len(ol) < 2: continue G.add_edge(ol[0], ol[1]) connected_components = nx.connected_components(G) cc_list = [cc for cc in connected_components] pickle.dump(cc_list, open('connected_components.pkl', 'wb')) component_lengths = [ sum([ctg_lengths[ctg] for ctg in component]) for component in cc_list ] print sum(fasta.lengths) print sum(component_lengths) print n50(component_lengths) # ---------------------------------------------------------------------------- # compute overlaps def myalign(s1, s2): return align.globalms(s1, s2, 2, -1, -2, -0.2, penalize_end_gaps=False)[0] def ctgs_overlap(ctg1, ctg2): head_seq1 = ctg_seq[ctg1][:100] tail_seq1 = ctg_seq[ctg1][-100:] head_seq2 = ctg_seq[ctg2][:100] tail_seq2 = ctg_seq[ctg2][-100:]
"""Print the N50 and the total length of the contigs in a FASTA file.

Usage: pass the FASTA path as the first command-line argument.

The lengths are computed by a shell pipeline (tr, sed, perl), so those tools
must be available.  Headers are matched by the pattern ``>ctg<digits>``; any
other header text is not removed and would be counted as sequence.
"""
import os
import sys

try:
    from shlex import quote  # Python 3
except ImportError:
    from pipes import quote  # Python 2

from libkuleshov.stats import n50

if len(sys.argv) < 2:
    sys.exit('usage: %s CONTIGS_FASTA' % sys.argv[0])

# Pipeline steps:
#   tr   - delete every newline so the whole file becomes one line
#   sed  - replace each '>ctg<digits>' header with a newline
#   perl - print the length of each resulting line
PIPELINE_TEMPLATE = (
    "cat %s | tr -d '\\n' | sed -re 's/>ctg([0-9])+/\\n/g' "
    "| perl -nle 'print length'"
)

# Build the command once (it is both echoed and executed) and quote the path
# so names containing spaces or shell metacharacters are passed through
# intact.  Ordinary paths are left unchanged by quote().
command = PIPELINE_TEMPLATE % quote(sys.argv[1])
print(command)

# The context manager closes the pipe once all lengths have been read.
with os.popen(command) as fasta_length_pipe:
    fasta_lengths = [int(line) for line in fasta_length_pipe]

total_len = sum(fasta_lengths)

print('N50: %d (%d total)' % (n50(fasta_lengths), total_len))
#!/usr/bin/env python
"""Report the count, average length and N50 of the intervals in a BED file.

The file is loaded with ``read_bed``; its result is used as a mapping from a
contig key to an iterable of ``(start, end)`` pairs.
"""
import argparse
import sys

from libkuleshov.stats import n50
from libkuleshov.fastx import read_bed

##############################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--contigs', required=True,
                    help='BED file listing the contig intervals')
args = parser.parse_args()

##############################################################################

bed = read_bed(args.contigs)

# NOTE(review): `end - start + 1` treats (start, end) as a closed interval.
# Standard BED coordinates are half-open -- confirm what read_bed returns.
lengths = [
    end - start + 1
    for ctg in bed.keys()
    for (start, end) in bed[ctg]
]

print('Contigs: %d' % len(lengths))

# Without any interval the average would divide by zero; stop with a clear
# message instead of a traceback.
if not lengths:
    sys.exit('No intervals found in %s; cannot compute average or N50.'
             % args.contigs)

# '%s' keeps the str() rendering that the original print statement used.
print('Average length: %s' % (float(sum(lengths)) / len(lengths),))
print('N50 length: %s' % (n50(lengths),))