"""Render a quality-score plot for every run listed in a run-details file.

Usage: pass the run-details file as the first command-line argument.

For each run, ``qual.py`` is invoked with the read file under ``reads/``,
the file type matching the run's sequencing technology and a plot title;
whatever it writes to standard output is stored as
``images/<RunID>_qual.png``.
"""
import os
import subprocess
import sys

from runutils import read_run_details

# File-type argument handed to qual.py for each supported technology.
FILETYPES = {
    "454": 'sff',
    "MiSeq": 'fastq-illumina',
    "Ion Torrent": 'fastq',
}

runs = read_run_details(sys.argv[1])
for run in runs:
    print(run['RunID'])

    try:
        filetype = FILETYPES[run['Tech']]
    except KeyError:
        # Abort with a non-zero exit status so callers can detect the failure.
        raise SystemExit("unsupported format")

    title = "%s - %s - Quality Scores" % (run['RunID'], run['Tech'])

    # Pass the arguments as a list and redirect stdout ourselves instead of
    # going through a shell: file names, run IDs and titles containing
    # spaces, quotes or shell metacharacters can no longer break (or inject
    # into) the command line.
    with open("images/%s_qual.png" % run['RunID'], "wb") as image:
        status = subprocess.call(
            ["python", "qual.py", "reads/%s" % run['Filename'], filetype, title],
            stdout=image)

    if status != 0:
        # Keep processing the remaining runs, but do not hide the failure.
        sys.stderr.write("qual.py exited with status %d for run %s\n"
                         % (status, run['RunID']))
import pysam
import sys

from runutils import read_run_details
from Bio import SeqIO

# Reference records from the FASTA file given as the second command-line
# argument, keyed by record id.
reference = {}
for rec in SeqIO.parse(sys.argv[2], "fasta"):
    reference[rec.id] = rec


def has_masked(s):
    """Return how many characters of `s` are lower-case.

    The result is used as a truth value: non-zero means the sequence
    contains lower-case (presumably soft-masked) bases.
    """
    return sum(1 for c in s if c.islower())


def _aligned_reference(samfile, read):
    """Return the reference sequence covered by `read` as a plain string."""
    record = reference[samfile.getrname(read.tid)]
    return str(record[read.pos : read.pos + read.alen].seq)


MINIMUM_MAPPING_QUALITY = 1

print("sample\tref\trid\tmapped\tmapq\tinsertions\tl_insertions\tdeletions\tl_deletions\trlen")

samples = read_run_details(sys.argv[1])
for sample in samples:
    mapped = 0
    unmapped = 0
    samfile = pysam.Samfile(sample['Path'], "rb")
    id = 1
    for read in samfile:
        # A read counts as unmapped when it is flagged unmapped, maps below
        # the quality threshold, or overlaps lower-case reference bases.
        # The checks short-circuit in this order, so the reference lookup
        # only happens for reads that passed the first two tests.
        if (read.is_unmapped
                or read.mapq < MINIMUM_MAPPING_QUALITY
                or has_masked(_aligned_reference(samfile, read))):
            unmapped += 1
            print("%s\t%s\t%s\t0\t0\t0\t0\t0\t0\t%s" % (
                sample['Description'], sample['Reference'], id, read.qlen))
        else:
            mapped += 1
return Stats(contig_lengths, n_vals) def stats(seq_name, fh, fmt): h = get_stats(seq_name, fh, fmt) print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (len(h.contig_lengths), min(h.contig_lengths), max(h.contig_lengths), sum(h.contig_lengths), sum(h.contig_lengths) / len(h.contig_lengths), h.n_vals[0.5], h.n_vals[0.75], h.n_vals[0.9]) if __name__ == "__main__": x = read_run_details(sys.argv[1]) try: filter = sys.argv[2] except: filter = None print "Centre\tRunID\tStrain\tTech\tNotes\tReads\tMin\tMax\tSum\tAvg\tN50\tN75\tN90" for r in x: if filter and filter != r['RunID']: continue print "%s\t%s\t%s\t%s\t%s\t" % (r['Centre'], r['RunID'], r['Strain'], r['Tech'], r['Notes']), if r['Tech'] == '454': fmt = 'sff' else:
"""Print a tab-separated table of selected summary metrics per assembly.

Usage: pass the assemblies file as the first command-line argument and
the summary (run-details style) file as the second.
"""
import sys

from runutils import read_assemblies, read_run_details, hashit

assemblies = read_assemblies(sys.argv[1])
summaries = hashit(read_run_details(sys.argv[2]), 'Name')

# Presumably the full set of columns available in each summary record:
#Name NumContigs NumRefReplicons NumAssemblyBases NumReferenceBases NumLCBs DCJ_Distance NumDCJBlocks NumSNPs NumMisCalled NumUnCalled NumGapsRef NumGapsAssembly TotalBasesMissed PercBasesMissed ExtraBases PercExtraBases MissingChromosomes ExtraContigs NumSharedBoundaries NumInterLcbBoundaries BrokenCDS IntactCDS ContigN50 ContigN90 MinContigLength MaxContigLength AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT

# Columns copied from the summary record into the output table, in order.
fields = ['NumContigs', 'NumAssemblyBases', 'MaxContigLength', 'ContigN50',
          'NumLCBs', 'NumGapsRef', 'NumGapsAssembly', 'PercBasesMissed']

print("Sample" + "\t" + "Assembler" + "\t" + "\t".join(fields))

for a in assemblies:
    # Summaries may be keyed with or without a '.fas' suffix; try the
    # suffixed name first.  Only a missing key triggers the fallback, so
    # unrelated errors and Ctrl-C are no longer swallowed.
    # NOTE(review): assumes hashit() returns a dict-like object that raises
    # KeyError for unknown names -- confirm against runutils.
    try:
        s = summaries[a['Name'] + '.fas']
    except KeyError:
        s = summaries[a['Name']]

    print("%s\t%s\t%s" % (a['Desc'], a['AssemblySoftware'],
                          "\t".join([s[f] for f in fields])))