seqs = dict( [(s.id,s) for s in SeqIO.parse(reference_file,'fasta')] ) summaries = dict(( (name,{'variants':[]}) for name in seqs.keys())) ''' GATK variants ''' print >>sys.stderr, "[ Reading GATK variants ]" vlines = [l.strip('\n') for l in open('%s/GATK/snps.gatk.vcf' % job_path,'rU') if not l.startswith('#')] for l in vlines: v = Variant.from_vcf(l) v.caller = 'gatk' summaries[v.chrom]['variants'].append(v) ''' PacBio variants ''' print >>sys.stderr, "[ Reading GenCons variants ]" glines = [l.strip('\n') for l in gzip.open('%s/data/variants.gff.gz' % job_path,'rb') if not l.startswith('#')] for l in glines: v = Variant.from_gff(l) v.caller = 'gencons' summaries[v.chrom]['variants'].append(v) ''' coverage variants ''' print >>sys.stderr, "[ Reading coverage variants ]" covdata = parse_covdepth('%s/GATK/covdepth' % job_path) covvars = {} for ref in covdata.keys(): assert ref in summaries, "Error: ref %s is not in summaries" % ref result = find_variants(covdata[ref],seqs[ref],ref,exclude_edges=True,exclude_overlaps=True) summaries[ref]['mean_cov'] = result['mean_cov'] summaries[ref]['pct_cov'] = result['pct_cov'] if 'variants' in result: covvars[ref] = {'variants':[]} for v in result['variants']: