def run(self):
    """Patch VCF variants into the reference genome, writing one FASTA per sample.

    For every sample in the VCF, each chromosome of the reference is rewritten
    with that sample's called variants substituted in, and appended to
    <workspace>/<sample>.fa.  Only haploid, [ACGTN]* variants are supported.
    """
    workspace = self.get_workspace()
    reference = reference_directory.Reference(self.reference, must_exist=True)

    # Load all VCF records into memory, grouped by chromosome,
    # then sorted by position within each chromosome.
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)
    variants = collections.defaultdict(list)
    for record in reader:
        variants[record.CHROM].append(record)
    reader_f.close()

    for chrom in variants:
        variants[chrom].sort(key=lambda item: item.POS)

    # One output FASTA per sample.  Truncate any existing files up front,
    # because sequences are appended ('ab') chromosome by chromosome below.
    filenames = [workspace / (item + '.fa') for item in reader.samples]
    for filename in filenames:
        with open(filename, 'wb'):
            pass

    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        for i, sample in enumerate(reader.samples):
            # Build the patched sequence as alternating chunks of
            # untouched reference and substituted variant alleles.
            revised = []
            pos = 0  # next reference position not yet copied (0-based)
            for variant in variants[name]:
                gt = variant.samples[i].data.GT
                if gt is None:
                    # No call for this sample at this site: leave reference as-is.
                    continue
                assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                gt_number = int(gt)
                if gt_number == 0:
                    # Genotype 0 is the reference allele.
                    var_seq = variant.REF
                else:
                    # ALT is 1-based in the genotype numbering.
                    var_seq = str(variant.ALT[gt_number - 1])
                    assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                # VCF POS is 1-based; convert to 0-based.
                new_pos = variant.POS - 1
                assert new_pos >= pos, 'Variants overlap.'
                revised.append(seq[pos:new_pos])
                pos = new_pos
                revised.append(var_seq)
                # Sanity check the VCF against the reference before skipping REF.
                assert seq[pos:pos + len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                pos += len(variant.REF)
            revised.append(seq[pos:])

            with open(filenames[i], 'ab') as f:
                io.write_fasta(f, name, ''.join(revised))

        # Mark this chromosome as consumed (defaultdict: chromosomes absent
        # from the VCF are created empty here and immediately removed).
        del variants[name]

    # Anything left over names a chromosome the reference does not have.
    assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
def run(self):
    """Report summary statistics for each input file.

    Recognizes sequence files (count, total/average length), annotation
    files (feature counts by type) and VCF files (variant count); raises
    grace.Error for anything unrecognized.
    """
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)
        # Renamed from 'any', which shadowed the builtin of the same name.
        recognized = False
        name = os.path.splitext(os.path.split(filename)[1])[0]

        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                # Guarded so an empty file doesn't divide by zero.
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            recognized = True

        if info.matches('annotations'):
            total = 0
            counts = {}
            for item in annotation.read_annotations(filename, "/"):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            recognized = True

        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            # Bug fix: the original never closed this handle.
            reader_f.close()
            print >> f, grace.datum(name, 'variants', n)
            recognized = True

        if not recognized:
            raise grace.Error('Don\'t know what to do with ' + filename)
    self.end_output(f)
def run(self):
    """Synthetic end-to-end test: simulate reads carrying variants, run the
    analysis/freebayes/filter/patch pipeline, and log whether the reference
    was correctly patched to the primary variant.
    """
    workspace = self.get_workspace()

    read_length = 100

    # Random flanking sequence either side of the reference allele.  The base
    # immediately adjacent to the allele is forced to differ from the allele's
    # terminal base so the variant's boundaries are unambiguous.
    left = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]:
            break
    left += flank

    right = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]:
            break
    right = flank + right

    # Write simulated reads covering each requested variant.
    # Bug fix: the original reused `i` both as the enumerate index and as the
    # running read counter, so the counter was clobbered (reset to the
    # iteration index) on every new variant; the enumerate index was unused.
    read_number = 0
    variants_used = []
    with open(workspace / 'reads.fq', 'wb') as f:
        for variant in self.variants:
            # A variant spec may carry a multiplicity suffix, e.g. 'ACGx5'.
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append((variant, count))
            seq = left + variant + right
            for j in xrange(count):
                # Choose a start so the read always overlaps the variant.
                pos = len(variant) + random.randrange(read_length - len(variant))
                read = seq[pos:pos + read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                read_number += 1
                io.write_fastq(f, 'read_%s_%d' % (variant, read_number),
                               read, chr(64 + 30) * len(read))

    reference = left + self.ref + right
    # The first listed variant is the one the pipeline is expected to call.
    primary_variant = left + variants_used[0][0] + right
    with open(workspace / 'reference.fa', 'wb') as f:
        io.write_fasta(f, 'chr1', reference)

    legion.remake_needed()

    self.analysis(
        workspace / 'sample',
        workspace / 'reference.fa',
        reads=[workspace / 'reads.fq'],
    ).run()
    self.freebayes(
        workspace / 'freebayes',
        workspace / 'sample',
    ).run()
    self.vcf_filter(
        workspace / 'filtered',
        workspace / 'freebayes.vcf',
    ).run()
    Vcf_patch(
        workspace / 'patch',
        workspace / ('sample', 'reference'),
        workspace / 'filtered.vcf',
    ).run()

    patched = io.read_sequences(workspace / ('patch', 'sample.fa')).next()[1]
    masked = io.read_sequences(
        workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

    with open(workspace / 'freebayes.vcf', 'rU') as f:
        raw_count = len(list(vcf.Reader(f)))

    with open(workspace / 'filtered.vcf', 'rU') as f:
        # Bug fix: the original opened filtered.vcf a second time here (and
        # never closed that handle) instead of using the file opened by this
        # `with` block.
        filtered_count = len(list(vcf.Reader(f)))

    with open(workspace / ('sample', 'report.txt'), 'rb') as f:
        nesoni_count = len(f.readlines()) - 1  # subtract header line

    self.log.log('\n')
    self.log.datum(workspace.name, 'changes found by "nesoni consensus:"', nesoni_count)
    self.log.datum(workspace.name, 'is correctly patched by "nesoni consensus:"',
                   masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name, 'raw variants', raw_count)
    self.log.datum(workspace.name, 'variants after filtering', filtered_count)
    self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                   patched == primary_variant)
    self.log.log('\n')
def run(self):
    """N-way variant comparison: select and sort samples by tag, keep VCF
    records where at least two genotypes differ, then emit the result in the
    requested format (table / nexus / splitstree / vcf).
    """
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    # 'sampleTags' metadata lines are comma separated: the first field is the
    # sample name, the whole list (name included) is its tag set.
    tags = {}
    for item in reader.metadata.get('sampleTags', []):
        parts = item.split(',')
        tags[parts[0]] = parts

    # A synthetic 'reference' sample is prepended below, so the name is reserved.
    assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'
    samples = ['reference'] + reader.samples

    # Samples without explicit tags get themselves plus 'all' as tags.
    for sample in samples:
        if sample not in tags:
            tags[sample] = [sample, 'all']

    samples = selection.select_and_sort(
        self.select, self.sort, samples, lambda sample: tags[sample])

    # Indices (into the selected/sorted sample list) that must have a genotype
    # call for a record to be kept.
    required = [i for i, sample in enumerate(samples)
                if selection.matches(self.require, tags[sample])]

    # Map sample name -> column index in the VCF.
    sample_number = dict((b, a) for a, b in enumerate(reader.samples))

    items = []
    for record in reader:
        variants = get_variants(record)
        genotypes = []
        counts = []
        qualities = []
        for sample in samples:
            if sample == 'reference':
                # The reference "sample" is always a confident homozygous ref call.
                genotypes.append([0])
                counts.append([1])
                qualities.append(float('inf'))
            else:
                genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                qualities.append(record.samples[sample_number[sample]].data.GQ)

        # Only output when there are at least two genotypes
        any_interesting = False
        for i in xrange(len(genotypes)):
            for j in xrange(i):
                if (genotypes[i] is not None and genotypes[j] is not None
                        and not genotypes_equal(genotypes[i], genotypes[j])):
                    any_interesting = True
                    break
            if any_interesting:
                break
        if not any_interesting:
            continue

        # Drop records missing a call in any required sample.
        if any(genotypes[i] is None for i in required):
            continue

        # Optionally drop records where any called allele is not a single base.
        if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
            continue

        # Apply the snpEff-effect filter; an empty description list is treated
        # as a single blank entry so the filter still gets a chance to match.
        snpeff = snpeff_describe(record.INFO.get('EFF', ''))
        if not any(selection.matches(self.snpeff_filter, item[1])
                   for item in (snpeff or [('', [])])):
            continue

        items.append(_Nway_record(
            variants=variants, genotypes=genotypes, counts=counts,
            qualities=qualities, snpeff=snpeff, record=record))

    self.log.log('%d variants\n\n' % len(items))

    if self.as_ == 'table':
        self._write_table(samples, items)
    elif self.as_ == 'nexus':
        self._write_nexus(samples, items)
    elif self.as_ == 'splitstree':
        # Write the nexus file, then have SplitsTree render an SVG from it.
        self._write_nexus(samples, items)
        io.execute(
            'SplitsTree +g -i INPUT -x COMMAND',
            no_display=True,
            INPUT=self.prefix + '.nex',
            COMMAND='UPDATE; '
            'SAVE FILE=\'%s.nex\' REPLACE=yes; '
            'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
            'QUIT' % (self.prefix, self.prefix, len(items)),
        )
    elif self.as_ == 'vcf':
        self._write_vcf(samples, items, reader)
    else:
        raise grace.Error('Unknown output format: ' + self.as_)
def run(self):
    """Filter and recall genotypes in a VCF.

    Each sample call is passed through self._modify_sample; records where no
    sample ends up with a non-blank, non-reference genotype are dropped.
    Kept records go to <prefix>.vcf, which is then indexed.
    """
    if self.dirichlet:
        assert self.ploidy == 1, 'Dirichlet mode is not available for ploidy > 1'

    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)
    writer = vcf.Writer(open(self.prefix + '.vcf', 'wb'), reader)

    n = 0
    n_kept = 0
    for record in reader:
        n += 1
        variants = get_variants(record)

        # A record is kept only if, after modification, at least one sample
        # has a genotype that is neither blank nor homozygous reference.
        # (Renamed from 'any', which shadowed the builtin; also removed a
        # large block of dead commented-out code.)
        any_called = False
        for sample in record.samples:
            self._modify_sample(variants, sample)
            any_called = any_called or (
                sample.data.GT != self._blank_gt()
                and sample.data.GT != self._reference_gt())

        if self.dirichlet:
            # Record quality becomes the capped sum of per-sample genotype
            # qualities.
            record.QUAL = min(
                MAX_QUALITY,
                sum(sample.data.GQ for sample in record.samples))

        if any_called:
            writer.write_record(record)
            n_kept += 1

    writer.close()
    reader_f.close()

    self.log.datum('variants', 'input', n)
    self.log.datum('variants', 'kept', n_kept)

    index_vcf(self.prefix + '.vcf')