예제 #1
0
    def run(self):
        """Write one patched FASTA file per VCF sample into the workspace.

        All records of ``self.vcf`` are grouped by chromosome and sorted by
        position.  Each sequence of the reference is then rewritten once per
        sample, substituting the allele chosen by that sample's genotype at
        every variant site, and appended to ``<sample>.fa`` in the workspace.
        Variants whose genotype is missing (``None``) leave the reference
        sequence untouched for that sample.

        Raises:
            AssertionError: for a genotype that is not a plain integer
                (only haploid genotypes are supported), an ALT allele made
                of anything other than A/C/G/T/N, overlapping variants, a
                REF column that disagrees with the reference sequence, or
                VCF chromosome names that do not occur in the reference.
        """
        workspace = self.get_workspace()

        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        # Group the records by chromosome.  try/finally guarantees the handle
        # is released even when the VCF is malformed and parsing raises.
        variants = collections.defaultdict(list)
        reader_f = io.open_possibly_compressed_file(self.vcf)
        try:
            reader = vcf.Reader(reader_f)
            for record in reader:
                variants[record.CHROM].append(record)
        finally:
            reader_f.close()

        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)

        filenames = [workspace / (item + '.fa') for item in reader.samples]
        # Truncate every output first; sequences are appended one at a time.
        for filename in filenames:
            with open(filename, 'wb'):
                pass

        plain_bases = re.compile('[ACGTN]*$')

        for name, seq in io.read_sequences(
                reference.reference_fasta_filename()):
            # pop() with a default instead of index-then-del: a VCF without
            # any sample columns never touched the defaultdict entry, so the
            # former ``del variants[name]`` raised KeyError for it.
            chrom_variants = variants.pop(name, [])

            for i, filename in enumerate(filenames):
                revised = []
                pos = 0  # 0-based offset into seq of the first uncopied base
                for variant in chrom_variants:
                    gt = variant.samples[i].data.GT
                    if gt is None:
                        continue
                    assert gt.isdigit(
                    ), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number - 1])
                        assert plain_bases.match(
                            var_seq), 'Unsupported variant type: ' + var_seq

                    # VCF positions are 1-based.
                    new_pos = variant.POS - 1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos + len(variant.REF)].upper(
                    ) == variant.REF, 'REF column in VCF does not match reference sequence'
                    # Skip the reference bases that the allele replaces.
                    pos += len(variant.REF)
                revised.append(seq[pos:])

                with open(filename, 'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

        # Whatever is left was never matched against a reference sequence.
        assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(
            variants)
예제 #2
0
    def run(self):
        """Write summary statistics for every file in ``self.filenames``.

        Depending on what ``io.get_file_info`` reports for a file, the output
        receives sequence counts and lengths, per-type annotation feature
        counts, and/or the number of VCF records.  A file may match more than
        one category.

        Raises:
            grace.Error: if a file matches none of the known categories.
        """
        f = self.begin_output()

        for filename in self.filenames:
            info = io.get_file_info(filename)

            # Named to avoid shadowing the builtin any().
            recognized = False

            name = os.path.splitext(os.path.basename(filename))[0]

            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                recognized = True

            if info.matches('annotations'):
                total = 0
                counts = {}
                for item in annotation.read_annotations(filename, "/"):
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                recognized = True

            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                # The handle used to be left open; release it even if the
                # VCF parser raises part-way through.
                try:
                    n = sum(1 for _ in vcf.Reader(reader_f))
                finally:
                    reader_f.close()
                print >> f, grace.datum(name, 'variants', n)
                recognized = True

            if not recognized:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
예제 #3
0
    def run(self):
        """Simulate reads for a set of variants and compare two pipelines.

        A random reference is built as ``left + self.ref + right``.  For each
        entry of ``self.variants`` (``SEQ`` or ``SEQxCOUNT``, default count
        10) reads of length 100 are sampled from ``left + SEQ + right`` and
        written to ``reads.fq``.  The reads are then run through
        ``self.analysis``, ``self.freebayes``, ``self.vcf_filter`` and
        ``Vcf_patch``, and the log records whether each route reproduces the
        first variant's sequence.
        """
        workspace = self.get_workspace()

        read_length = 100

        # The base adjoining the variant must differ from the reference's
        # first/last base.
        left = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]:
                break
        left += flank

        right = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]:
                break
        right = flank + right

        variants_used = []

        # Running counter over ALL reads.  It used to share the name of the
        # enumerate() index of the outer loop, which reset it for every
        # variant and could produce duplicate read names.
        read_number = 0

        with open(workspace / 'reads.fq', 'wb') as f:
            for variant in self.variants:
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append((variant, count))
                seq = left + variant + right
                for _ in xrange(count):
                    pos = len(variant) + random.randrange(read_length -
                                                          len(variant))
                    read = seq[pos:pos + read_length]
                    # Half of the reads come from the reverse strand.
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    read_number += 1
                    io.write_fastq(f, 'read_%s_%d' % (variant, read_number),
                                   read, chr(64 + 30) * len(read))

        reference = left + self.ref + right
        primary_variant = left + variants_used[0][0] + right

        with open(workspace / 'reference.fa', 'wb') as f:
            io.write_fasta(f, 'chr1', reference)

        legion.remake_needed()

        self.analysis(
            workspace / 'sample',
            workspace / 'reference.fa',
            reads=[workspace / 'reads.fq'],
        ).run()

        self.freebayes(
            workspace / 'freebayes',
            workspace / 'sample',
        ).run()

        self.vcf_filter(
            workspace / 'filtered',
            workspace / 'freebayes.vcf',
        ).run()

        Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
                  workspace / 'filtered.vcf').run()

        patched = next(
            io.read_sequences(workspace / ('patch', 'sample.fa')))[1]

        masked = next(
            io.read_sequences(
                workspace / ('sample', 'consensus_masked.fa')))[1].upper()

        with open(workspace / 'freebayes.vcf', 'rU') as f:
            raw_count = len(list(vcf.Reader(f)))

        # Count through the handle managed by ``with``; the previous code
        # opened the file a second time and never closed that handle.
        with open(workspace / 'filtered.vcf', 'rU') as f:
            filtered_count = len(list(vcf.Reader(f)))

        with open(workspace / ('sample', 'report.txt'), 'rb') as f:
            # Every line but the first counts as one change
            # (first line presumably a header -- TODO confirm).
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                       nesoni_count)
        self.log.datum(workspace.name,
                       'is correctly patched by "nesoni consensus:"',
                       masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name, 'raw variants', raw_count)
        self.log.datum(workspace.name, 'variants after filtering',
                       filtered_count)
        self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                       patched == primary_variant)
        self.log.log('\n')
예제 #4
0
    def run(self):
        """Compare the genotypes of several samples and write the result.

        Reads ``self.vcf``, adds a pseudo-sample called ``reference`` with
        genotype ``[0]``, selects and sorts the samples according to
        ``self.select`` / ``self.sort``, and keeps every record for which

        * at least two called genotypes differ,
        * every sample matching ``self.require`` has a genotype,
        * all called alleles have length one, if ``self.only_snps`` is set,
        * some snpEff annotation matches ``self.snpeff_filter``.

        The kept records are written in the format named by ``self.as_``
        (``table``, ``nexus``, ``splitstree`` or ``vcf``).

        Raises:
            AssertionError: if the VCF has a sample named ``reference``.
            grace.Error: if ``self.as_`` is not a known output format.
        """
        reader_f = io.open_possibly_compressed_file(self.vcf)
        # The handle was previously never closed.  It is kept open until the
        # very end because the reader is still handed to _write_vcf().
        try:
            reader = vcf.Reader(reader_f)

            tags = {}
            for item in reader.metadata.get('sampleTags', []):
                parts = item.split(',')
                tags[parts[0]] = parts

            assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

            samples = ['reference'] + reader.samples

            # Samples without a sampleTags entry get a default tag list.
            for sample in samples:
                if sample not in tags:
                    tags[sample] = [sample, 'all']

            samples = selection.select_and_sort(self.select, self.sort,
                                                samples,
                                                lambda sample: tags[sample])

            required = [
                i for i, sample in enumerate(samples)
                if selection.matches(self.require, tags[sample])
            ]

            sample_number = dict((b, a) for a, b in enumerate(reader.samples))

            items = []
            for record in reader:
                variants = get_variants(record)
                genotypes = []
                counts = []
                qualities = []
                for sample in samples:
                    if sample == 'reference':
                        genotypes.append([0])
                        counts.append([1])
                        qualities.append(float('inf'))
                    else:
                        call = record.samples[sample_number[sample]]
                        genotypes.append(get_genotype(call))
                        counts.append(get_variant_counts(call))
                        qualities.append(call.data.GQ)

                # Only output when at least two called genotypes differ.
                any_interesting = any(
                    genotypes[i] is not None and genotypes[j] is not None
                    and not genotypes_equal(genotypes[i], genotypes[j])
                    for i in xrange(len(genotypes)) for j in xrange(i))
                if not any_interesting:
                    continue

                if any(genotypes[i] is None for i in required):
                    continue

                if self.only_snps and any(genotype is not None and any(
                        len(variants[i]) != 1 for i in genotype)
                                          for genotype in genotypes):
                    continue

                snpeff = snpeff_describe(record.INFO.get('EFF', ''))
                # A record without annotations is tested once against an
                # empty tag list.
                if not any(
                        selection.matches(self.snpeff_filter, item[1])
                        for item in (snpeff or [('', [])])):
                    continue

                items.append(
                    _Nway_record(variants=variants,
                                 genotypes=genotypes,
                                 counts=counts,
                                 qualities=qualities,
                                 snpeff=snpeff,
                                 record=record))

            self.log.log('%d variants\n\n' % len(items))

            if self.as_ == 'table':
                self._write_table(samples, items)
            elif self.as_ == 'nexus':
                self._write_nexus(samples, items)
            elif self.as_ == 'splitstree':
                self._write_nexus(samples, items)

                io.execute(
                    'SplitsTree +g -i INPUT -x COMMAND',
                    no_display=True,
                    INPUT=self.prefix + '.nex',
                    COMMAND='UPDATE; '
                    'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                    'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                    'QUIT' % (self.prefix, self.prefix, len(items)),
                )
            elif self.as_ == 'vcf':
                self._write_vcf(samples, items, reader)

            else:
                raise grace.Error('Unknown output format: ' + self.as_)
        finally:
            reader_f.close()
예제 #5
0
    def run(self):
        """Filter ``self.vcf`` into ``<prefix>.vcf`` and index the result.

        Every sample of every record is passed through
        ``self._modify_sample``.  A record is written only if, afterwards, at
        least one sample has a genotype that is neither the blank nor the
        reference genotype.  In Dirichlet mode the record quality is replaced
        by the sum of the sample GQ values, capped at ``MAX_QUALITY``.  The
        number of records read and kept is logged.

        Raises:
            AssertionError: if Dirichlet mode is requested with a ploidy
                other than 1.
        """
        if self.dirichlet:
            assert self.ploidy == 1, 'Dirichlet mode is not available for ploidy > 1'

        n = 0
        n_kept = 0

        # Nested try/finally so that both the output and the input are
        # closed (in the original order) even when a record fails to process.
        reader_f = io.open_possibly_compressed_file(self.vcf)
        try:
            reader = vcf.Reader(reader_f)
            writer = vcf.Writer(open(self.prefix + '.vcf', 'wb'), reader)
            try:
                for record in reader:
                    n += 1
                    variants = get_variants(record)

                    # Named to avoid shadowing the builtin any().
                    keep = False

                    for sample in record.samples:
                        # Must run for every sample, even once the record is
                        # already known to be kept.
                        self._modify_sample(variants, sample)

                        if not keep:
                            gt = sample.data.GT
                            keep = (gt != self._blank_gt()
                                    and gt != self._reference_gt())

                    if self.dirichlet:
                        record.QUAL = min(
                            MAX_QUALITY,
                            sum(sample.data.GQ for sample in record.samples))

                    if keep:
                        writer.write_record(record)
                        n_kept += 1
            finally:
                writer.close()
        finally:
            reader_f.close()

        self.log.datum('variants', 'input', n)
        self.log.datum('variants', 'kept', n_kept)

        index_vcf(self.prefix + '.vcf')