Exemplo n.º 1
0
    def run(self):
        '''Run the recall evaluation from start to finish.

        If self.filter_and_cluster_vcf is set, the truth and query VCF files
        are first filtered and clustered. Then each variant plus flanking
        sequence is written to FASTA, the query sequences are mapped to the
        truth sequences, the mappings are summarised in self.sam_summary,
        and the stats file and GT_CONF histogram file are written.
        '''
        if self.filter_and_cluster_vcf:
            EvaluateRecall._filter_vcf_for_clustering(self.truth_vcf_file, self.filtered_truth_vcf, self.discard_ref_calls)
            EvaluateRecall._filter_vcf_for_clustering(self.query_vcf_file, self.filtered_query_vcf, self.discard_ref_calls)

            # 'simple' merging when ref calls are discarded, otherwise 'gt_aware'
            merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
            clusterers = [
                vcf_clusterer.VcfClusterer([filtered_vcf], ref_fasta, clustered_vcf, merge_method=merge_method, max_distance_between_variants=self.merge_length)
                for filtered_vcf, ref_fasta, clustered_vcf in (
                    (self.filtered_query_vcf, self.query_vcf_ref, self.clustered_vcf_query),
                    (self.filtered_truth_vcf, self.truth_vcf_ref, self.clustered_vcf_truth),
                )
            ]
            # Both clusterers are made before either is run: query first, then truth
            for clusterer in clusterers:
                clusterer.run()

        _, truth_records = vcf_file_read.vcf_file_to_dict(self.vcf_to_check_truth, sort=True, remove_useless_start_nucleotides=True)
        query_header, query_records = vcf_file_read.vcf_file_to_dict(self.vcf_to_check_query, sort=True, remove_useless_start_nucleotides=True)
        # NOTE: the sample name comes from the query VCF header and is not
        # used again in this method
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(query_header)
        if sample_name is None:
            sample_name = 'sample'

        truth_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.truth_vcf_ref, truth_ref_seqs)
        query_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.query_vcf_ref, query_ref_seqs)

        # Truth sequences are written with ref_only=True; the query sequences
        # are the ones mapped to them
        EvaluateRecall._write_vars_plus_flanks_to_fasta(self.seqs_out_truth, truth_records, truth_ref_seqs, self.flank_length, ref_only=True)
        EvaluateRecall._write_vars_plus_flanks_to_fasta(self.seqs_out_query, query_records, query_ref_seqs, self.flank_length, number_ns=self.number_ns)
        EvaluateRecall._map_seqs_to_seqs(self.seqs_out_query, self.seqs_out_truth, self.sam_file_out)

        # From here on self.vcf_to_check_query is the name with '.gz' appended
        EvaluateRecall._index_vcf(self.vcf_to_check_query)
        self.vcf_to_check_query += ".gz"
        EvaluateRecall._parse_sam_files(
            self.vcf_to_check_truth,
            self.sam_file_out,
            self.vcf_to_check_query,
            self.sam_summary,
            self.flank_length,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions=self.exclude_regions,
            max_soft_clipped=self.max_soft_clipped,
            number_ns=self.number_ns,
        )
        stats, gt_conf_hist = EvaluateRecall._gather_stats(self.sam_summary)
        # NOTE: no intermediate files (FASTA files, VCFs to check) are
        # removed by this method

        # Stats file: one line of column names, one line of values
        with open(self.stats_out, 'w') as f_stats:
            column_names = stats.keys()
            print(*column_names, sep='\t', file=f_stats)
            print(*[stats[name] for name in column_names], sep='\t', file=f_stats)

        # GT_CONF histogram file, sorted by GT_CONF
        with open(self.gt_conf_hist_out, 'w') as f_hist:
            print('GT_CONF\tCount', file=f_hist)
            for gt_conf, count in sorted(gt_conf_hist.items()):
                print(gt_conf, count, sep='\t', file=f_hist)
Exemplo n.º 2
0
    def test_vcf_file_to_dict(self):
        '''test vcf_file_to_dict'''
        expected_header = ['# header1', '# header2']
        lines = [
            'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81',
            'ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82',
            'ref_43\t43\tid_foo\tT\tG,*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.83',
            'ref_43\t44\tid_foo\tT\t*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.84',
        ]

        # First two lines belong to ref_42, the remaining three to ref_43
        all_records = [vcf_record.VcfRecord(line) for line in lines]
        expected_records = {
            'ref_42': all_records[:2],
            'ref_43': all_records[2:],
        }

        # The plain and gzipped files are expected to give the same results
        for basename in ('vcf_file_to_dict.vcf', 'vcf_file_to_dict.vcf.gz'):
            infile = os.path.join(data_dir, basename)
            got_header, got_records = vcf_file_read.vcf_file_to_dict(infile)
            self.assertEqual(expected_records, got_records)
            self.assertEqual(expected_header, got_header)

        # With remove_asterisk_alts=True we expect the record whose only ALT
        # is '*' to be gone, and the '*' removed from the 'G,*' record
        del expected_records['ref_43'][-1]
        expected_records['ref_43'][-1].remove_asterisk_alts()
        infile = os.path.join(data_dir, 'vcf_file_to_dict.vcf')
        got_header, got_records = vcf_file_read.vcf_file_to_dict(infile, remove_asterisk_alts=True)
        self.assertEqual(expected_records, got_records)
        self.assertEqual(expected_header, got_header)
Exemplo n.º 3
0
 def _parse_sam_files(cls, truth_vcf_file, samfile, query_vcf_file, outfile, flank_length, allow_mismatches=True, exclude_regions=None, max_soft_clipped=3, number_ns=0):
     '''Creates a tsv file with one row for each record of the truth VCF file.

     Input is the truth VCF file of sites we are searching for, the SAM
     file made by _map_seqs_to_seqs(), and the query VCF file. The SAM file
     and the query VCF file are parsed by _parse_sam_file_and_vcf(), which
     must return one result per truth record.

     Each row of outfile has the POS, REF and first ALT of the truth record,
     followed by the query_found, query_conf, query_allele, query_match_flag
     and query_allele_correct values for that record.
     '''
     _, truth_records = vcf_file_read.vcf_file_to_dict(
         truth_vcf_file,
         sort=True,
         homozygous_only=False,
         remove_asterisk_alts=True,
         remove_useless_start_nucleotides=True,
     )

     # One flat list of records, in the order they are stored in the dict
     flat_records = [
         record
         for ref_name in truth_records
         for record in truth_records[ref_name]
     ]
     positions = [record.POS for record in flat_records]
     ref_alleles = [record.REF for record in flat_records]
     first_alts = [record.ALT[0] for record in flat_records]

     (
         query_found,
         query_conf,
         query_allele,
         query_match_flag,
         query_allele_flag,
     ) = EvaluateRecall._parse_sam_file_and_vcf(
         samfile,
         query_vcf_file,
         flank_length,
         allow_mismatches,
         exclude_regions,
         max_soft_clipped,
         number_ns,
     )
     assert len(positions) == len(query_found)

     columns = {
         'id': positions,
         'ref': ref_alleles,
         'alt': first_alts,
         'query_found': query_found,
         'query_conf': query_conf,
         'query_allele': query_allele,
         'query_match_flag': query_match_flag,
         'query_allele_correct': query_allele_flag,
     }
     pd.DataFrame(columns).to_csv(outfile, sep='\t')
Exemplo n.º 4
0
    def _filter_vcf_for_clustering(cls, infile, outfile, discard_ref_calls=True):
        '''Writes to outfile the records of VCF file infile that are kept
        for clustering. The header lines are copied unchanged.

        A record is skipped if any of these are true:
        - MISMAPPED_UNPLACEABLE is in its FILTER
        - it has no FORMAT or no GT (a warning is logged)
        - its REF is "." or ""
        - its genotype has more than one distinct allele, or contains "."
        - discard_ref_calls is True and its genotype is the ref allele
        - its ALT (after picking the called allele) is "." or ""

        For each record that is kept:
        - if there is more than one ALT, then only the called ALT is kept
          and GT is set to 1/1 (or the first ALT and GT 0/0 for a ref call)
        - a GT of "0" or "1" is changed to "0/0" or "1/1"
        - if there is no GT_CONF, it is added using GL if present,
          otherwise using 100 * SupportFraction from INFO if present

        Raises Error if the called allele cannot be used to pick an ALT.'''
        header_lines, vcf_records = vcf_file_read.vcf_file_to_dict(infile, sort=True, homozygous_only=False, remove_asterisk_alts=True, remove_useless_start_nucleotides=True)

        with open(outfile, 'w') as f:
            print(*header_lines, sep='\n', file=f)
            for ref_name in vcf_records:
                for record in vcf_records[ref_name]:
                    if 'MISMAPPED_UNPLACEABLE' in record.FILTER:
                        continue
                    if record.FORMAT is None or 'GT' not in record.FORMAT:
                        logging.warning('No GT in vcf record:' + str(record))
                        continue
                    if record.REF in [".", ""]:
                        continue

                    genotype = record.FORMAT['GT']
                    genotypes = genotype.split('/')
                    called_alleles = set(genotypes)
                    if len(called_alleles) != 1 or (discard_ref_calls and called_alleles == {'0'}) or '.' in called_alleles:
                        continue

                    if len(record.ALT) > 1:
                        if called_alleles != {'0'}:
                            record.set_format_key_value('GT', '1/1')
                            # Only catch the errors that a bad genotype can
                            # cause here: not an integer, or an integer that
                            # is out of range for the ALT list. A bare except
                            # would also catch KeyboardInterrupt and SystemExit
                            try:
                                record.ALT = [record.ALT[int(genotypes[0]) - 1]]
                            except (ValueError, IndexError) as err:
                                raise Error('BAD VCf line:' + str(record)) from err
                        else:
                            record.set_format_key_value('GT', '0/0')
                            record.ALT = [record.ALT[0]]
                    if record.ALT[0] in [".", ""]:
                        continue

                    # Haploid style genotype is written as homozygous diploid
                    if record.FORMAT['GT'] == '0':
                        record.FORMAT['GT'] = '0/0'
                    elif record.FORMAT['GT'] == '1':
                        record.FORMAT['GT'] = '1/1'

                    if 'GL' in record.FORMAT and 'GT_CONF' not in record.FORMAT:
                        likelihoods = record.FORMAT['GL'].split(',')
                        assert len(likelihoods) > 2
                        if called_alleles == {'0'}:
                            # ref call: first likelihood minus the second
                            gt_conf = float(likelihoods[0]) - float(likelihoods[1])
                        else:
                            # likelihood at the called allele's index minus the first
                            gt_conf = float(likelihoods[int(genotypes[0])]) - float(likelihoods[0])
                        record.set_format_key_value('GT_CONF', str(gt_conf))
                    if 'SupportFraction' in record.INFO and 'GT_CONF' not in record.FORMAT:
                        record.set_format_key_value('GT_CONF',
                                                    str(float(record.INFO['SupportFraction'])*100))
                    print(record, file=f)
    def test_write_vars_plus_flanks_to_fasta2(self):
        '''test _write_vars_plus_flanks_to_fasta using sample2a.vcf'''
        ref_fasta = os.path.join(data_dir, 'vcfref.fa')
        vcf_in = os.path.join(data_dir, 'sample2a.vcf')
        expected_fasta = os.path.join(data_dir, 'sample2a.plusflanks.fa')
        got_fasta = 'tmp.write_vars_plus_flanks_to_fasta.out.2.fa'

        _, records = vcf_file_read.vcf_file_to_dict(
            vcf_in, sort=True, remove_useless_start_nucleotides=True)
        ref_seqs = {}
        pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)

        # Flank length of 5. The final 1 is passed by position
        dnadiff_mapping_based_verifier.DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            got_fasta, records, ref_seqs, 5, 1)

        # Compare file contents, not just the os.stat() signatures
        self.assertTrue(filecmp.cmp(expected_fasta, got_fasta, shallow=False))
        os.unlink(got_fasta)
Exemplo n.º 6
0
    def test_write_vars_plus_flanks_to_fasta_ref(self):
        '''test _write_vars_plus_flanks_to_fasta with ref_only=True'''
        ref_fasta = os.path.join(data_dir, 'vcfref.fa')
        vcf_in = os.path.join(data_dir, 'sample1a.vcf')
        expected_fasta = os.path.join(data_dir, 'sample1a.plusflanks.fa')
        got_fasta = 'tmp.write_vars_plus_flanks_to_fasta.ref.fa'

        _, records = vcf_file_read.vcf_file_to_dict(
            vcf_in, sort=True, remove_useless_start_nucleotides=True)
        ref_seqs = {}
        pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)

        # Flank length of 5
        evaluate_recall.EvaluateRecall._write_vars_plus_flanks_to_fasta(
            got_fasta, records, ref_seqs, 5, ref_only=True)

        # Compare file contents, not just the os.stat() signatures
        self.assertTrue(filecmp.cmp(expected_fasta, got_fasta, shallow=False))
        os.unlink(got_fasta)
Exemplo n.º 7
0
    def _filter_input_file_for_clustering(cls, infile, outfile):
        """Writes to outfile the records of VCF file infile that have a
        genotype call that is not ref. The header lines are copied unchanged.

        Records are skipped if MISMAPPED_UNPLACEABLE is in FILTER, if there
        is no GT (a warning is logged), if the genotype is ref, or if the
        genotype contains ".". Otherwise GT and ALT are rewritten:
        - one distinct allele called: GT is 1/1 and ALT is the called allele
        - ref and one alt called: GT is 0/1 and ALT is the called alt
        - two alts called: GT is 1/2 and ALT is the two called alts"""
        header_lines, records_by_ref = vcf_file_read.vcf_file_to_dict(
            infile,
            sort=True,
            homozygous_only=False,
            remove_asterisk_alts=True,
            remove_useless_start_nucleotides=True,
        )
        with open(outfile, "w") as f_out:
            print(*header_lines, sep="\n", file=f_out)
            for ref_name in records_by_ref:
                for record in records_by_ref[ref_name]:
                    if "MISMAPPED_UNPLACEABLE" in record.FILTER:
                        continue
                    if record.FORMAT is None or "GT" not in record.FORMAT:
                        logging.warning("No GT in vcf record:" + str(record))
                        continue

                    gt_fields = record.FORMAT["GT"].split("/")
                    called_alleles = set(gt_fields)
                    if "." in called_alleles or called_alleles == {"0"}:
                        continue

                    # Allele numbers in ascending order, so that a ref
                    # allele (0) is always first if it is there
                    allele_numbers = sorted(int(x) for x in gt_fields)

                    if len(called_alleles) != 1:
                        assert len(called_alleles) == 2
                        record.set_format_key_value("GT", "0/1")
                        if 0 in allele_numbers:
                            record.set_format_key_value("GT", "0/1")
                            new_alts = [record.ALT[allele_numbers[1] - 1]]
                        else:
                            record.set_format_key_value("GT", "1/2")
                            new_alts = [
                                record.ALT[allele_numbers[0] - 1],
                                record.ALT[allele_numbers[1] - 1],
                            ]
                    else:
                        assert 0 not in allele_numbers
                        record.set_format_key_value("GT", "1/1")
                        new_alts = [record.ALT[allele_numbers[0] - 1]]

                    record.ALT = new_alts
                    print(record, file=f_out)
Exemplo n.º 8
0
    def _filter_input_file_for_clustering(cls, infile, outfile):
        '''Writes to outfile the records of VCF file infile that have a
        genotype call that is not ref. The header lines are copied unchanged.

        Records are skipped if MISMAPPED_UNPLACEABLE is in FILTER, if there
        is no GT (a warning is logged), if the genotype is ref, or if the
        genotype contains '.'. Otherwise GT and ALT are rewritten:
        - one distinct allele called: GT is 1/1 and ALT is the called allele
        - ref and one alt called: GT is 0/1 and ALT is the called alt
        - two alts called: GT is 1/2 and ALT is the two called alts'''
        header_lines, records_by_ref = vcf_file_read.vcf_file_to_dict(
            infile,
            sort=True,
            homozygous_only=False,
            remove_asterisk_alts=True,
            remove_useless_start_nucleotides=True)
        with open(outfile, 'w') as f_out:
            print(*header_lines, sep='\n', file=f_out)
            for ref_name in records_by_ref:
                for record in records_by_ref[ref_name]:
                    if 'MISMAPPED_UNPLACEABLE' in record.FILTER:
                        continue
                    if record.FORMAT is None or 'GT' not in record.FORMAT:
                        logging.warning('No GT in vcf record:' + str(record))
                        continue

                    gt_fields = record.FORMAT['GT'].split('/')
                    called_alleles = set(gt_fields)
                    if '.' in called_alleles or called_alleles == {'0'}:
                        continue

                    # Allele numbers in ascending order, so that a ref
                    # allele (0) is always first if it is there
                    allele_numbers = sorted(int(x) for x in gt_fields)

                    if len(called_alleles) != 1:
                        assert len(called_alleles) == 2
                        record.set_format_key_value('GT', '0/1')
                        if 0 in allele_numbers:
                            record.set_format_key_value('GT', '0/1')
                            new_alts = [record.ALT[allele_numbers[1] - 1]]
                        else:
                            record.set_format_key_value('GT', '1/2')
                            new_alts = [
                                record.ALT[allele_numbers[0] - 1],
                                record.ALT[allele_numbers[1] - 1]
                            ]
                    else:
                        assert 0 not in allele_numbers
                        record.set_format_key_value('GT', '1/1')
                        new_alts = [record.ALT[allele_numbers[0] - 1]]

                    record.ALT = new_alts
                    print(record, file=f_out)
Exemplo n.º 9
0
    def test_snps_file_file_to_unmerged_vcf(self):
        """test _snps_file_file_to_unmerged_vcf"""
        ref_fa = "tmp.test_snps_file_file_to_unmerged_vcf.ref.fa"
        qry_fa = "tmp.test_snps_file_file_to_unmerged_vcf.qry.fa"
        ref_seqs, qry_seqs, expected_vcf_records, expected_regions = write_fasta_files(
            ref_fa, qry_fa)
        outprefix = "tmp.test_snps_file_file_to_unmerged_vcf.dnadiff"
        vcf_out = outprefix + ".out.vcf"
        dnadiff.Dnadiff._run_dnadiff(ref_fa, qry_fa, outprefix)
        dnadiff.Dnadiff._snps_file_file_to_unmerged_vcf(
            outprefix + ".snps", qry_seqs, vcf_out)
        header, got = vcf_file_read.vcf_file_to_dict(vcf_out)
        # assertEqual, not assertTrue: the second argument of assertTrue is
        # the failure message, so assertTrue(expected, got) compares nothing
        # and passes whenever expected is not empty
        self.assertEqual(expected_vcf_records, got)

        dnadiff.Dnadiff.clean_dnadiff_files(outprefix)
        os.unlink(ref_fa)
        os.unlink(qry_fa)
        os.unlink(vcf_out)
Exemplo n.º 10
0
    def test_snps_file_file_to_unmerged_vcf(self):
        '''test _snps_file_file_to_unmerged_vcf'''
        ref_fa = 'tmp.test_snps_file_file_to_unmerged_vcf.ref.fa'
        qry_fa = 'tmp.test_snps_file_file_to_unmerged_vcf.qry.fa'
        ref_seqs, qry_seqs, expected_vcf_records, expected_regions = write_fasta_files(
            ref_fa, qry_fa)
        outprefix = 'tmp.test_snps_file_file_to_unmerged_vcf.dnadiff'
        vcf_out = outprefix + '.out.vcf'
        dnadiff.Dnadiff._run_dnadiff(ref_fa, qry_fa, outprefix)
        dnadiff.Dnadiff._snps_file_file_to_unmerged_vcf(
            outprefix + '.snps', qry_seqs, vcf_out)
        header, got = vcf_file_read.vcf_file_to_dict(vcf_out)
        # assertEqual, not assertTrue: the second argument of assertTrue is
        # the failure message, so assertTrue(expected, got) compares nothing
        # and passes whenever expected is not empty
        self.assertEqual(expected_vcf_records, got)

        dnadiff.Dnadiff.clean_dnadiff_files(outprefix)
        os.unlink(ref_fa)
        os.unlink(qry_fa)
        os.unlink(vcf_out)
Exemplo n.º 11
0
def add_qc_to_vcf(infile, outfile, want_ref_calls=False):
    """Annotates VCF file with QC info needed for calculating precision and recall.

    Loads the records from infile, annotates the list of records of each
    CHROM using _annotate_sorted_list_of_records(), and writes them to
    outfile in sorted CHROM order. The header of outfile is the header of
    infile, with a ##FORMAT line for the VFR_FILTER tag added just before
    the #CHROM line, which must be the last header line of infile."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_dict(
        infile, remove_useless_start_nucleotides=True)
    column_names_line = header_lines[-1]
    assert column_names_line.startswith("#CHROM")
    vfr_filter_line = '##FORMAT=<ID=VFR_FILTER,Number=1,Type=String,Description="Initial filtering of VCF record. If PASS, then it is evaluated, otherwise is skipped">'

    with open(outfile, "w") as f_out:
        print(*header_lines[:-1], sep="\n", file=f_out)
        print(vfr_filter_line, file=f_out)
        print(column_names_line, file=f_out)

        for chrom in sorted(vcf_records):
            chrom_records = vcf_records[chrom]
            # The records are changed in place, before they are written
            _annotate_sorted_list_of_records(chrom_records,
                                             want_ref_calls=want_ref_calls)
            print(*chrom_records, sep="\n", file=f_out)
Exemplo n.º 12
0
    def run(self):
        """Runs dnadiff on each pair of ref and query sequences, and then
        makes the variants and variant intervals from the results.

        Sets self.unmerged_vcf, self.merged_vcf, self.variants,
        self.big_variant_intervals and self.all_variant_intervals."""
        snps_file = self.outprefix + ".snps"
        qdiff_file = self.outprefix + ".qdiff"
        self.unmerged_vcf = self.outprefix + ".raw.vcf"
        self.merged_vcf = self.outprefix + ".merged.vcf"

        # The results of each dnadiff run are appended to these files,
        # so must start with them not existing
        for old_file in (snps_file, qdiff_file):
            if os.path.exists(old_file):
                os.unlink(old_file)

        tmp_prefix = self.outprefix + ".tmp"
        tmp_ref_fasta = tmp_prefix + ".ref.fa"
        tmp_query_fasta = tmp_prefix + ".query.fa"

        # One dnadiff run per (ref, query) pair of sequences, each written
        # to its own temporary FASTA file
        for ref_name, query_name in zip(self.ref_seq_names, self.query_seq_names):
            with open(tmp_ref_fasta, "w") as f_ref:
                print(self.ref_seqs[ref_name], file=f_ref)
            with open(tmp_query_fasta, "w") as f_query:
                print(self.query_seqs[query_name], file=f_query)
            Dnadiff._run_dnadiff(tmp_ref_fasta, tmp_query_fasta, tmp_prefix)
            utils.syscall(f"cat {tmp_prefix}.snps >> {snps_file}")
            utils.syscall(f"cat {tmp_prefix}.qdiff >> {qdiff_file}")
            Dnadiff.clean_dnadiff_files(tmp_prefix)
            os.unlink(tmp_ref_fasta)
            os.unlink(tmp_query_fasta)

        Dnadiff._snps_file_file_to_unmerged_vcf(
            snps_file, self.query_seqs, self.unmerged_vcf
        )
        vcf_clusterer.VcfClusterer(
            [self.unmerged_vcf],
            self.query_fasta,
            self.merged_vcf,
            merge_method="simple",
            cluster_boundary_size=31,
        ).run()
        _, self.variants = vcf_file_read.vcf_file_to_dict(
            self.merged_vcf, remove_useless_start_nucleotides=True
        )
        self.big_variant_intervals = Dnadiff._load_qdiff_file(qdiff_file)
        self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
            self.variants, self.big_variant_intervals
        )
Exemplo n.º 13
0
    def run(self):
        '''Runs dnadiff on each pair of ref and query sequences, and then
        makes the variants and variant intervals from the results.

        Sets self.unmerged_vcf, self.merged_vcf, self.variants,
        self.big_variant_intervals and self.all_variant_intervals.'''
        snps_file = self.outprefix + '.snps'
        qdiff_file = self.outprefix + '.qdiff'
        self.unmerged_vcf = self.outprefix + '.raw.vcf'
        self.merged_vcf = self.outprefix + '.merged.vcf'

        # The results of each dnadiff run are appended to these files,
        # so must start with them not existing
        for old_file in (snps_file, qdiff_file):
            if os.path.exists(old_file):
                os.unlink(old_file)

        tmp_prefix = self.outprefix + '.tmp'
        tmp_ref_fasta = tmp_prefix + '.ref.fa'
        tmp_query_fasta = tmp_prefix + '.query.fa'

        # One dnadiff run per (ref, query) pair of sequences, each written
        # to its own temporary FASTA file
        for ref_name, query_name in zip(self.ref_seq_names,
                                        self.query_seq_names):
            with open(tmp_ref_fasta, 'w') as f_ref:
                print(self.ref_seqs[ref_name], file=f_ref)
            with open(tmp_query_fasta, 'w') as f_query:
                print(self.query_seqs[query_name], file=f_query)
            Dnadiff._run_dnadiff(tmp_ref_fasta, tmp_query_fasta, tmp_prefix)
            utils.syscall(f'cat {tmp_prefix}.snps >> {snps_file}')
            utils.syscall(f'cat {tmp_prefix}.qdiff >> {qdiff_file}')
            Dnadiff.clean_dnadiff_files(tmp_prefix)
            os.unlink(tmp_ref_fasta)
            os.unlink(tmp_query_fasta)

        Dnadiff._snps_file_file_to_unmerged_vcf(snps_file,
                                                self.query_seqs,
                                                self.unmerged_vcf)
        vcf_clusterer.VcfClusterer([self.unmerged_vcf],
                                   self.query_fasta,
                                   self.merged_vcf,
                                   merge_method='simple').run()
        _, self.variants = vcf_file_read.vcf_file_to_dict(
            self.merged_vcf, remove_useless_start_nucleotides=True)
        self.big_variant_intervals = Dnadiff._load_qdiff_file(qdiff_file)
        self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
            self.variants, self.big_variant_intervals)
Exemplo n.º 14
0
    def _load_vcf_files(cls, filename_list, homozygous_only=False, max_REF_len=None, min_SNP_qual=None, min_dp4=None, min_GT_conf=None):
        '''Loads all the vcf files from filename_list. Returns tuple of:
        1. Sample name. If more than one sample name found, uses the first one
        and warns to stderr
        2. Dictionary. filename => list of header lines for that file
        3. Dictionary. ref name => list of VcfRecords sorted by position'''
        headers = {}
        vcf_records = None
        sample_name = None

        for filename in filename_list:
            headers[filename], new_records = vcf_file_read.vcf_file_to_dict(filename, homozygous_only=homozygous_only, remove_asterisk_alts=True, max_REF_len=max_REF_len, remove_useless_start_nucleotides=True, min_SNP_qual=min_SNP_qual, min_dp4=min_dp4, min_GT_conf=min_GT_conf)

            new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(headers[filename])
            if sample_name is None and new_sample_name is not None:
                sample_name = new_sample_name
            elif new_sample_name != sample_name:
                logging.warning('Using first sample name found "' + str(sample_name) + '". Found a different (or no) sample name "' + str(new_sample_name) + '", which will not be used')

            if vcf_records is None:
                vcf_records = new_records
            else:
                for ref_name, record_list in new_records.items():
                    if ref_name not in vcf_records:
                        vcf_records[ref_name] = record_list
                    else:
                        vcf_records[ref_name].extend(record_list)

        for record_list in vcf_records.values():
            record_list.sort(key=operator.attrgetter('POS'))

        if sample_name is None:
            logging.warning('No sample name found in VCF files. Going to use "sample"')
            sample_name = 'sample'

        return sample_name, headers, vcf_records
Exemplo n.º 15
0
    def run(self):
        '''Runs the mapping-based verification from start to finish.

        In order: optionally filters and clusters the input VCF file;
        writes each variant plus flanks to FASTA and maps those sequences
        to the verify reference; parses the SAM file with
        _parse_sam_file_and_update_vcf_records_and_gather_stats() and
        writes the records to self.vcf_file_out; works out the false
        negatives, if dnadiff is run or an expected variants VCF file was
        given; writes the stats file, the GT_CONF histogram files and the
        plots.'''
        if self.filter_and_cluster_vcf:
            MappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in,
                self.filtered_vcf,
                discard_ref_calls=self.discard_ref_calls)
            # 'simple' merging when ref calls are discarded, otherwise 'gt_aware'
            merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
            vcf_clusterer.VcfClusterer(
                [self.filtered_vcf],
                self.vcf_reference_file,
                self.clustered_vcf,
                merge_method=merge_method,
                max_distance_between_variants=self.merge_length).run()

        vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check,
            sort=True,
            remove_useless_start_nucleotides=True)
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header)
        if sample_name is None:
            sample_name = 'sample'

        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
        verify_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.verify_reference_file,
                                    verify_ref_seqs)

        MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length)
        MappingBasedVerifier._map_seqs_to_ref(self.seqs_out,
                                              self.verify_reference_file,
                                              self.sam_file_out)
        # The FASTA file is only needed for the mapping
        os.unlink(self.seqs_out)
        stats, gt_conf_hists = MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats(
            self.sam_file_out,
            vcf_records,
            self.flank_length,
            verify_ref_seqs,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions=self.exclude_regions,
            max_soft_clipped=self.max_soft_clipped)

        with open(self.vcf_file_out, 'w') as f_vcf:
            print(*vcf_header, sep='\n', file=f_vcf)
            for ref_name in vcf_records:
                for record in vcf_records[ref_name]:
                    print(record, file=f_vcf)

        # false negative stats, if possible. These two stay as 'NA'
        # unless dnadiff is run
        for stats_key in ('variant_regions_total', 'called_variant_regions'):
            stats[stats_key] = 'NA'

        expected_variants = None
        if self.run_dnadiff:
            dnadiffer = dnadiff.Dnadiff(
                self.verify_reference_file,
                self.vcf_reference_file,
                self.dnadiff_outprefix,
            )
            dnadiffer.run()
            regions_total, regions_called = MappingBasedVerifier._get_total_length_of_expected_regions_called(
                dnadiffer.all_variant_intervals, vcf_records)
            stats['variant_regions_total'] = regions_total
            stats['called_variant_regions'] = regions_called
            expected_variants = dnadiffer.variants
        elif self.expected_variants_vcf is not None:
            _, expected_variants = vcf_file_read.vcf_file_to_dict(
                self.expected_variants_vcf,
                sort=True,
                remove_useless_start_nucleotides=True)

        if expected_variants is None:
            stats['false_negatives'] = 'NA'
        else:
            missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
                vcf_records, expected_variants)
            stats['false_negatives'] = 0
            column_names = [
                '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                'INFO', 'FORMAT', sample_name
            ]
            with open(self.vcf_false_negatives_file_out, 'w') as f_missed:
                print('##fileformat=VCFv4.2', file=f_missed)
                print(*column_names, sep='\t', file=f_missed)
                for missed_list in missed_vcf_records.values():
                    number_missed = len(missed_list)
                    stats['false_negatives'] += number_missed
                    # Printing an empty list would write a blank line
                    if number_missed > 0:
                        print(*missed_list, sep='\n', file=f_missed)

        # write stats file
        stats_columns = [
            'total', 'gt_correct', 'gt_wrong', 'gt_excluded', 'HET',
            'tp_edit_dist', 'fp_edit_dist', 'UNKNOWN_NO_GT',
            'variant_regions_total', 'called_variant_regions',
            'false_negatives'
        ]
        with open(self.stats_out, 'w') as f_stats:
            print(*stats_columns, sep='\t', file=f_stats)
            print(*[stats[column] for column in stats_columns], sep='\t', file=f_stats)

        # write GT_CONF histogram files
        for hist_key, hist_filename in self.gt_conf_hists_filenames.items():
            with open(hist_filename, 'w') as f_hist:
                print('GT_CONF\tCount', file=f_hist)
                for gt_conf, count in sorted(gt_conf_hists[hist_key].items()):
                    print(gt_conf, count, sep='\t', file=f_hist)

        plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)
Exemplo n.º 16
0
def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, out_vcf):
    """Write a VCF to out_vcf that combines a minos VCF with a samtools gVCF.

    The minos records are loaded up front with vcf_file_read.vcf_file_to_dict,
    whereas the samtools file is streamed one line at a time. Records taken
    from minos get INFO["CALLER"] = "minos"; records taken from samtools have
    their INFO replaced by {"CALLER": "samtools"} after
    _move_info_fields_to_format() has been applied to them.

    Args:
        ref_fasta: reference FASTA file, loaded with pyfastaq into a dict that
            is indexed by the CHROM of each samtools record.
        minos_vcf: minos VCF file name.
        samtools_vcf: samtools (g)VCF file name; opened and read line by line.
        out_vcf: name of the output file, opened for writing (overwritten).

    Raises:
        KeyError: if a samtools record has a CHROM that is not a key of the
            sequences loaded from ref_fasta.

    A samtools file with no data records no longer fails with an unbound
    minos_iter: in that case no contig is "finished" and the reference
    sequences are handled only by _print_non_samtools_seqs().
    """
    minos_header, minos_records = vcf_file_read.vcf_file_to_dict(minos_vcf)
    samtools_header = vcf_file_read.get_header_lines_from_vcf_file(samtools_vcf)

    ref_seqs = {}
    pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)
    used_ref_seqs = set()
    ref_seq = None
    ref_pos = -1
    minos_record = None
    # Bound here so the name always exists, even when the samtools file
    # contains no records and the loop below never assigns it.
    minos_iter = None

    with open(out_vcf, "w") as f_out, open(samtools_vcf) as f_samtools:
        print(
            *_combine_minos_and_samtools_header(
                minos_header, samtools_header, ref_seqs
            ),
            sep="\n",
            file=f_out,
        )

        # Read the samtools VCF file line by line. It's huge, so don't want
        # to load into memory. Within each CHROM in the samtools VCF, keep
        # track of the current position in the CHROM, and the next minos record
        # for that CHROM (if there is one). For each position in the ref genome,
        # we want in order of preference to write one of:
        # 1. minos record if there is one
        # 2. samtools record if there is one
        # 3. a "no data" record if position is not in minos or samtools records
        # Not loading into memory and following where we are in the ref genome,
        # and in the list of minos records, makes this a bit fiddly.
        for line in f_samtools:
            if line.startswith("#"):
                continue

            samtools_record = vcf_record.VcfRecord(line)

            # If we've found a new CHROM in the samtools VCF file
            if ref_seq is None or ref_seq.id != samtools_record.CHROM:
                # Close off the previous CHROM (if any) before switching.
                if ref_seq is not None:
                    _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
                ref_seq = ref_seqs[samtools_record.CHROM]
                used_ref_seqs.add(ref_seq.id)
                minos_iter, minos_record = _get_minos_iter_and_record(
                    minos_records, ref_seq.id
                )
                ref_pos = 0

            # This samtools record starts before the current position, which
            # has already moved past it (e.g. after a minos record was written),
            # so it is dropped.
            if samtools_record.POS < ref_pos:
                continue

            # Fill in any missing gaps between current position and the samtools
            # record using minos records if found, or if not then "no data" records.
            while ref_pos < samtools_record.POS:
                if minos_record is not None and ref_pos == minos_record.POS:
                    minos_record.INFO["CALLER"] = "minos"
                    print(minos_record, file=f_out)
                    ref_pos = minos_record.ref_end_pos() + 1
                    minos_record = _update_minos_iter(minos_iter)
                else:
                    _print_no_data_vcf_record(ref_seq, ref_pos, f_out)
                    ref_pos += 1

            # If there's a minos record, use it instead of samtools record.
            while minos_record is not None and minos_record.POS <= samtools_record.POS:
                minos_record.INFO["CALLER"] = "minos"
                print(minos_record, file=f_out)
                ref_pos = minos_record.ref_end_pos() + 1
                minos_record = _update_minos_iter(minos_iter)

            # If we haven't used a minos record, then current ref position is
            # same as the samtools record, and we should use the samtools record
            if ref_pos == samtools_record.POS:
                _move_info_fields_to_format(samtools_record)
                samtools_record.INFO = {"CALLER": "samtools"}
                print(samtools_record, file=f_out)
                ref_pos = samtools_record.POS + 1

        # Only finish a contig if the samtools file actually started one.
        if ref_seq is not None:
            _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
        _print_non_samtools_seqs(ref_seqs, used_ref_seqs, minos_records, f_out)
Exemplo n.º 17
0
    def _filter_vcf_for_clustering(cls,
                                   infile,
                                   outfile,
                                   discard_ref_calls=True):
        """Copy the records of infile that are usable for clustering to outfile.

        The input is loaded with vcf_file_read.vcf_file_to_dict (sorted,
        asterisk ALTs removed, useless start nucleotides removed). The header
        lines are written unchanged, followed by every record that survives
        the filters below, after normalisation.

        A record is skipped when any of these hold:
        * "MISMAPPED_UNPLACEABLE" is in its FILTER;
        * it has no FORMAT, or no GT in FORMAT (a warning is logged);
        * REF is "." or empty;
        * the GT, split on "/", has more than one distinct allele, or
          contains ".", or is a ref call ("0") while discard_ref_calls is true;
        * the (first remaining) ALT is "." or empty.

        Normalisation applied to records that are kept:
        * records with several ALTs are reduced to a single ALT: the called
          one with GT "1/1", or the first one with GT "0/0" for a ref call;
        * a GT of "0" or "1" is rewritten as "0/0" or "1/1";
        * if GT_CONF is absent it is derived from GL (difference between two
          likelihoods) or, failing that, from INFO SupportFraction * 100.

        Args:
            infile: name of the VCF file to read.
            outfile: name of the VCF file to write (overwritten).
            discard_ref_calls: if true, records whose only called allele is
                "0" are dropped.

        Raises:
            Exception: if the called allele of a multi-ALT record is not an
                integer, or does not index an existing ALT. The underlying
                ValueError/IndexError is attached as the cause.
            AssertionError: if GL is used and has fewer than three values.
        """
        header_lines, vcf_records = vcf_file_read.vcf_file_to_dict(
            infile,
            sort=True,
            homozygous_only=False,
            remove_asterisk_alts=True,
            remove_useless_start_nucleotides=True,
        )

        with open(outfile, "w") as f:
            print(*header_lines, sep="\n", file=f)
            for ref_name in vcf_records:
                for record in vcf_records[ref_name]:
                    if "MISMAPPED_UNPLACEABLE" in record.FILTER:
                        continue
                    if record.FORMAT is None or "GT" not in record.FORMAT:
                        logging.warning("No GT in vcf record:" + str(record))
                        continue
                    if record.REF in [".", ""]:
                        continue

                    genotypes = record.FORMAT["GT"].split("/")
                    called_alleles = set(genotypes)
                    is_ref_call = called_alleles == {"0"}
                    # Keep only calls with a single distinct, non-missing
                    # allele; ref calls are kept only when asked for.
                    if (len(called_alleles) != 1
                            or (discard_ref_calls and is_ref_call)
                            or "." in called_alleles):
                        continue

                    if len(record.ALT) > 1:
                        if is_ref_call:
                            record.set_format_key_value("GT", "0/0")
                            record.ALT = [record.ALT[0]]
                        else:
                            record.set_format_key_value("GT", "1/1")
                            # Only the index lookup can fail here, so only it
                            # is guarded: a non-integer allele gives
                            # ValueError, an out-of-range one IndexError.
                            try:
                                called_alt = record.ALT[int(genotypes[0]) - 1]
                            except (ValueError, IndexError) as err:
                                raise Exception("BAD VCf line:" +
                                                str(record)) from err
                            record.ALT = [called_alt]
                    if record.ALT[0] in [".", ""]:
                        continue

                    # Write single-allele genotypes in the two-allele form.
                    if record.FORMAT["GT"] == "0":
                        record.FORMAT["GT"] = "0/0"
                    elif record.FORMAT["GT"] == "1":
                        record.FORMAT["GT"] = "1/1"

                    if "GL" in record.FORMAT and "GT_CONF" not in record.FORMAT:
                        likelihoods = record.FORMAT["GL"].split(",")
                        assert len(likelihoods) > 2
                        if is_ref_call:
                            gt_conf = float(likelihoods[0]) - float(
                                likelihoods[1])
                        else:
                            # genotypes[0] is the allele number from the
                            # original GT, used directly as the GL index.
                            gt_conf = float(
                                likelihoods[int(genotypes[0])]) - float(
                                    likelihoods[0])
                        record.set_format_key_value("GT_CONF", str(gt_conf))
                    # Fallback, used only when GL did not provide a GT_CONF.
                    if ("SupportFraction" in record.INFO
                            and "GT_CONF" not in record.FORMAT):
                        record.set_format_key_value(
                            "GT_CONF",
                            str(float(record.INFO["SupportFraction"]) * 100),
                        )
                    print(record, file=f)
Exemplo n.º 18
0
    def run(self):
        """Run the whole verification and write all output files.

        In order, this method:
        1. If self.filter_and_cluster_vcf is set, calls
           _filter_vcf_for_clustering on self.vcf_file_in and runs a
           VcfClusterer on the result (merge method "simple" when ref calls
           are discarded, otherwise "gt_aware").
        2. Loads self.vcf_to_check and both reference FASTA files, calls
           _write_vars_plus_flanks_to_fasta and _map_seqs_to_ref, then deletes
           self.seqs_out.
        3. Calls _parse_sam_file_and_update_vcf_records_and_gather_stats and
           writes the VCF header plus all records to self.vcf_file_out.
        4. Works out the expected variants, either by running dnadiff or by
           loading self.expected_variants_vcf, and writes the records
           returned by _get_missing_vcf_records to
           self.vcf_false_negatives_file_out. With neither source available,
           the false negative count is reported as "NA".
        5. Writes the stats file, one GT_CONF histogram file per entry of
           self.gt_conf_hists_filenames, and the plots.
        """
        if self.filter_and_cluster_vcf:
            MappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in,
                self.filtered_vcf,
                discard_ref_calls=self.discard_ref_calls,
            )
            # The two clustering modes differ only in the merge method.
            merge_method = "simple" if self.discard_ref_calls else "gt_aware"
            vcf_clusterer.VcfClusterer(
                [self.filtered_vcf],
                self.vcf_reference_file,
                self.clustered_vcf,
                merge_method=merge_method,
                cluster_boundary_size=self.merge_length,
            ).run()

        vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check, sort=True, remove_useless_start_nucleotides=True
        )
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
        if sample_from_header is None:
            sample_from_header = "sample"

        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
        verify_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.verify_reference_file, verify_ref_seqs)

        MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length
        )
        MappingBasedVerifier._map_seqs_to_ref(
            self.seqs_out, self.verify_reference_file, self.sam_file_out
        )
        # The FASTA file is not used again after the mapping call.
        os.unlink(self.seqs_out)

        parse_sam = (
            MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats
        )
        stats, gt_conf_hists = parse_sam(
            self.sam_file_out,
            vcf_records,
            self.flank_length,
            verify_ref_seqs,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions=self.exclude_regions,
            max_soft_clipped=self.max_soft_clipped,
        )

        with open(self.vcf_file_out, "w") as vcf_out:
            print(*vcf_header, sep="\n", file=vcf_out)
            for ref_name in vcf_records:
                for record in vcf_records[ref_name]:
                    print(record, file=vcf_out)

        # False negative stats, if possible. The region stats stay "NA" unless
        # dnadiff is run.
        stats["variant_regions_total"] = "NA"
        stats["called_variant_regions"] = "NA"

        if self.run_dnadiff:
            dnadiffer = dnadiff.Dnadiff(
                self.verify_reference_file,
                self.vcf_reference_file,
                self.dnadiff_outprefix,
            )
            dnadiffer.run()
            regions = MappingBasedVerifier._get_total_length_of_expected_regions_called(
                dnadiffer.all_variant_intervals, vcf_records
            )
            stats["variant_regions_total"], stats["called_variant_regions"] = regions
            expected_variants = dnadiffer.variants
        elif self.expected_variants_vcf is not None:
            _, expected_variants = vcf_file_read.vcf_file_to_dict(
                self.expected_variants_vcf,
                sort=True,
                remove_useless_start_nucleotides=True,
            )
        else:
            expected_variants = None

        if expected_variants is not None:
            missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
                vcf_records, expected_variants
            )
            stats["false_negatives"] = 0
            column_names = (
                "#CHROM",
                "POS",
                "ID",
                "REF",
                "ALT",
                "QUAL",
                "FILTER",
                "INFO",
                "FORMAT",
                sample_from_header,
            )
            with open(self.vcf_false_negatives_file_out, "w") as fn_out:
                print("##fileformat=VCFv4.2", file=fn_out)
                print(*column_names, sep="\t", file=fn_out)
                for missed in missed_vcf_records.values():
                    stats["false_negatives"] += len(missed)
                    if len(missed) > 0:
                        print(*missed, sep="\n", file=fn_out)
        else:
            stats["false_negatives"] = "NA"

        # Stats file: one line of column names, one line of values.
        stats_columns = [
            "total",
            "gt_correct",
            "gt_wrong",
            "gt_excluded",
            "HET",
            "tp_edit_dist",
            "fp_edit_dist",
            "UNKNOWN_NO_GT",
            "variant_regions_total",
            "called_variant_regions",
            "false_negatives",
        ]
        stats_values = [stats[column] for column in stats_columns]
        with open(self.stats_out, "w") as stats_fh:
            print(*stats_columns, sep="\t", file=stats_fh)
            print(*stats_values, sep="\t", file=stats_fh)

        # GT_CONF histogram files, one per histogram key.
        for hist_key, hist_filename in self.gt_conf_hists_filenames.items():
            with open(hist_filename, "w") as hist_fh:
                print("GT_CONF\tCount", file=hist_fh)
                for gt_conf, count in sorted(gt_conf_hists[hist_key].items()):
                    print(gt_conf, count, sep="\t", file=hist_fh)

        plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)
Exemplo n.º 19
0
    def run(self):
        """Run the dnadiff-based verification of two VCF files.

        In order, this method:
        1. Calls _write_dnadiff_plus_flanks_to_fastas to write the sequences
           to search for in each VCF.
        2. If self.filter_and_cluster_vcf is set, filters both input VCFs with
           _filter_vcf_for_clustering and runs a VcfClusterer on each (merge
           method 'simple' when ref calls are discarded, else 'gt_aware').
        3. Loads both VCFs to check, calls _write_vars_plus_flanks_to_fasta
           for each and maps them with _map_seqs_to_seqs.
        4. Calls _index_vcf on both VCFs and appends ".gz" to
           self.vcf_to_check1 / self.vcf_to_check2, so those attributes are
           changed by this method.
        5. Calls _parse_sam_files and _gather_stats, then writes the stats
           file and the GT_CONF histogram file.

        No intermediate file is deleted by this method.
        """
        # Write files of sequences to search for in each vcf
        DnadiffMappingBasedVerifier._write_dnadiff_plus_flanks_to_fastas(
            self.dnadiff_snps_file,
            self.dnadiff_file1,
            self.dnadiff_file2,
            self.seqs_out_dnadiff1,
            self.seqs_out_dnadiff2,
            self.flank_length,
        )

        # Cluster together variants in each vcf
        if self.filter_and_cluster_vcf:
            DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in1, self.filtered_vcf1, self.discard_ref_calls
            )
            DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
                self.vcf_file_in2, self.filtered_vcf2, self.discard_ref_calls
            )
            # Both clusterers share every setting except their in/out files.
            merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
            file_pairs = (
                (self.filtered_vcf1, self.clustered_vcf1),
                (self.filtered_vcf2, self.clustered_vcf2),
            )
            clusterers = [
                vcf_clusterer.VcfClusterer(
                    [filtered_vcf],
                    self.vcf_reference_file,
                    clustered_vcf,
                    merge_method=merge_method,
                    max_distance_between_variants=self.merge_length,
                )
                for filtered_vcf, clustered_vcf in file_pairs
            ]
            # Both clusterers are built before either one is run.
            for clusterer in clusterers:
                clusterer.run()

        _, vcf_records1 = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check1, sort=True, remove_useless_start_nucleotides=True
        )
        # The header of the second file is the one used for the sample name.
        vcf_header, vcf_records2 = vcf_file_read.vcf_file_to_dict(
            self.vcf_to_check2, sort=True, remove_useless_start_nucleotides=True
        )
        sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
        if sample_from_header is None:
            sample_from_header = 'sample'

        vcf_ref_seqs = {}
        pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)

        DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out_vcf1,
            vcf_records1,
            vcf_ref_seqs,
            self.flank_length,
            self.number_ns,
        )
        DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
            self.seqs_out_vcf2,
            vcf_records2,
            vcf_ref_seqs,
            self.flank_length,
            self.number_ns,
        )
        DnadiffMappingBasedVerifier._map_seqs_to_seqs(
            self.seqs_out_vcf1, self.seqs_out_dnadiff1, self.sam_file_out1
        )
        DnadiffMappingBasedVerifier._map_seqs_to_seqs(
            self.seqs_out_vcf2, self.seqs_out_dnadiff2, self.sam_file_out2
        )
        # NOTE: self.seqs_out_vcf1 / self.seqs_out_vcf2 (and any files sharing
        # those prefixes) are left on disk; nothing removes them here.

        # After indexing, the attributes are pointed at the ".gz" file names.
        DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check1)
        self.vcf_to_check1 += ".gz"
        DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check2)
        self.vcf_to_check2 += ".gz"

        DnadiffMappingBasedVerifier._parse_sam_files(
            self.dnadiff_snps_file,
            self.sam_file_out1,
            self.sam_file_out2,
            self.vcf_to_check1,
            self.vcf_to_check2,
            self.seqs_out_dnadiff1,
            self.seqs_out_dnadiff2,
            self.sam_summary,
            self.flank_length,
            allow_mismatches=self.allow_flank_mismatches,
            exclude_regions1=self.exclude_regions1,
            exclude_regions2=self.exclude_regions2,
            max_soft_clipped=self.max_soft_clipped,
            number_ns=self.number_ns,
        )
        stats, gt_conf_hist = DnadiffMappingBasedVerifier._gather_stats(
            self.sam_summary
        )
        # NOTE: the dnadiff sequence files and the compressed/indexed VCF files
        # are likewise left on disk.

        # Stats file: one line of column names, one line of values.
        with open(self.stats_out, 'w') as stats_fh:
            stat_names = stats.keys()
            print(*stat_names, sep='\t', file=stats_fh)
            print(*[stats[name] for name in stat_names], sep='\t', file=stats_fh)

        # GT_CONF histogram file
        with open(self.gt_conf_hist_out, 'w') as hist_fh:
            print('GT_CONF\tCount', file=hist_fh)
            for gt_conf, count in sorted(gt_conf_hist.items()):
                print(gt_conf, count, sep='\t', file=hist_fh)
Exemplo n.º 20
0
    def test_vcf_file_to_dict(self):
        """vcf_file_to_dict: bad POS handling, reference checks, gzip input
        and asterisk ALT removal."""
        ref_seqs = {
            #                   10        20        30        40
            #          123456789012345678901234567890123456789012345
            "ref_42": "GTAGTACGTAACATGT",
            "ref_43": "AGCTGCGAGCGCGTCGACTGCATGCATCGATCGAGCTAGCTTTTA",
            "ref_44": "AGTA",
        }

        want_header = ["# header1", "# header2"]
        lines = [
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
            "ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81",
            "ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82",
            "ref_43\t43\tid_foo\tT\tG,*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.83",
            "ref_43\t44\tid_foo\tT\t*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.84",
            "ref_44\t2\tid_foo\tT\tA\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.84",
        ]
        want_records = {
            "ref_42": [vcf_record.VcfRecord(line) for line in lines[0:2]],
            "ref_43": [vcf_record.VcfRecord(line) for line in lines[2:5]],
            "ref_44": [vcf_record.VcfRecord(lines[5])],
        }

        plain_vcf = os.path.join(data_dir, "vcf_file_to_dict.vcf")
        gzipped_vcf = os.path.join(data_dir, "vcf_file_to_dict.vcf.gz")

        def check(vcf_path, **options):
            # Load without failing on a bad POS, then compare against the
            # current state of want_records, which is edited between checks.
            got_header, got_records = vcf_file_read.vcf_file_to_dict(
                vcf_path, error_on_bad_POS=False, **options)
            self.assertEqual(want_records, got_records)
            self.assertEqual(want_header, got_header)

        # The file holds a record with a bad POS: an error when asked for.
        with self.assertRaises(ValueError):
            vcf_file_read.vcf_file_to_dict(plain_vcf, error_on_bad_POS=True)
        check(plain_vcf)

        # With reference sequences supplied, ref_44 is no longer expected in
        # the output; plain and gzipped input must give the same result.
        del want_records["ref_44"]
        check(plain_vcf, reference_seqs=ref_seqs)
        check(gzipped_vcf, reference_seqs=ref_seqs)

        # With asterisk removal, the last ref_43 record (ALT is only "*") is
        # not expected, and the one before it loses its "*" ALT.
        want_records["ref_43"].pop()
        want_records["ref_43"][-1].remove_asterisk_alts()
        check(plain_vcf, remove_asterisk_alts=True, reference_seqs=ref_seqs)
Exemplo n.º 21
0
    def _load_vcf_files(
        cls,
        filename_list,
        reference_seqs,
        homozygous_only=False,
        max_REF_len=None,
        min_SNP_qual=None,
        min_dp4=None,
        min_GT_conf=None,
    ):
        """Loads all the vcf files from filename_list. Returns tuple of:
        1. Sample name. The first sample name found is used; a warning is
        logged for each later file whose sample name differs or is missing.
        If no file has a sample name, "sample" is used and a warning is logged.
        2. Dictionary. filename => list of header lines for that file
        3. Dictionary. ref name => list of VcfRecords sorted by position.

        If filename_list is empty, the two dictionaries are empty and the
        sample name is "sample".

        reference_seqs should be a dictionary of sequence name -> sequence.
        This causes all records from the VCF to be sanity checked against the reference sequence,
        and any records where the REF seq does not match the expected sequence is removed.

        homozygous_only, max_REF_len, min_SNP_qual, min_dp4 and min_GT_conf
        are passed unchanged to vcf_file_read.vcf_file_to_dict for every file.
        Asterisk ALTs and useless start nucleotides are always removed, and a
        bad POS is never treated as an error."""
        headers = {}
        vcf_records = None
        sample_name = None

        for filename in filename_list:
            headers[filename], new_records = vcf_file_read.vcf_file_to_dict(
                filename,
                homozygous_only=homozygous_only,
                remove_asterisk_alts=True,
                max_REF_len=max_REF_len,
                remove_useless_start_nucleotides=True,
                min_SNP_qual=min_SNP_qual,
                min_dp4=min_dp4,
                min_GT_conf=min_GT_conf,
                reference_seqs=reference_seqs,
                error_on_bad_POS=False,
            )

            new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                headers[filename]
            )
            if sample_name is None and new_sample_name is not None:
                sample_name = new_sample_name
            elif new_sample_name != sample_name:
                logging.warning(
                    'Using first sample name found "'
                    + str(sample_name)
                    + '". Found a different (or no) sample name "'
                    + str(new_sample_name)
                    + '", which will not be used'
                )

            if vcf_records is None:
                # The first file's dictionary is reused as the accumulator.
                vcf_records = new_records
            else:
                for ref_name, record_list in new_records.items():
                    if ref_name not in vcf_records:
                        vcf_records[ref_name] = record_list
                    else:
                        vcf_records[ref_name].extend(record_list)

        if vcf_records is None:
            # No files were given: return an empty dictionary rather than
            # failing on None below.
            vcf_records = {}

        # Records merged from several files are no longer in position order.
        for record_list in vcf_records.values():
            record_list.sort(key=operator.attrgetter("POS"))

        if sample_name is None:
            logging.warning('No sample name found in VCF files. Going to use "sample"')
            sample_name = "sample"

        return sample_name, headers, vcf_records