示例#1
0
 def add_vcf_header_info(self, vcf_template):
     """
     Register the vcf INFO header entries for the annotated values.

     This is only a base implementation; override it in your own class so the
     declared headers match the annotations added by annotate_record.
     :param vcf_template: vcf reader object whose ``infos`` mapping is updated
     :return: None
     """
     # (id, number, type, description) for every INFO header declared here.
     header_specs = (
         ('variant_id', 1, 'Integer', 'Saphetor variant identifier'),
         ('gene', '.', 'String', 'Genes related to this variant'),
         ('gnomad_exomes_AF', '.', 'Float',
          'GnomAD exomes allele frequency value'),
         ('gnomad_genomes_AF', '.', 'Float',
          'GnomAD genomes allele frequency value'),
     )
     for info_id, number, value_type, description in header_specs:
         # Source and version are left unset (None) for every entry.
         vcf_template.infos[info_id] = _Info(
             info_id, number, value_type, description, None, None)
示例#2
0
 def test_create_processed_variant_annotation_alt_allele_num(self):
   """Checks that ALLELE_NUM drives annotation-to-ALT matching.

   Builds a variant with two alternates and six CSQ annotations, of which only
   the first two carry an ALLELE_NUM that points at an existing alternate, and
   verifies both the resulting alternate data and the counters.
   """
   csq_info = parser._Info(
       id=None, num='.', type=None,
       desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
       source=None, version=None)
   header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
   variant = vcfio.Variant(
       reference_name='19', start=11, end=12, reference_bases='C',
       # The following represent a SNV and an insertion, resp.
       alternate_bases=['T', 'CT'],
       names=['rs1'], quality=2,
       filters=['PASS'],
       # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
       # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
       # But because there is ALLELE_NUM there should be no ambiguity.
       # The last four annotations have incorrect ALLELE_NUMs.
       info={'CSQ': ['T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                     'T|C5|I5|TEST', 'T|C6|I6|']})
   counter_factory = _CounterSpyFactory()
   factory = processed_variant.ProcessedVariantFactory(
       header_fields,
       split_alternate_allele_info_fields=True,
       annotation_fields=['CSQ'],
       use_allele_num=True,
       minimal_match=True,  # This should be ignored by the factory method.
       counter_factory=counter_factory)
   proc_var = factory.create_processed_variant(variant)
   alt1 = processed_variant.AlternateBaseData('T')
   alt1._info = {
       'CSQ': [
           {annotation_parser.ANNOTATION_ALT: 'T',
            'Consequence': 'C1', 'IMPACT': 'I1', 'ALLELE_NUM': '1'}]
   }
   alt2 = processed_variant.AlternateBaseData('CT')
   alt2._info = {
       'CSQ': [
           {annotation_parser.ANNOTATION_ALT: 'T',
            'Consequence': 'C2', 'IMPACT': 'I2', 'ALLELE_NUM': '2'}]
   }
   self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
   # `dict.has_key` no longer exists in Python 3; a membership assertion works
   # on both Python 2.7 and Python 3 and gives a clearer failure message.
   self.assertNotIn('CSQ', proc_var.non_alt_info)
   self.assertEqual(counter_factory.counter_map[
       CEnum.VARIANT.value].get_value(), 1)
   self.assertEqual(counter_factory.counter_map[
       CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
   self.assertEqual(
       counter_factory.counter_map[
           CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
   self.assertEqual(
       counter_factory.counter_map[
           CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
   self.assertEqual(
       counter_factory.counter_map[
           CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
示例#3
0
    def __init__(self, vcf_file, **opts):
        """Open a VCF file, filter its variants and prepare for annotation.

        :param vcf_file: path of the VCF file to read
        :param opts: keyword options. ``filter_args`` is required and is passed
            to ``init_quality_filters``; ``verbose`` (default False) and
            ``annotation_manager`` are optional. When no annotation manager is
            supplied, an ``EntrezAnnotationManager`` is built from ``opts``.
        :raises KeyError: if ``filter_args`` is missing from ``opts``
        """
        self.vcf_file = vcf_file
        self.opts = opts
        self.verbose = opts.get("verbose", False)

        # The handle stays open for the lifetime of the reader, which is kept
        # on the instance.
        self.reader = vcf.VCFReader(open(vcf_file, 'r'))
        self.variants = vcf_utils.filter_vcf_in_memory(self.reader, init_quality_filters(self.opts['filter_args']), keep = True)
        self.annotated_variants = []

        # Build the default manager only when the caller did not supply one;
        # passing it as the default of `dict.get` would construct it on every
        # call, even when it is immediately discarded.
        if "annotation_manager" in opts:
            self.annotation_manager = opts["annotation_manager"]
        else:
            self.annotation_manager = EntrezAnnotationManager(**opts)

        self.reader.infos['GENE'] = _Info('GENE', 1, "String", "Gene containing this variant")
        self.gene_list = defaultdict(list)
示例#4
0
 def _get_sample_variant_and_header_with_csq(self):
     """Return a sample variant with a CSQ info field and a matching header.

     The variant comes from ``_get_sample_variant`` and gets three CSQ
     annotations; the header declares the CSQ annotation layout in its
     description.
     """
     csq_annotations = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']
     sample = self._get_sample_variant()
     sample.info['CSQ'] = vcfio.VariantInfo(data=csq_annotations,
                                            field_count='.')
     csq_header = parser._Info(
         id=None, num='.', type=None,
         desc='some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
         source=None, version=None)
     return sample, vcf_header_io.VcfHeader(infos={'CSQ': csq_header})
示例#5
0
def merge_hc_mity(fhc, fmity, fout, priority):
    """Merges the given HaplotypeCaller and mity VCFs into a new VCF.

    Records on contigs other than "MT" are taken from the HaplotypeCaller
    input; records on "MT" are taken from the mity input. The mity header is
    used as the template of the output, updated with the HaplotypeCaller INFO
    definitions and metadata.

    :param fhc: HaplotypeCaller VCF stream, handed to ``vcf.Reader``
    :param fmity: mity VCF stream, handed to ``vcf.Reader``
    :param fout: output stream, handed to ``vcf.Writer``
    :param priority: currently unused; kept so existing callers keep working
    :raises ValueError: if samples, contigs or formats differ between inputs
    """

    hc = vcf.Reader(fhc)
    mity = vcf.Reader(fmity)

    # some sanity checks
    # TODO: possible to make it handle different samples in the two VCFs?
    if sorted(hc.samples) != sorted(mity.samples):
        raise ValueError(
            "Input VCF files must have the same sample column headers.")
    if sorted(hc.contigs.keys()) != sorted(mity.contigs.keys()):
        raise ValueError("Input VCF files must denote the same contigs.")
    if sorted(hc.formats.keys()) != sorted(mity.formats.keys()):
        raise ValueError("Input VCF files must contain the same formats.")

    # NOTE: arbitrarily picking mity as the base template ~ we're doing
    # dict updates, so the hc values will take precedence
    # merge infos
    mity.infos.update(hc.infos)
    # merge formats ~ not necessary since they're equal
    # TODO: merge filters?
    # merge metadata
    if 'GATKCommandLine' in mity.metadata:
        mity.metadata['UnifiedGenotyperCommandLine'] = \
            mity.metadata['GATKCommandLine']
    if 'GATKCommandLine' in hc.metadata:
        mity.metadata['HaplotypeCallerCommandLine'] = \
            hc.metadata['GATKCommandLine']
    # Either header may lack the key, so remove it without raising KeyError.
    mity.metadata.pop('GATKCommandLine', None)
    hc.metadata.pop('GATKCommandLine', None)
    mity.metadata.update(hc.metadata)
    # add custom INFO field, denoting the variant caller for each variant
    mity.infos['GATKCaller'] = _Info(
        'GATKCaller', '.', 'String', 'GATK '
        'variant caller used to call the variant')

    out_writer = vcf.Writer(fout, mity)
    for hc_rec, mity_rec in walk_together(hc, mity):
        # walk_together yields None for a reader that has no record at the
        # current position, so each record is checked before it is used.
        if hc_rec is not None and hc_rec.CHROM != "MT":
            out_writer.write_record(hc_rec)
        elif mity_rec is not None and mity_rec.CHROM == "MT":
            out_writer.write_record(mity_rec)
        # Remaining cases (an "MT" record present only in the HaplotypeCaller
        # input, or a non-"MT" record present only in the mity input) are not
        # written. NOTE(review): confirm dropping them is the desired policy.
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Builds a VcfHeader based on the header_num_dict.

    All fields of parser._Info are set to their default values except for the
    'id' which is set to the keys in header_num_dict and 'num' which is set
    based on header_num_dict values mapped according to parser.field_counts.

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.

    Returns:
      A VcfHeader whose infos hold one parser._Info per key.

    Raises:
      ValueError: if a num value is neither a key of parser.field_counts nor
        parseable as an integer.
    """
    infos = {}
    # `items()` exists on both Python 2 and Python 3, unlike `iteritems()`.
    for k, v in header_num_dict.items():
        if v in parser.field_counts:
            pyvcf_num_field_value = parser.field_counts[v]
        else:
            pyvcf_num_field_value = int(v)
        infos[k] = parser._Info(id=k,
                                num=pyvcf_num_field_value,
                                type=None,
                                desc='',
                                source=None,
                                version=None)
    return vcf_header_io.VcfHeader(infos=infos)
示例#7
0
def main(args):
    """Filter a TR VCF at call and locus level and write the result.

    Reads ``args.vcf``, applies the call-level and locus-level filters built
    from ``args``, recalculates the locus-level INFO fields (AC, REFAC, HET,
    HWEP, HRUN) and writes ``<args.out>.vcf`` together with the
    ``.samplog.tab`` and ``.loclog.tab`` log files.

    :param args: parsed command line options. Attributes read here: ``vcf``,
        ``out``, ``die_on_warning``, ``verbose``, ``num_records``,
        ``drop_filtered`` and ``use_length``.
    :return: 0 on success, 1 if the run had to be aborted
    """
    # Load VCF file
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    invcf = vcf.Reader(filename=args.vcf)

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype): return 1

    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        # First docstring line of the filter doubles as its header description
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC",
                              -1,
                              "Integer",
                              "Alternate allele counts",
                              source=None,
                              version=None)
    invcf.infos["REFAC"] = _Info("REFAC",
                                 1,
                                 "Integer",
                                 "Reference allele count",
                                 source=None,
                                 version=None)
    invcf.infos["HET"] = _Info("HET",
                               1,
                               "Float",
                               "Heterozygosity",
                               source=None,
                               version=None)
    invcf.infos["HWEP"] = _Info("HWEP",
                                1,
                                "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None,
                                version=None)
    invcf.infos["HRUN"] = _Info("HRUN",
                                1,
                                "Integer",
                                "Length of longest homopolymer run",
                                source=None,
                                version=None)

    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None: return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning: return 1
            # No record was read: move on instead of falling through with an
            # unbound (first iteration) or stale (previous iteration) record.
            continue
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            # A filter returning None means the record passed it
            if filt(record) is None: continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0: output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Allele counts = frequency * number of called chromosomes.
                # The same factor (2 per called sample) is used for AC and
                # REFAC so that they add up to the total. round() guards
                # against float products such as 0.9999999999999999 being
                # truncated to the next lower integer.
                num_chroms = 2 * record.num_called
                record.INFO["AC"] = [
                    int(round(item * num_chroms)) for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    round((1 - sum(record.aaf)) * num_chroms))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
示例#8
0
 def add_vcf_header_info(self, vcf_template):
     """Declare the gnomad_genomes_AN INFO header on the given vcf template."""
     info_id = 'gnomad_genomes_AN'
     description = 'GnomAD genomes allele number value'
     vcf_template.infos[info_id] = _Info(
         info_id, '.', 'Integer', description, None, None)
 def add_vcf_header_info(self, vcf_template):
     """Add the INFO header entry for gnomad_genomes_AN to ``vcf_template``."""
     # Source and version of the header entry are left unset (None).
     allele_number_header = _Info(
         'gnomad_genomes_AN',
         '.',
         'Integer',
         'GnomAD genomes allele number value',
         None,
         None,
     )
     vcf_template.infos['gnomad_genomes_AN'] = allele_number_header