Example #1
def filter_vcf_in_memory(variants, filters, short_circuit=False, drop_filtered=False, invert=False, **kwargs):
    """Apply a chain of filters to an iterable of VCF records and return the records to keep."""

    chain = []
    for filter_obj in filters:
        chain.append(filter_obj)
        short_doc = filter_obj.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        # add a filter record to the output
        try:
            variants.filters[filter_obj.filter_name()] = _Filter(filter_obj.filter_name(), short_doc)
        except Exception:
            # The input may be a plain list of records without a .filters header dict.
            pass
    filtered_records = []
    for record in variants:
        output_record = True
        for filt in chain:
            result = filt(record)
            # Skip flagging when the filter passes (result is None); with
            # invert=True, skip when the filter flags the record instead.
            if (result is None) != invert: continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit: break
        
        # If the record is to be kept (not dropping filtered, or record passed all filters)
        if output_record:
            filtered_records.append(record)

    return filtered_records
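
A minimal usage sketch for the helper above (not from the original project): the MinDepth class, its threshold, and the 'example.vcf' path are illustrative assumptions, following PyVCF's convention that a filter returns None to pass a record and some other value to flag it.

import vcf

class MinDepth(object):
    """Filter sites whose INFO/DP is below a threshold."""

    def __init__(self, threshold=10):
        self.threshold = threshold

    def filter_name(self):
        return 'dp%d' % self.threshold

    def __call__(self, record):
        # None means the record passes; any other value flags it.
        depth = record.INFO.get('DP', 0)
        return depth if depth < self.threshold else None

reader = vcf.Reader(filename='example.vcf')  # hypothetical input file
kept = filter_vcf_in_memory(reader, [MinDepth(10)], drop_filtered=True)
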
Example #2
def filter_vcf(file_path, filters, short_circuit=False, drop_filtered=False, invert=False, output_file=None):
    """Filter a VCF file on disk and write the annotated records to output_file."""
    if output_file is None:
        output_file = file_path + '.filt.vcf'
    inp = vcf.Reader(open(file_path, 'r'))

    # build filter chain
    chain = []
    for filter_obj in filters:
        chain.append(filter_obj)
        short_doc = filter_obj.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        # add a filter record to the output
        inp.filters[filter_obj.filter_name()] = _Filter(filter_obj.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(open(output_file, 'w'), inp)

    # apply filters
    for record in inp:
        output_record = True
        for filt in chain:
            result = filt(record)
            # Skip flagging when the filter passes (result is None); with
            # invert=True, skip when the filter flags the record instead.
            if (result is None) != invert: continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit: break
        
        # If the record is to be kept (not dropping filtered, or record passed all filters)
        if output_record:
            output.write_record(record)
    return output_file
Example #3
def main(args):
    # Load VCF file
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    invcf = vcf.Reader(filename=args.vcf)

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype): return 1

    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC",
                              -1,
                              "Integer",
                              "Alternate allele counts",
                              source=None,
                              version=None)
    invcf.infos["REFAC"] = _Info("REFAC",
                                 1,
                                 "Integer",
                                 "Reference allele count",
                                 source=None,
                                 version=None)
    invcf.infos["HET"] = _Info("HET",
                               1,
                               "Float",
                               "Heterozygosity",
                               source=None,
                               version=None)
    invcf.infos["HWEP"] = _Info("HWEP",
                                1,
                                "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None,
                                version=None)
    invcf.infos["HRUN"] = _Info("HRUN",
                                1,
                                "Integer",
                                "Length of longest homopolymer run",
                                source=None,
                                version=None)

    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None: return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning: return 1
            continue  # skip this record rather than reusing a stale one below
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None: continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0: output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                record.INFO["AC"] = [
                    int(item * (2 * record.num_called)) for item in record.aaf  # 2N alleles for diploid calls
                ]
                record.INFO["REFAC"] = int(
                    (1 - sum(record.aaf)) * (2 * record.num_called))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
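
For reference, a minimal sketch of the header records injected above. In PyVCF these are plain namedtuples from vcf.parser; the field order shown mirrors how the code above calls them and should be read as an assumption about PyVCF 0.6.x rather than a definitive API statement.

from vcf.parser import _Filter, _Format, _Info

flt = _Filter('HRUN', 'Filter on homopolymer run length')
fmt = _Format('FILTER', 1, 'String', 'Call-level filter')
inf = _Info('HET', 1, 'Float', 'Heterozygosity', source=None, version=None)

# Assigning records like these into reader.filters / reader.formats / reader.infos
# before constructing the vcf.Writer is what adds the new header lines to the output.
print(flt.id, fmt.num, inf.type)
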
Example #4
def main():
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of local_script)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # look for global extensions
    for p in pkg_resources.iter_entry_points('vcf.filters'):
        filt = p.load()
        addfilt(filt)

    # add all classes from local script, if present
    if args.local_script is not None:
        import inspect
        import os
        sys.path.insert(0, os.getcwd())
        module_name = args.local_script.replace('.py', '')
        mod = __import__(module_name)
        classes = inspect.getmembers(mod, inspect.isclass)
        for name, cls in classes:
            addfilt(cls)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while len(args.rest):
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" %
                     (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args,
         unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if len(unknown_filt_args):
            sys.exit("%s has no arguments like %s" %
                     (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or len(used_filters) == 0 or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    for record in inp:
        output_record = True
        for filt in chain:
            result = filt(record)
            if result is None: continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit: break

        if output_record:
            # use PASS only if other filter names appear in the FILTER column
            # FIXME: is this a good idea?
            if record.FILTER is None and not drop_filtered:
                record.FILTER = 'PASS'
            output.write_record(record)
Example #5
def main():
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of local_script)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # Enable remote debugging
    if args.debug:
        user_ip = environ['USERIP']
        pydevd.settrace(user_ip, port=58484, stdoutToServer=True, stderrToServer=True)

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # look for global extensions
    for p in pkg_resources.iter_entry_points('vcf.filters'):
        filt = p.load()
        addfilt(filt)

    # add all classes from local script, if present
    if args.local_script is not None:
        import inspect
        import os
        sys.path.insert(0, os.getcwd())
        module_name = args.local_script.replace('.py', '')
        mod = __import__(module_name)
        classes = inspect.getmembers(mod, inspect.isclass)
        for name, cls in classes:
            addfilt(cls)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while len(args.rest):
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" % (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args, unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if len(unknown_filt_args):
            sys.exit("%s has no arguments like %s" % (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or len(used_filters) == 0 or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    for record in inp:
        output_record = True
        for filt in chain:
            result = filt(record)
            if result is None:
                continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit:
                break

        if output_record:
            # use PASS only if other filter names appear in the FILTER column
            # FIXME: is this a good idea?
            if record.FILTER is None and not drop_filtered:
                record.FILTER = 'PASS'
            output.write_record(record)
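
A hedged sketch of a filter class that the --local-script mechanism above could load. It follows the vcf.filters.Base interface these scripts rely on (a class-level name, customize_parser, __call__ returning None to pass, and filter_name); the AvgDepthPerSample class and its --avg-depth-per-sample option are illustrative assumptions, not part of PyVCF.

import vcf.filters

class AvgDepthPerSample(vcf.filters.Base):
    """Threshold average site depth per sample (INFO/DP divided by sample count)."""

    name = 'avg-dps'

    @classmethod
    def customize_parser(cls, parser):
        parser.add_argument('--avg-depth-per-sample', type=int, default=5,
                            help='minimum average depth per sample')

    def __init__(self, args):
        self.threshold = args.avg_depth_per_sample

    def __call__(self, record):
        # Return None to pass the record; return the offending value to flag it.
        avg = record.INFO.get('DP', 0) / max(len(record.samples), 1)
        return avg if avg < self.threshold else None

    def filter_name(self):
        return '%s%d' % (self.name, self.threshold)
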
Example #6
    def filter_variants(self, keep_only_snps=False, only_good=False):
        """Filter the VCF records.

        Parameters
        ----------
        keep_only_snps: bool, optional
            Retain only SNP variants (default: False).
        only_good: bool, optional
            If True, output only records that PASS all filters (default: False).

        Returns
        -------
        list
            Records that passed all filters (empty FILTER).
        """

        if self._reader is None:
            # Create a reader class from input VCF.
            self._reader = vcf.Reader(filename=self.vcf_in)

        # get list of existing filters.
        existing_filters = {}
        removed_filters = []

        for filter_id in self._reader.filters:
            conf = PHEFilterBase.decode(filter_id)
            existing_filters[tuple(conf.keys())] = filter_id

        # Add each filter we are going to use to the record.
        # This is needed for writing out proper #FILTER header in VCF.
        for record_filter in self.filters:
            # We know that each filter has short description method.
            short_doc = record_filter.short_desc()
            short_doc = short_doc.split('\n')[0].lstrip()

            filter_name = PHEFilterBase.decode(record_filter.filter_name())

            # Check if the sample has been filtered for this type of filter
            #    in the past. If so, remove it, because it is going to be refiltered.
            if tuple(filter_name) in existing_filters:
                logging.info("Removing existing filter: %s", existing_filters[tuple(filter_name)])
                removed_filters.append(existing_filters[tuple(filter_name)])
                del self._reader.filters[existing_filters[tuple(filter_name)]]

            self._reader.filters[record_filter.filter_name()] = _Filter(record_filter.filter_name(), short_doc)

        # Update the filters for output.
        self._update_filters(self._reader.filters)

        _pos = 1
        _chrom = None
        # For each record (POSITION) apply set of filters.
        for record in self._reader:

            if _chrom != record.CHROM:
                _pos, _chrom = 1, record.CHROM

            # Fill in any missing consecutive data with GT=./. records.
            while _pos <= record.POS:
                if _pos == record.POS:
                    _record = record
                else:
                    # This is a padding "N" record when records do not follow each other,
                    #    and there is a gap, e.g. for positions 1,2,3,5,6 an "N" record is inserted at 4.

                    _ref = self._get_reference_base(record.CHROM, _pos)

                    _record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)
                    _calls = []
                    sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
                    for sample, i in sorted_samples:

                        _data = make_calldata_tuple(["GT"])
                        _data._types = ["String"]
                        _data._nums = [1]
                        d = ["./."]
                        _calls.append(vcf.model._Call(_record, sample=sample, data=_data(*d)))

                    _record.samples = _calls
                    _record._sample_indexes = dict(sorted_samples)


                self._filter_record(_record, removed_filters)

                # After applying all filters, check if FILTER is None.
                # If it is, then record PASSED all filters.
                if _record.FILTER is None or _record.FILTER == []:
                    if not record.is_monomorphic:
                        _record.FILTER = []

                        if not keep_only_snps or (_record.is_snp and keep_only_snps):

                            self._variants.append(_record)

                elif not only_good:
                    self._variants.append(_record)

                _pos += 1
                if _chrom is None:
                    _chrom = record.CHROM
        return [ variant for variant in self._variants if not variant.FILTER]
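
A small standalone sketch (not from PHEnix) of the gap-padding idea used above: for each chromosome, every position up to a record's POS that has no record of its own would receive a synthetic "./." placeholder, so positions 1,2,3,5,6 yield a pad at 4.

def padded_positions(record_positions):
    """Yield (pos, is_real) pairs, filling gaps between consecutive record positions."""
    pos = 1
    for rec_pos in record_positions:
        while pos < rec_pos:
            yield pos, False   # would become a "./." padding record
            pos += 1
        yield rec_pos, True    # the real record
        pos += 1

print(list(padded_positions([1, 2, 3, 5, 6])))
# [(1, True), (2, True), (3, True), (4, False), (5, True), (6, True)]
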
Example #7
File: __init__.py Project: m-bull/PHEnix
    def filter_variants(self, keep_only_snps=False, only_good=False):
        """Filter the VCF records.

        Parameters
        ----------
        keep_only_snps: bool, optional
            Retain only SNP variants (default: False).
        only_good: bool, optional
            If True, output only records that PASS all filters (default: False).

        Returns
        -------
        list
            Records that passed all filters (empty FILTER).
        """

        if self._reader is None:
            # Create a reader class from input VCF.
            self._reader = vcf.Reader(filename=self.vcf_in)

        # get list of existing filters.
        existing_filters = {}
        removed_filters = []

        for filter_id in self._reader.filters:
            conf = PHEFilterBase.decode(filter_id)
            existing_filters[tuple(conf.keys())] = filter_id

        # Add each filter we are going to use to the record.
        # This is needed for writing out proper #FILTER header in VCF.
        for record_filter in self.filters:
            # We know that each filter has short description method.
            short_doc = record_filter.short_desc()
            short_doc = short_doc.split('\n')[0].lstrip()

            filter_name = PHEFilterBase.decode(record_filter.filter_name())

            # Check if the sample has been filtered for this type of filter
            #    in the past. If so, remove it, because it is going to be refiltered.
            if tuple(filter_name) in existing_filters:
                logging.info("Removing existing filter: %s", existing_filters[tuple(filter_name)])
                removed_filters.append(existing_filters[tuple(filter_name)])
                del self._reader.filters[existing_filters[tuple(filter_name)]]

            self._reader.filters[record_filter.filter_name()] = _Filter(record_filter.filter_name(), short_doc)

        # Update the filters for output.
        self._update_filters(self._reader.filters)

        _pos = 1
        _chrom = None
        # For each record (POSITION) apply set of filters.
        for record in self._reader:

            if _chrom != record.CHROM:
                _pos, _chrom = 1, record.CHROM

            # Fill in any missing consecutive data with GT=./. records.
            while _pos <= record.POS:
                if _pos == record.POS:
                    _record = record
                else:
                    # This is a padding "N" record when records do not follow each other,
                    #    and there is a gap, e.g. for positions 1,2,3,5,6 an "N" record is inserted at 4.

                    _ref = self._get_reference_base(record.CHROM, _pos)

                    _record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)
                    _calls = []
                    sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
                    for sample, i in sorted_samples:

                        _data = make_calldata_tuple(["GT"])
                        _data._types = ["String"]
                        _data._nums = [1]
                        d = ["./."]
                        _calls.append(vcf.model._Call(_record, sample=sample, data=_data(*d)))

                    _record.samples = _calls
                    _record._sample_indexes = dict(sorted_samples)


                self._filter_record(_record, removed_filters)

                # After applying all filters, check if FILTER is None.
                # If it is, then record PASSED all filters.
                if _record.FILTER is None or _record.FILTER == []:
                    if not record.is_monomorphic:
                        _record.FILTER = []

                        if not keep_only_snps or (_record.is_snp and keep_only_snps):

                            self._variants.append(_record)

                elif not only_good:
                    self._variants.append(_record)

                _pos += 1
                if _chrom is None:
                    _chrom = record.CHROM
        return [ variant for variant in self._variants if not variant.FILTER]
Example #8
def main():

    # Parse command line arguments
    args = parse_args()
    vcf_file = args.in_vcf
    outfile = args.out_csv
    out_vcf = args.out_vcf
    report_FPs = args.falsepos
    tech_variation = args.tech_variation

    outstream = open(outfile, 'w')

    # Write header
    outstream.write(('variant,nonref_alleles_pool,total_alleles_pool,'
                    'nonref_alleles_probands,total_alleles_probands,'
                    'nonref_reads_pool,total_reads_pool,nonref_reads_probands,'
                    'recovered_all,falsepos,QD,AF_EXOMESgnomad,AF_GENOMESgnomad,'
                    'proband,recovered_in_proband,GT_pool\n'))

    with open(vcf_file, 'r') as this_vcf:
        vcf_reader = vcf.Reader(this_vcf)
        # Add an additional filter that will be inherited by the vcf writer
        vcf_reader.filters['InPool'] = _Filter('InPool',
            'All alleles found in the probands are also found in the pool.')
        # Create vcf writer based on the header from the input vcf
        vcf_writer = vcf.Writer(open(out_vcf, 'w'), vcf_reader)

        # Fetch sample names
        all_vcf_samples = vcf_reader.samples
        pool_name = check_pool_name(args.pool, all_vcf_samples)
        proband_names = check_proband_names(args.probands, all_vcf_samples)
        sys.stderr.write('Pool name: {0}\nProband names: {1}\n'.format(pool_name,
                        ', '.join(proband_names)))
        pool_size = len(proband_names)
        pool_pos = all_vcf_samples.index(pool_name)
        probands_pos = [all_vcf_samples.index(proband) for proband in proband_names]

        # Create a vcf writer for each proband
        probandVCF_dict = {}
        for proband in proband_names:
            # Can't deepcopy the vcf reader object, so edit it here and restore it afterwards
            vcf_reader.samples = [proband]
            proband_out_vcf = proband + args.out_proband_vcf
            probandVCF_dict[proband] = vcf.Writer(open(proband_out_vcf, 'w'), vcf_reader)
        vcf_reader.samples = all_vcf_samples

        for record in vcf_reader:
            # Extract gnomad allele frequency data if available
            AF_EXOMESgnomad = extract_record_info(record, 'AF_EXOMESgnomad')
            AF_GENOMESgnomad = extract_record_info(record, 'AF_GENOMESgnomad')

            var_id = variant_id(record)
            nonref_alleles_pool, total_alleles_pool = count_nonref_alleles(record.samples[pool_pos]['GT'])
            qual = record.QUAL
            QD = qual/record.INFO['DP']

            nonref_reads_pool = count_nonref_reads(record.samples[pool_pos])
            total_reads_pool = record.samples[pool_pos]['DP']

            nonref_reads_probands = 0
            for proband_pos in probands_pos:
                nonref_reads_probands += count_nonref_reads(record.samples[proband_pos])

            GT_pool = record.samples[pool_pos]['GT']
            alleles_in_pool = get_nonref_alleles(record.samples[pool_pos]['GT'])
            alleles_in_probands = set.union(*[get_nonref_alleles(record.samples[pos]['GT']) for pos in probands_pos])

            filtered = 'FALSE'
            falsepos = 'FALSE'
            # Calculate a minimum read filter based on the filter_reads or ploidy_filter
            # arguments, if given
            if args.filter_reads or args.ploidy_filter:
                min_read_filter = set_read_filter(total_reads_pool, args.filter_reads,
                    args.ploidy_filter, tech_variation)
                alleles_in_pool_by_reads = set(alleles_supported(record, pool_pos,
                    min_read_filter, include_ref = False))
                if is_recovered(alleles_in_probands, alleles_in_pool_by_reads):
                    filtered = 'TRUE'
                # likely false positive if found in the pool but not in any of the probands
                if len(alleles_in_pool_by_reads - alleles_in_probands) > 0:
                    falsepos = 'TRUE'

            else:
                # Filter if all the variants found in the probands are also found in the pool
                if is_recovered(alleles_in_probands, alleles_in_pool):
                    filtered = 'TRUE'
                # likely false positive if found in the pool but not in any of the probands
                if len(alleles_in_pool - alleles_in_probands) > 0:
                    falsepos = 'TRUE'

            if filtered == 'TRUE':
                record.FILTER = 'InPool'

            # Count nonref alleles and total alleles in probands
            # Write a filtered vcf for each proband
            nonref_alleles_probands = 0
            total_alleles_probands = 0
            for proband_pos in probands_pos:
                proband = all_vcf_samples[proband_pos]
                nonref, total = count_nonref_alleles(record.samples[proband_pos]['GT'])
                nonref_alleles_probands += nonref
                total_alleles_probands += total

            for proband_pos in probands_pos:
                proband = all_vcf_samples[proband_pos]
                nonref, total = count_nonref_alleles(record.samples[proband_pos]['GT'])

                # Skip variant if this individual has no non-ref alleles e.g. GT is  ./. or 0/0
                if nonref == 0:
                    continue

                # Check if variant is recovered for this proband specifically
                alleles_this_proband = get_nonref_alleles(record.samples[proband_pos]['GT'])

                # Write out the variant (GT for this sample only) to the vcf file for that proband
                # only if the variant is not found in the parent pool
                recovered_proband = 'FALSE'

                if args.filter_reads or args.ploidy_filter:
                    if is_recovered(alleles_this_proband, alleles_in_pool_by_reads):
                        recovered_proband = 'TRUE'
                else:
                    if is_recovered(alleles_this_proband, alleles_in_pool):
                        recovered_proband = 'TRUE'
                if recovered_proband == 'FALSE':
                    tmp_record = copy.deepcopy(record)
                    tmp_record.samples = [record.samples[proband_pos]]
                    probandVCF_dict[proband].write_record(tmp_record)

                outstream.write(','.join([str(x) for x in [var_id,nonref_alleles_pool,
                    total_alleles_pool,nonref_alleles_probands,total_alleles_probands,
                    nonref_reads_pool,total_reads_pool,nonref_reads_probands,filtered,falsepos,QD,
                    AF_EXOMESgnomad, AF_GENOMESgnomad, proband, recovered_proband, GT_pool]]) + '\n')

            # If none of the probands have any non-ref alleles at this locus
            # Still report it in the csv for false positives counts
            if report_FPs and nonref_alleles_probands == 0:
                outstream.write(','.join([str(x) for x in [var_id,nonref_alleles_pool,
                    total_alleles_pool,nonref_alleles_probands,total_alleles_probands,
                    nonref_reads_pool,total_reads_pool,nonref_reads_probands,filtered,falsepos,QD,
                    AF_EXOMESgnomad, AF_GENOMESgnomad, 'NA', 'NA', 'NA']]) + '\n')



            # Write all samples from all variants to VCF
            # includes FILTER InPool for variants where all alleles in probands are recovered in pool
            vcf_writer.write_record(record)
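
The genotype helpers used above (count_nonref_alleles, get_nonref_alleles) are not shown in this example. A hypothetical sketch of what they might look like for PyVCF-style GT strings such as '0/1' or './.', written here as an assumption rather than the project's actual code:

import re

def get_nonref_alleles(gt_string):
    """Return the set of non-reference, non-missing allele indices in a GT string."""
    if gt_string is None:
        return set()
    return set(a for a in re.split(r'[/|]', gt_string) if a not in ('0', '.'))

def count_nonref_alleles(gt_string):
    """Return (non-ref allele count, total called allele count) for a GT string."""
    if gt_string is None:
        return 0, 0
    alleles = [a for a in re.split(r'[/|]', gt_string) if a != '.']
    return sum(1 for a in alleles if a != '0'), len(alleles)

print(count_nonref_alleles('0/1'))  # (1, 2)
print(get_nonref_alleles('1|2'))    # {'1', '2'}
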
Example #9
        filt.customize_parser(parser)
        filter_help += '\n  %s:\t%s' % (filt.name, filt.description)

    parser.description += filter_help

    # parse command line args
    args = parser.parse_args()

    inp = vcf.Reader(file(args.input[0]))

    # build filter chain
    chain = []
    for name in args.filters:
        f = filters[name](args)
        chain.append(f)
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), f.description)

    oup = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit

    for record in inp:
        for filt in chain:
            result = filt(record)
            if result:
                record.add_filter(filt.filter_name())
                if short_circuit:
                    break

        if (not args.no_filtered) or (record.FILTER == '.'):
            oup.write_record(record)
Example #10
File: vcf_filter.py Project: arq5x/PyVCF
        filt.customize_parser(parser)
        filter_help += '\n  %s:\t%s' % (filt.name, filt.description)

    parser.description += filter_help

    # parse command line args
    args = parser.parse_args()

    inp = vcf.Reader(file(args.input[0]))

    # build filter chain
    chain = []
    for name in args.filters:
        f = filters[name](args)
        chain.append(f)
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), f.description)

    oup = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit

    for record in inp:
        for filt in chain:
            result = filt(record)
            if result:
                record.add_filter(filt.filter_name())
                if short_circuit:
                    break

        if (not args.no_filtered) or (record.FILTER == '.'):
            oup.write_record(record)
Example #11
def main():
    # Parse command line arguments
    args = parse_args()
    individual_vcf_files = args.individual_vcfs
    pool_vcf_file = args.pool_vcf
    pool_spec_file = args.pool_specs
    outfile = args.out_csv
    out_vcf_suffix = args.suffix
    report_falsepos = args.falsepos
    output_filtered = not args.exclude_filtered
    split_vars = args.split

    probands_in_pool = parse_pool_specs(pool_spec_file)

    outstream = open(outfile, 'w')
    # Write header
    outstream.write(('proband,variant,recovered_proband,falsepos,'
                     'QD,AF_EXOMESgnomad,nonref_alleles_proband,'
                     'total_alleles_proband,nonref_reads_proband,'
                     'position'
                     '\n'))

    # Parse vcfs for pools
    # Simply record which variants were found in the pool
    pool_vars = parse_pool_vcf(pool_vcf_file, split_vars)
    nonref_alleles_probands = {}
    # Parse vcfs of individuals
    individual_vars = set()

    probands_found = []
    for vcf_file in individual_vcf_files:
        proband = sample_id_from_fname(vcf_file)

        if proband not in probands_in_pool:
            continue  # Skip any vcf files that don't match up with the pool specs

        probands_found.append(proband)

        with open(vcf_file, 'r') as this_vcf:
            vcf_reader = vcf.Reader(this_vcf)
            # Add an additional filter that will be inherited by the vcf writer
            vcf_reader.filters['InPool'] = _Filter(
                'InPool',
                'All alleles found in the probands are also found in the pool.'
            )
            # Create vcf writer based on the header from the input vcf
            vcf_writer = vcf.Writer(open(proband + out_vcf_suffix, 'w'),
                                    vcf_reader)

            for record in vcf_reader:
                falsepos = 'FALSE'
                qual = record.QUAL
                try:
                    QD = qual / record.INFO['DP']
                except KeyError:
                    QD = 'NA'
                # Count alleles/reads supporting this variant
                nonref_alleles_proband, total_alleles_proband = count_nonref_alleles(
                    record.samples[0]['GT'])
                nonref_reads_proband = count_nonref_reads(record.samples[0])
                position = variant_position(record)

                variants, AF_EXOMESgnomad_all = get_variants_and_info(
                    record, 'AF_EXOMESgnomad', split_vars)
                if len(AF_EXOMESgnomad_all) != len(variants):
                    if not AF_EXOMESgnomad_all == ['NA']:
                        sys.stderr.write((
                            'WARNING: Number of variant alleles and gnomAD '
                            'records do not match. Writing AF_EXOMESgnomad = NA '
                            'for all. Variants: {} AF_EXOMESgnomad {} \n'
                        ).format(variants, AF_EXOMESgnomad_all))
                    AF_EXOMESgnomad_all = ['NA'] * len(variants)

                all_variants_in_pool = True
                for variant, AF_EXOMESgnomad in zip(variants,
                                                    AF_EXOMESgnomad_all):
                    individual_vars.add(variant)
                    variant_in_pool = variant in pool_vars
                    # If any variant is not in the pool, then set to false
                    if not variant_in_pool:
                        all_variants_in_pool = False

                    variant_in_pool = R_bool(variant_in_pool)
                    outstream.write(','.join([
                        str(x) for x in [
                            proband, variant, variant_in_pool, falsepos, QD,
                            AF_EXOMESgnomad, nonref_alleles_proband,
                            total_alleles_proband, nonref_reads_proband,
                            position
                        ]
                    ]) + '\n')

                # Either report variants as filtered in the VCF or skip them completely
                if all_variants_in_pool:
                    if output_filtered:
                        record.FILTER = 'InPool'  # Set in_pool vcf filter
                        vcf_writer.write_record(record)
                else:
                    vcf_writer.write_record(record)

    if set(probands_in_pool) != set(probands_found):
        raise ValueError(
            ('Based on --pool_specs, expecting VCFs for '
             'the probands: {}, found VCFs for: {}. Please check that file '
             'given for --pool_specs is correct and that all proband VCFs are '
             'provided and named correctly.').format(sorted(probands_in_pool),
                                                     sorted(probands_found)))

    # If false positives required, go through pooled vcf again and report them
    # Can do this without looping through again?
    if report_falsepos:
        proband = 'NA'
        variant_in_pool = 'TRUE'
        falsepos = 'TRUE'
        QD = 'NA'
        AF_EXOMESgnomad = 'NA'
        nonref_alleles_proband = 'NA'
        total_alleles_proband = 'NA'
        nonref_reads_proband = 'NA'
        with open(pool_vcf_file, 'r') as this_vcf:
            for record in vcf.Reader(this_vcf):
                # report the actual record position rather than a stale value from the earlier loop
                position = variant_position(record)
                variants, AF_EXOMESgnomad_all = get_variants_and_info(
                    record, 'AF_EXOMESgnomad', split_vars)
                for variant in variants:  # usually one, but could be multiple
                    if variant not in individual_vars:
                        outstream.write(','.join([
                            str(x) for x in [
                                proband, variant, variant_in_pool, falsepos,
                                QD, AF_EXOMESgnomad, nonref_alleles_proband,
                                total_alleles_proband, nonref_reads_proband,
                                position
                            ]
                        ]) + '\n')

    outstream.close()
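
The helpers that build variant keys and R-style booleans for the CSV above are also not shown; a hypothetical sketch of the idea, with the names variant_key and R_bool chosen here for illustration only:

def R_bool(value):
    """Render a Python truth value the way the R/CSV consumers above expect it."""
    return 'TRUE' if value else 'FALSE'

def variant_key(chrom, pos, ref, alt):
    """Build a comparable 'chrom:pos:ref:alt' key for set-membership tests."""
    return '{}:{}:{}:{}'.format(chrom, pos, ref, alt)

pool_vars = {variant_key('chr1', 12345, 'A', 'T')}
print(R_bool(variant_key('chr1', 12345, 'A', 'T') in pool_vars))  # TRUE
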
Example #12
def main():
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of custom_filters)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # Load predefined and local script filters
    filter_modules = [importlib.import_module('vcf.filters')]
    if args.custom_filters is not None:
        filter_modules.append(imp.load_source('local_filters', args.custom_filters))
    for my_module in filter_modules:
        for name, filter_class in inspect.getmembers(my_module, inspect.isclass):
            addfilt(filter_class)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while len(args.rest):
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" % (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args, unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if len(unknown_filt_args):
            sys.exit("%s has no arguments like %s" % (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or len(used_filters) == 0 or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(open(args.output, 'w'), inp, lineterminator='\n')

    # apply filters
    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    for record in inp:
        output_record = True
        for filt in chain:
            result = filt(record)
            if result is None: continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit: break

        if output_record:
            # use PASS only if other filter names appear in the FILTER column
            # FIXME: is this a good idea?
            if record.FILTER is None and not drop_filtered: record.FILTER = 'PASS'
            output.write_record(record)