Пример #1
0
def main():
    """Annotate every record of a VCF with FREQ, SFREQ and SDP values.

    Reads ``options.input_vcf`` and writes ``options.output_vcf`` (both taken
    from the module-level ``options`` object).  Every sample call receives a
    ``FREQ`` FORMAT value computed by ``norm_freq(call.aaf)``; every record
    receives the INFO values ``SFREQ`` (largest allele frequency seen among
    its samples, 0 when none is available) and ``SDP`` (largest sample depth).
    """
    global options, args

    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    # Definitions of FREQ/SFREQ/SDP that already exist in the input header are
    # remembered here: the output header needs our own definitions, but the
    # input records must still be parsed with the original ones.
    former_vcfformat_freq = input_vcf.formats.get('FREQ')
    former_vcfinfo_sfreq = input_vcf.infos.get('SFREQ')
    former_vcfinfo_sdp = input_vcf.infos.get('SDP')
    input_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String',
                                          'Variant allele frequency')
    input_vcf.infos['SFREQ'] = VcfInfo(
        'SFREQ', 1, 'Float', 'Maximum variant allele frequency of all samples')
    input_vcf.infos['SDP'] = VcfInfo(
        'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')
    # The writer is built while the new definitions are in place, so they are
    # the ones used for the output header.
    output_vcf = vcf.Writer(open(options.output_vcf, 'w'),
                            input_vcf,
                            lineterminator='\n')
    # Put the input's own definitions back before any record is read.
    if former_vcfformat_freq is not None:
        input_vcf.formats['FREQ'] = former_vcfformat_freq
    if former_vcfinfo_sfreq is not None:
        input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
    if former_vcfinfo_sdp is not None:
        input_vcf.infos['SDP'] = former_vcfinfo_sdp

    for record in input_vcf:
        if 'FREQ' not in record.FORMAT.split(':'):
            record.add_format('FREQ')

        # Default values for added INFO fields
        site_freq = None
        site_depth = 0

        # iterate over all call objects of record
        for call in record.samples:
            # Allele frequency and Depth evaluation among samples
            try:
                site_freq = max(site_freq, max(
                    call.aaf)) if call.aaf is not None else site_freq
                site_depth = max(
                    call.depth,
                    site_depth) if call.depth is not None else site_depth
            except Exception:
                # Single pre-formatted argument: prints the same text under
                # both Python 2 and Python 3.
                print("ERROR: unforeseen exception when normalizing record: %s"
                      % (record,))
                raise
            call.add_format('FREQ', norm_freq(call.aaf))
            # TODO: unfortunately GATK filtering doesn't yet deal correctly with "None" (.) values
            if site_freq is None or site_freq == '.':
                site_freq = 0
            # The INFO values are refreshed after every sample, so the record
            # ends up carrying the maxima over all of its samples.
            record.add_info('SFREQ', site_freq)
            record.add_info('SDP', site_depth)
        output_vcf.write_record(record)

    # Close the writer so buffered output reaches the file.
    output_vcf.close()
Пример #2
0
def annotation_vcf(parsed_args, process_num):
    """Add a ``dbscSNV`` INFO annotation to a VCF using worker processes.

    ``process_num`` processes running ``score_vcf`` are started; they receive
    work from a bounded queue (capacity ``100 * process_num``) and deliver
    ``(record_id, score)`` results on a second queue.  Records are read from
    ``parsed_args.file_in`` and written to ``parsed_args.file_out`` in the
    order their results arrive.
    """
    todo_queue = Queue(100 * process_num)
    done_queue = Queue()
    reader_exhausted = False
    writer_finished = False
    in_flight = {}  # record id -> record still waiting for its score
    workers = []
    id_source = count()
    for _ in range(process_num):
        worker = Process(target=score_vcf,
                         args=(todo_queue, done_queue, parsed_args.annotation))
        workers.append(worker)
        worker.start()
    vcf_reader = vcf.Reader(filename=parsed_args.file_in)
    vcf_reader.infos['dbscSNV'] = VcfInfo(
        'dbscSNV',
        vcf_field_counts['A'],
        'String',
        'dbscSNV Score for VCF record alleles, Format: ALLELE|ada_score|rf_score',
        version=None,
        source=None)
    vcf_writer = vcf.Writer(open(parsed_args.file_out, 'w'), vcf_reader)
    while not writer_finished:
        # Feed the workers until the queue is full or the input runs out.
        while not (todo_queue.full() or reader_exhausted):
            try:
                record = next(vcf_reader)
            except StopIteration:
                reader_exhausted = True
                todo_queue.put('END')
                break
            record_id = next(id_source)
            in_flight[record_id] = record
            chromosome = str(record.CHROM)
            variants = [
                VariantRecord(chromosome, record.POS, record.REF, str(alt))
                for alt in record.ALT
            ]
            todo_queue.put((record_id, variants))
        # Once no worker is alive, mark the end of the result stream.
        if not any(worker.is_alive() for worker in workers):
            done_queue.put('END')
        # Drain whatever results are available without blocking.
        while True:
            try:
                outcome = done_queue.get(False)
            except queue.Empty:
                break
            if outcome == 'END':
                writer_finished = True
                break
            record_id, record_score = outcome[0], outcome[1]
            scored_record = in_flight.pop(record_id)
            scored_record.add_info('dbscSNV', record_score)
            vcf_writer.write_record(scored_record)
    vcf_writer.close()
Пример #3
0
    def add_node_tag_randomly(self,
                              tree,
                              input_vcf,
                              output_vcf,
                              alpha,
                              add_info_tag='NODE',
                              add_info_num=1,
                              add_info_type='String',
                              add_info_description='Nodes in a tree.',
                              add_info_source=None,
                              add_info_version=None):
        """Copy a VCF, tagging every record with a randomly sampled subtree.

        For each record a node is drawn with ``self.__sample_node`` using the
        weights from ``self.__sample_node_proportion(tree, alpha)``; the nodes
        returned by ``tree.sub_tree_nodes(at=node)`` are joined with ``/`` and
        stored in the INFO field ``add_info_tag``.

        :param tree: tree object the nodes are sampled from.
        :param input_vcf: path of the VCF file to read.
        :param output_vcf: path of the VCF file to write.
        :param alpha: parameter forwarded to the node-proportion sampler.
        :param add_info_tag: ID of the INFO field that is added.
        :param add_info_num: ``Number`` of the INFO header line.
        :param add_info_type: ``Type`` of the INFO header line.
        :param add_info_description: ``Description`` of the INFO header line.
        :param add_info_source: ``Source`` of the INFO header line.
        :param add_info_version: ``Version`` of the INFO header line.
        """
        weights = self.__sample_node_proportion(tree, alpha)
        # The context manager closes the input file even if a record fails.
        with open(input_vcf, 'r') as input_handle:
            original_reader = vcf.Reader(input_handle)
            # Register the header line before the writer copies the header.
            original_reader.infos[add_info_tag] = VcfInfo(
                add_info_tag, add_info_num, add_info_type,
                add_info_description, add_info_source, add_info_version)

            writer = vcf.Writer(open(output_vcf, 'w'),
                                original_reader,
                                lineterminator='\n')
            try:
                for record in original_reader:
                    node = self.__sample_node(tree, weights)
                    nodes = tree.sub_tree_nodes(at=node)
                    nodes_string = '/'.join(map(str, nodes))
                    record.add_info(add_info_tag, nodes_string)
                    writer.write_record(record)
            finally:
                # Always release the output file, also on errors.
                writer.close()
Пример #4
0
    def add_annotation(self):
        """
        Read the input VCF file, add annotations to the INFO column and
        write the records to the output VCF file.

        The INFO header lines ``TSSOL``, ``CCURI`` and ``SAMPURI`` are
        registered first; every record is then passed through
        ``self.get_annotation`` and written to ``self.outputFile``.  The
        processing speed is printed after every 100 records.
        """
        # The context manager closes the input file even if a record fails.
        with open(self.inputFile, 'r') as input_handle:
            vcfReader = vcf.Reader(input_handle)
            # How to add an INFO header line:
            # <http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41>
            vcfReader.infos['TSSOL'] = VcfInfo(
                'TSSOL', vcf_field_counts['A'], 'String',
                'Info indicates whether the variant overlapping with the'
                ' transcription start site(TSS)')
            vcfReader.infos['CCURI'] = VcfInfo(
                'CCURI', vcf_field_counts['A'], 'String',
                'Info includes the URL of the cage cluster to which the'
                ' variant overlapping')
            vcfReader.infos['SAMPURI'] = VcfInfo(
                'SAMPURI', vcf_field_counts['A'], 'String',
                'Info includes the URL of the samples with to which the'
                ' variant overlapping')

            vcfWriter = vcf.VCFWriter(open(self.outputFile, 'w'), vcfReader)

            cnt = 0
            cnt_block = 100
            t1 = time.time()

            #pool = Pool(self.n_parallel)
            #batch = list(itertools.islice(vcfReader, self.n_parallel))
            #res = pool.map(parallel_annotation_caller, zip([self]*len(batch), batch))

            try:
                for record in vcfReader:
                    vcfWriter.write_record(self.get_annotation(record))
                    cnt += 1

                    # Report only after a full block of records, so that
                    # cnt_block really is the number of records processed
                    # since t1 was taken.
                    if cnt % cnt_block == 0:
                        elapsed = time.time() - t1
                        # A coarse clock can report no elapsed time at all;
                        # skip the report rather than divide by zero.
                        if elapsed > 0:
                            ips = cnt_block / elapsed
                            print("speed: %.2f iters/s = %d iters p/h = %.1f hours/million iters" %
                                  (ips, ips * 3600, 1000000 / ips / 3600))
                        t1 = time.time()
            finally:
                # Always release the output file, also on errors.
                vcfWriter.close()
def annotation_vcf(parsed_args, process_num):
    """Add a ``SpliceAI`` INFO annotation to a VCF using worker processes.

    ``process_num`` processes running ``score_vcf`` are started; they receive
    work from a bounded queue (capacity ``100 * process_num``) and deliver
    ``(record_id, score)`` results on a second queue.  Chromosome names are
    passed to the workers with every ``chr`` substring removed.  Records are
    read from ``parsed_args.file_in`` and written to ``parsed_args.file_out``
    in the order their results arrive.
    """
    work_queue = Queue(100 * process_num)
    result_queue = Queue()
    input_done = False
    output_done = False
    awaiting_score = {}  # record id -> record still waiting for its score
    workers = []
    next_id = count()
    for _ in range(process_num):
        worker = Process(target=score_vcf,
                         args=(work_queue, result_queue, parsed_args.annotation))
        workers.append(worker)
        worker.start()
    vcf_reader = vcf.Reader(filename=parsed_args.file_in)
    splice_ai_description = (
        'SpliceAIv1.3 variant annotation. These include delta scores (DS) and delta positions (DP) for '
        'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
        'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL')
    vcf_reader.infos['SpliceAI'] = VcfInfo('SpliceAI', vcf_field_counts['A'], 'String',
                                           splice_ai_description, version=None, source=None)
    vcf_writer = vcf.Writer(open(parsed_args.file_out, 'w'), vcf_reader)
    while not output_done:
        # Feed the workers until the queue is full or the input runs out.
        while not (work_queue.full() or input_done):
            try:
                record = next(vcf_reader)
            except StopIteration:
                input_done = True
                work_queue.put('END')
                break
            record_id = next(next_id)
            awaiting_score[record_id] = record
            chromosome = str(record.CHROM).replace('chr', '')
            variants = [
                VariantRecord(chromosome, record.POS, record.REF, str(alt))
                for alt in record.ALT
            ]
            work_queue.put((record_id, variants))
        # Once no worker is alive, mark the end of the result stream.
        if not any(worker.is_alive() for worker in workers):
            result_queue.put('END')
        # Drain whatever results are available without blocking.
        while True:
            try:
                outcome = result_queue.get(False)
            except queue.Empty:
                break
            if outcome == 'END':
                output_done = True
                break
            record_id, record_score = outcome[0], outcome[1]
            scored_record = awaiting_score.pop(record_id)
            scored_record.add_info('SpliceAI', record_score)
            vcf_writer.write_record(scored_record)
    vcf_writer.close()
def annotate_vcf(in_vcf, out_vcf, bw, ann_name):
    """Annotate every record of ``in_vcf`` and write it to ``out_vcf``.

    An INFO header line named ``ann_name`` is registered on ``in_vcf``; each
    record then gets the value returned by ``query_position`` for its
    position (chromosome name prefixed with ``chr``) stored under that tag.

    NOTE(review): ``out_vcf`` is built by the caller, presumably before this
    header line is registered -- confirm that the new INFO line reaches the
    output header.
    """
    # Register the annotation field in the header.
    in_vcf.infos[ann_name] = VcfInfo(ann_name,
                                     vcf_field_counts['A'],
                                     'Float',
                                     'Replication Value',
                                     source="douglas",
                                     version="none")

    # Annotate record by record.
    for variant in in_vcf:
        # Look up the value at this base in ``bw``.
        replication_value = query_position("chr" + variant.CHROM,
                                           variant.POS, bw)
        variant.add_info(ann_name, replication_value)
        out_vcf.write_record(variant)
Пример #7
0
def main():
    """Annotate a VCF with PolyPhen2 predictions from an SQLite database.

    Expects three positional arguments: the PolyPhen2 SQLite file, the input
    VCF and the output VCF.  Every record receives a ``PP2`` INFO entry made
    of seven values looked up with ``annotate_variant`` for the first ALT
    allele; seven ``.`` placeholders are used when nothing is found.
    """
    parser, (options, args) = _get_args()
    if len(args) != 3:
        parser.error("Missing arguments!")

    pp2file = args[0]
    input_vcf = args[1]
    output_vcf = args[2]

    engine = create_engine('sqlite:///{}'.format(pp2file))
    conn = engine.connect()
    try:
        annotation_identifier = "PP2"

        reader = vcf.Reader(open(input_vcf, 'r'))
        # The description must be ONE string: adjacent literals are
        # concatenated, so no comma may appear between them (a comma would
        # turn the description into a tuple).
        reader.infos[annotation_identifier] = VcfInfo(
            annotation_identifier, 1, 'String',
            ("PolyPhen2 annotations in the following order:"
             "Gene name; "
             "UniProt id; "
             "Amino acid change; "
             "HVar effect category; "
             "Strength of var effect (probability); "
             "HDiv effect category; "
             "Strength of div effect (probability)"), 'PolyPhen2', 'PGV001')
        writer = vcf.Writer(open(output_vcf, 'w'), reader, lineterminator='\n')
        try:
            for v in reader:
                # The lookup is done with a 'chr'-prefixed chromosome name and
                # only for the first alternate allele.
                res = annotate_variant(
                    conn,
                    v.CHROM if v.CHROM.startswith('chr') else 'chr{}'.format(
                        v.CHROM), v.POS, v.REF, v.ALT[0])
                if res is None:
                    annotation = ['.', '.', '.', '.', '.', '.', '.']
                else:
                    annotation = [
                        res.gene, res.protein, res.aa_change, res.hvar_pred,
                        res.hvar_prob, res.hdiv_pred, res.hdiv_prob
                    ]

                v.add_info(annotation_identifier, annotation)
                writer.write_record(v)
        finally:
            # Make sure the output file is flushed and released.
            writer.close()
    finally:
        # Release the database connection even if annotation fails.
        conn.close()
Пример #8
0
def main(argv):
    """Look up every variant of a VCF through the variant API.

    The command line supplies the input file (``-i``), output file (``-o``),
    API key (``-k``), reference genome (``-g``) and ``-nb`` to disable batch
    requests.  Each API response is handed to
    ``process_single_variant_response_data`` together with the VCF writer.

    NOTE: ``argv`` is accepted for interface compatibility but is not used;
    ``argparse`` reads the process command line itself.
    """
    parser = argparse.ArgumentParser(
        description='Simple VCF Client application')
    parser.add_argument('-i',
                        help='Input VCF file',
                        type=str,
                        metavar='Input File',
                        required=True)
    parser.add_argument('-o',
                        help='Output VCF file',
                        type=str,
                        metavar='Output File',
                        required=True)
    parser.add_argument('-k',
                        help='Your key to the API',
                        type=str,
                        metavar='API Key',
                        required=False)
    parser.add_argument('-g',
                        help='Reference genome either 1019 (default) or 1038',
                        type=int,
                        metavar='Reference Genome',
                        required=False,
                        default=1019)
    parser.add_argument('-nb',
                        help="Do not do batch requests",
                        action='store_true')

    args = parser.parse_args()
    infile = args.i
    outfile = args.o
    api_key = args.k
    ref_genome = args.g if args.g is not None else _ref_genome
    do_batch_lookups = not args.nb

    # Open and load vcf file into vcf reader object
    print("Reading input file ", infile)
    vcf_reader = vcf.Reader(filename=infile, encoding='utf8')

    # Add a new GENE info field in the metadata description, so that we may
    # store such data for each record
    vcf_reader.infos['GENE'] = VcfInfo('GENE', ".", 'String',
                                       'Concatenated list of GENE sumbols', "",
                                       "")

    # Prepare output for writing.
    print("Opening output file ", outfile)
    vcf_writer = vcf.Writer(open(outfile, 'w'),
                            vcf_reader,
                            lineterminator='\n')

    # Variant_lookup_data objects waiting to be looked up.
    variant_lookup_data_array = []

    # A counter for the total number of rows processed thus far
    total_counter = 0

    # Initialize client connection to API
    api = VariantAPIClient(api_key)
    if api is None:
        print("Failed to connect to API")
        sys.exit()

    def flush_lookups():
        """Look up all queued variants, process the responses, empty the queue."""
        if do_batch_lookups:
            # Never send an empty batch to the API.
            if variant_lookup_data_array:
                variant_string_array = [
                    vld.variant_string for vld in variant_lookup_data_array
                ]
                batch_data = api.batch_lookup(variant_string_array,
                                              ref_genome=ref_genome,
                                              params={'add-all-data': 1})
                # Responses are matched to the queued variants by position.
                for batch_counter, data in enumerate(batch_data):
                    process_single_variant_response_data(
                        variant_lookup_data_array[batch_counter], data,
                        vcf_writer)
        else:
            # Individual lookups (not recommended for performance reasons).
            for vld in variant_lookup_data_array:
                data = api.lookup(vld.variant_string, ref_genome=ref_genome)
                process_single_variant_response_data(vld, data, vcf_writer)
        del variant_lookup_data_array[:]

    print("Start parsing input file")

    try:
        # Iterate through all the records read from the input VCF file
        for vcf_record in vcf_reader:
            # A vcf_record (i.e. row in the VCF file) may correspond to more
            # than one variant if it contains more than one ALT value.  One
            # Variant_lookup_data entry is queued per variant; each keeps a
            # reference to the same vcf_record but a different ALT value.
            variant_lookup_data_from_vcf_record(vcf_record,
                                                variant_lookup_data_array)

            # Batch mode flushes once the limit is reached (it may be
            # slightly exceeded by a multi-ALT row, which is fine);
            # individual mode flushes after every row.
            if (not do_batch_lookups
                    or len(variant_lookup_data_array) >= _batch_limit):
                flush_lookups()

            total_counter += 1
            if total_counter % 1000 == 0:
                print("Read ", total_counter, " rows")

        # End of file: look up whatever is left of the last partial batch.
        if variant_lookup_data_array:
            flush_lookups()
    finally:
        # Close the writer so buffered output reaches the file.
        vcf_writer.close()

    print("Finished reading ", total_counter, " rows, exiting")
Пример #9
0
#!/usr/bin/env python

import vcf
import sys
from vcf.parser import _Info as VcfInfo

vcf_reader = vcf.Reader(sys.stdin)

# INFO header lines added to the output, as (ID, Number, Type, Description).
# They are registered in this order before the writer copies the header.
_ADDED_INFO_FIELDS = (
    ('ALT_idx', 'A', 'String', 'index for the alternative alleles'),
    ('ALT_pos', 1, 'String', 'original postition for the multiallele complex'),
    ('ALT_num', 1, 'Integer',
     'number of alternative allele for the multiallele complex'),
    ('ALTs_orig', 1, 'String', 'all original ALTs'),
    ('REF_orig', 1, 'String', 'original REF'),
    ('ACs_orig', 1, 'String', 'all original ACs'),
)
for _info_field in _ADDED_INFO_FIELDS:
    vcf_reader.infos[_info_field[0]] = VcfInfo(*_info_field)

writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

# info field that might be multi-allelic
for Record in vcf_reader:

    # when encountering multiallele, annotate alt sequence
    if len(Record.ALT) > 1:

        ALT_idx = ','.join([str(x) for x in range(1, len(Record.ALT) + 1)])
        ALT_pos = str(Record.CHROM) + ':' + str(Record.POS)
Пример #10
0
###################################################################

#override the snpEff EFF fields in both header and info, split the fields and add them in INFO for easier conversion to table

###################################################################

import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

if __name__ == '__main__':

    vcf_reader = vcf.Reader(sys.stdin)

    vcf_reader.infos['Allele'] = VcfInfo('Allele', 1, 'String',
                                         'variant allele')
    vcf_reader.infos['ENSG'] = VcfInfo('ENSG', 1, 'String', 'Ensembl Gene ID')
    vcf_reader.infos['Feature'] = VcfInfo('Feature', 1, 'String',
                                          'Ensembl Transcript')
    vcf_reader.infos['Feature_type'] = VcfInfo('Feature', 1, 'String',
                                               'Feature type')
    vcf_reader.infos['Consequence'] = VcfInfo('Consequence', 1, 'String',
                                              'Functional effect')
    vcf_reader.infos['cDNA_position'] = VcfInfo('cDNA_position', 1, 'String',
                                                'cDNA position')
    vcf_reader.infos['CDS_position'] = VcfInfo('CDS_position', 1, 'String',
                                               'CDS position')
    vcf_reader.infos['Protein_position'] = VcfInfo('Protein_position', 1,
                                                   'String',
                                                   'Protein position')
    vcf_reader.infos['Amino_acids'] = VcfInfo('Amino_acids', 1, 'String',
Пример #11
0
                        help="Fasta file for the genomic reference")
    parser.add_argument('--length',
                        '-l',
                        type=int,
                        default=25,
                        help="length of sequence on the 3 prime side")
    parser.add_argument('--thresh_times',
                        '-t',
                        type=int,
                        default=5,
                        help="times of repeat above which STR is called")
    args = parser.parse_args()

    vcf_reader = vcf.Reader(sys.stdin)

    vcf_reader.infos['Primer3'] = VcfInfo('Primer3', 1, 'String',
                                          '3 prime side sequence')
    vcf_reader.infos['Primer5'] = VcfInfo('Primer5', 1, 'String',
                                          '5 prime side sequence')
    vcf_reader.infos['STR'] = VcfInfo('STR', 0, 'Flag', 'whether there is STR')
    vcf_reader.infos['STR_RU'] = VcfInfo('STR_RU', 1, 'String',
                                         'repeating unit of STR')
    vcf_reader.infos['STR_times'] = VcfInfo('STR_times', 1, 'Integer',
                                            'time of repeats for STR')
    vcf_reader.infos['STR_match'] = VcfInfo(
        'STR_match', 0, 'Flag',
        'whether the ALT sequence change from REF matches multiples of STR_RU')

    writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

    for Record in vcf_reader:
Пример #12
0
def run_std_filter(args):
    """Apply the standard filter to a MuTect VCF / text-file pair.

    Rows of ``args.inputTxt`` are kept when they are judged ``KEEP`` (or all
    of their failure reasons are in ``ACCEPTED_TAGS``) and the tumor variant
    fraction exceeds both ``args.tnr`` times the normal variant fraction and
    the ``args.dp`` / ``args.ad`` / ``args.vf`` thresholds.  Kept rows are
    written to ``<inputTxt stem>_STDfilter.txt``; the matching VCF records
    are written, with FILTER set to PASS, to ``<inputVcf stem>_STDfilter.vcf``
    (both inside ``args.outdir`` when given).

    Returns whatever ``cmo_util.normalize_vcf`` returns for the filtered VCF,
    after ``cmo_util.tabix_file`` has been called on it.  Exits with status 1
    when the VCF does not have exactly two sample columns.
    """
    vcf_out = os.path.splitext(os.path.basename(args.inputVcf))[0]
    txt_out = os.path.splitext(os.path.basename(args.inputTxt))[0]

    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out)
        txt_out = os.path.join(args.outdir, txt_out)

    vcf_out = vcf_out + '_STDfilter.vcf'
    txt_out = txt_out + '_STDfilter.txt'
    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['FAILURE_REASON'] = VcfInfo(
        'FAILURE_REASON', '.', 'String',
        'Failure Reason from MuTect text File', 'muTect', 'v1.1.5')
    vcf_reader.infos['set'] = VcfInfo(
        'set', '.', 'String', 'The variant callers that reported this event',
        'mskcc/basicfiltering', 'v0.2.1')
    vcf_reader.formats['DP'] = VcfFormat('DP', '1', 'Integer',
                                         'Total read depth at this site')
    vcf_reader.formats['AD'] = VcfFormat(
        'AD', 'R', 'Integer',
        'Allelic depths for the ref and alt alleles in the order listed')

    allsamples = list(vcf_reader.samples)
    if len(allsamples) != 2:
        logger.critical(
            "The VCF does not have two genotype columns. Please input a proper vcf with Tumor/Normal columns"
        )
        sys.exit(1)

    # If the caller reported the normal genotype column before the tumor, swap those around
    if_swap_sample = False
    if allsamples[1] == args.tsampleName:
        if_swap_sample = True
        vcf_reader.samples[0] = allsamples[1]
        vcf_reader.samples[1] = allsamples[0]

    # Dictionary to store records to keep: "chrom:pos:ref:alt" -> failure reason
    keepDict = {}

    # Filter each row (Mutation)
    txtDF = pd.read_table(args.inputTxt, skiprows=1, dtype=str)
    # The context manager closes the text output even if a row fails.
    with open(txt_out, "wb") as txt_fh:
        for _, row in txtDF.iterrows():
            contig = row.loc['contig']
            pos = row.loc['position']
            ref_allele = row.loc['ref_allele']
            alt_allele = row.loc['alt_allele']
            trd = int(row.loc['t_ref_count'])
            tad = int(row.loc['t_alt_count'])

            ##############################
            # Tumor Variant Calculations #
            ##############################

            # Total Depth
            # Todo: Does this include indels? soft clipping?
            tdp = trd + tad

            # Variant Fraction
            if tdp != 0:
                tvf = int(tad) / float(tdp)
            else:
                tvf = 0

            ###############################
            # Normal Variant Calculations #
            ###############################

            nrd = int(row.loc['n_ref_count'])
            nad = int(row.loc['n_alt_count'])

            # Total Depth
            ndp = nrd + nad

            # Variant Fraction
            if ndp != 0:
                nvf = int(nad) / float(ndp)
            else:
                nvf = 0

            # Get REJECT or PASS
            judgement = row.loc['judgement']
            failure_reason = row.loc['failure_reasons']

            # nvfRF is one of the thresholds that the tumor variant fraction
            # must exceed in order to pass filtering.
            #
            # This threshold is equal to the normal variant fraction,
            # multiplied by the number of times greater we must see the
            # mutation in the tumor (args.tnr):
            nvfRF = int(args.tnr) * nvf

            # This will help in filtering VCF
            key_for_tracking = str(contig) + ':' + str(pos) + ':' + str(
                ref_allele) + ':' + str(alt_allele)

            if judgement != 'KEEP':
                # All failure reasons must be accepted tags for the variant
                # to be considered any further.
                failure_tags = failure_reason.split(',')
                if not all(tag in ACCEPTED_TAGS for tag in failure_tags):
                    continue
            else:
                failure_reason = 'KEEP'

            if tvf <= nvfRF:
                continue
            if (tdp >= int(args.dp) and tad >= int(args.ad)
                    and tvf >= float(args.vf)):
                if key_for_tracking in keepDict:
                    print('MutectStdFilter: There is a repeat ',
                          key_for_tracking)
                else:
                    keepDict[key_for_tracking] = failure_reason
                out_line = str.encode(args.tsampleName + "\t" + str(contig) +
                                      "\t" + str(pos) + "\t" +
                                      str(ref_allele) + "\t" +
                                      str(alt_allele) + "\t" +
                                      str(failure_reason) + "\n")
                txt_fh.write(out_line)

    # This section uses the keepDict to write all passed mutations to the new VCF file
    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    try:
        for record in vcf_reader:
            key_for_tracking = str(record.CHROM) + ':' + str(
                record.POS) + ':' + str(record.REF) + ':' + str(record.ALT[0])

            if key_for_tracking not in keepDict:
                continue

            failure_reason = keepDict[key_for_tracking]
            # There was no failure reason for calls that had "KEEP" in their
            # judgement column, but this code uses "KEEP" as the key when
            # they are encountered
            if failure_reason == 'KEEP':
                failure_reason = 'None'

            record.add_info('FAILURE_REASON', failure_reason)
            record.add_info('set', 'MuTect')
            if if_swap_sample:
                record.samples[0], record.samples[1] = (record.samples[1],
                                                        record.samples[0])

            # Every kept record is written as PASS: this overrides MuTect's
            # assessment for mutations that were rescued above.
            record.FILTER = 'PASS'
            vcf_writer.write_record(record)
    finally:
        # Close the writer (also on errors) so the file is complete before
        # it is normalized below.
        vcf_writer.close()

    # Normalize the events in the VCF, produce a bgzipped VCF, then tabix index it
    norm_gz_vcf = cmo_util.normalize_vcf(vcf_out, args.refFasta)
    cmo_util.tabix_file(norm_gz_vcf)

    return norm_gz_vcf
Пример #13
0
def serial_annotate(opts, trans_provided_no_acc):
    """Annotate a VCF serially with HGVS names and write the result as VCF.

    Every record of ``opts.file_in`` receives two INFO entries, ``HGVS`` and
    ``HGVS_Normalise``.  Each holds one comma-separated item per ALT allele;
    an item lists ``g|t|p`` triples (one per selected transcript) joined by
    ``/``.  Parts that could not be determined are written as ``.``.

    :param opts: options object; ``annotation``, ``file_in``, ``file_out``
        and ``how`` are read here, and the object is passed on to
        ``get_transcript_strand``.
    :param trans_provided_no_acc: forwarded to ``select_trans`` to choose the
        transcripts to report.
    """
    # Serial annotation; the result is written in VCF format.
    am, hp, hn3, hn5, hdp = annotator(opts.annotation)
    chrome_dic = generate_chrome_dic(opts.annotation)
    vcf_reader = vcf.Reader(filename=opts.file_in)
    vcf_reader.infos['HGVS'] = VcfInfo('HGVS',
                                       vcf_field_counts['A'],
                                       'String',
                                       'VCF record alleles in HGVS syntax',
                                       version=None,
                                       source=None)
    vcf_reader.infos['HGVS_Normalise'] = VcfInfo(
        'HGVS_Normalise',
        vcf_field_counts['A'],
        'String',
        'VCF record alleles in HGVS syntax (Normalised)',
        version=None,
        source=None)
    vcf_writer = vcf.Writer(open(opts.file_out, 'w'), vcf_reader)
    for record in vcf_reader:
        chrome = str(record.CHROM)
        stop = record.affected_end
        record_hgvs_list = []
        record_hgvs_normalise_list = []
        for alt in record.ALT:
            # Reset for every allele: the delins branch below may shift the
            # start, and that shift must not leak into the next allele.
            start = record.affected_start
            hgvs_list = []
            hgvs_normalise_list = []
            if record.is_snp:
                var_type = 'snv'
                ref = record.REF
                call = str(alt)
            else:
                if len(record.REF) == 1 and len(str(alt)) > 1:
                    var_type = 'ins'
                    ref = '.'
                    call = str(alt)[1:]
                elif len(record.REF) > 1 and len(str(alt)) == 1:
                    var_type = 'del'
                    ref = record.REF[1:]
                    call = '.'
                else:
                    var_type = 'delins'
                    if record.REF[0] == str(alt)[0]:
                        # Shared leading base: drop it from both alleles.
                        ref = record.REF[1:]
                        call = str(alt)[1:]
                    else:
                        ref = record.REF
                        call = str(alt)
                        start = record.affected_start - 1
            record_parser = VariantRecord(chrome, start, stop, ref, call,
                                          var_type)
            g = generate_g(record_parser, chrome_dic)
            try:
                g_parser = hp.parse_hgvs_variant(g)
                g_normalise_3 = hn3.normalize(g_parser)
                g_normalise_5 = hn5.normalize(g_parser)
                trans_related = am.relevant_transcripts(g_parser)
            except (HGVSParseError, HGVSError, HGVSUsageError) as e:
                error = str(e)
                logging.error(
                    '{chrome} {start} {stop} {ref} {call} {g} annotate error. {error}.'
                    .format(chrome=chrome, start=start, stop=stop, ref=ref,
                            call=call, g=g, error=error))
                record_hgvs_list.append('.|.|.')
                record_hgvs_normalise_list.append('.|.|.')
                continue
            trans = select_trans(trans_related, trans_provided_no_acc,
                                 opts.how)
            if len(trans) == 0:
                logging.warning(
                    '{chrome} {start} {stop} {ref} {call} {g} no related transcripts in UTA.'
                    .format(chrome=chrome, start=start, stop=stop, ref=ref,
                            call=call, g=g))
                record_hgvs_list.append(g + '|.|.')
                record_hgvs_normalise_list.append(str(g_normalise_3) + '|.|.')
                continue
            for tran in trans:
                try:
                    t = am.g_to_t(g_parser, tran)
                    strand = get_transcript_strand(opts, hdp, g, tran)
                    # Pick the normalised genomic variant that corresponds to
                    # the value reported for this transcript.
                    if strand == 3:
                        g_normalise = g_normalise_3
                    else:
                        g_normalise = g_normalise_5
                    t_normalise = am.g_to_t(g_normalise, tran)
                    p = am.t_to_p(t)
                    p_normalise = am.t_to_p(t_normalise)
                    hgvs_ = '|'.join([g, str(t), str(p)])
                    hgvs_normalise = '|'.join(
                        [str(g_normalise),
                         str(t_normalise),
                         str(p_normalise)])
                except (HGVSError, HGVSUsageError, NotImplementedError,
                        IndexError) as e:
                    error = str(e)
                    logging.error(
                        '{chrome} {start} {stop} {ref} {call} {tran} {g} annotate error. {error}.'
                        .format(chrome=chrome, start=start, stop=stop,
                                ref=ref, call=call, tran=tran, g=g,
                                error=error))
                    # Fall back to the genomic name only for this transcript.
                    hgvs_ = '|'.join([g, '.', '.'])
                    hgvs_normalise = '|'.join([str(g_normalise_3), '.', '.'])
                hgvs_list.append(hgvs_)
                hgvs_normalise_list.append(hgvs_normalise)
            hgvs_alt = '/'.join(hgvs_list)
            hgvs_normalise_alt = '/'.join(hgvs_normalise_list)
            record_hgvs_list.append(hgvs_alt)
            record_hgvs_normalise_list.append(hgvs_normalise_alt)
        record_hgvs = ','.join(record_hgvs_list)
        record_hgvs_normalise = ','.join(record_hgvs_normalise_list)
        record.add_info('HGVS', record_hgvs)
        record.add_info('HGVS_Normalise', record_hgvs_normalise)
        vcf_writer.write_record(record)
    vcf_writer.close()
Пример #14
0
def convert_grch38_ref_mismatch_sites_to_grch37(input_vcf_file,
                                                output_vcf_basename):
    """
    Rewrite a GRCh38 VCF so the GRCh37/GRCh38 reference mismatch sites
    carry the GRCh37 reference.

    For ACMG59 reportable range there are 4 sites that have
    reference mismatch between GRCh37 and GRCh38.
    All ref and alts in variants overlapping these sites are updated to
    the 37 reference. For every mismatch site that no input variant
    overlaps, a homozygous variant (37 base as REF, 38 base as ALT) is
    synthesised. The output file contains the variants overlapping
    mismatch sites, the synthesised variants, and all other variants
    with their original record, sorted by (contig order, POS, REF).

    :param input_vcf_file: path of the VCF to read.
    :param output_vcf_basename: output path without extension; '.vcf' is
        appended.
    :raises ValueError: if a record's CHROM is not among the header
        contigs, or if a record has to be synthesised but the input VCF
        contains no record to use as a template.
    """
    logger = logging.getLogger(__name__)
    output_vcf_file = f'{output_vcf_basename}.vcf'
    reader = vcf.Reader(filename=input_vcf_file)
    records = list(reader)

    # Synthesised records are cloned from the last record of the input
    # (captured here, before anything is appended to `records`).
    template_record = records[-1] if records else None

    covered_sites = set()
    for record in records:
        mismatched_site_key = find_overlapping_mismatch_site(record)
        if mismatched_site_key:
            covered_sites.add(mismatched_site_key)
            try:
                update_grch38_ref_to_grch37_for_record_if_needed(
                    record, mismatched_site_key)
            except ValueError as e:
                # Best effort: the record is kept as-is and the problem logged.
                logger.info(
                    f'Record {record.CHROM}:{record.POS} with mismatch site {mismatched_site_key} encountered error {e}'
                )

    reader.infos['PREPROCESSED'] = VcfInfo(
        'PREPROCESSED',
        0,
        'Flag',
        'The record was pre-processed. Added when a record needed to be changed for liftover',
        '',
        '',
    )
    # if there are no overlapping variants in mismatched sites,
    # create a homozygous variant matching 37 as ref and 38 as alt
    for key, site in MISMATCH_SITES.items():
        if key in covered_sites:
            continue
        if template_record is None:
            raise ValueError(
                f'Cannot create a record for mismatch site {key}: '
                f'{input_vcf_file} contains no records to use as a template')
        # TODO: separate out creation of a record
        mismatch_record = copy(template_record)
        mismatch_record.ID = '.'
        mismatch_record.QUAL = 100
        mismatch_record.FILTER = []
        mismatch_record.FORMAT = 'GT'
        # copy() is shallow: copy each sample explicitly, otherwise the
        # new record would share call objects with the template record.
        mismatch_record.samples = [
            copy(sample) for sample in template_record.samples
        ]
        mismatch_record.samples[0].data = calldata_spec('1/1')
        mismatch_record.INFO = {}
        # NOTE(review): this key is lower-case while the header entry
        # declared above is 'PREPROCESSED' - confirm which one is intended.
        mismatch_record.add_info('preprocessed')
        mismatch_record.CHROM = site['38_coordinates']['chrom']
        mismatch_record.POS = site['38_coordinates']['start']
        mismatch_record.REF = site['37_coordinates']['base']
        mismatch_record.ALT = [
            vcf.model._Substitution(site['38_coordinates']['base'])
        ]
        records.append(mismatch_record)

    contig_order = {c: i for i, c in enumerate(reader.contigs)}

    def sort_key(record):
        """
        Sorts records by (CHROM,POS,REF).
        If contigs are specified in the VCF file and record CHROM matches a contig,
        contig order is maintained.
        Any unmatched CHROMs will throw an error
        """
        if record.CHROM not in contig_order:
            raise ValueError(
                f'Unexpected chrom {record.CHROM} found. Expected one of {contig_order.keys()}'
            )
        return (contig_order[record.CHROM], record.POS, record.REF)

    records.sort(key=sort_key)

    with open(output_vcf_file, 'w') as out_fp:
        writer = vcf.Writer(out_fp, reader, lineterminator='\n')
        for record in records:
            writer.write_record(record)
Пример #15
0
def main():
    """
    Merge the VCF files listed in options.input_vcf, site by site.

    Every input is bgzip-compressed and tabix-indexed, the union of all
    (chromosome, position) sites is collected, and each site is then
    fetched from every input. Records of one site that compare equal and
    can be merged are combined into a single "master" record. Master
    records are written to options.output_vcf and additionally to
    '<output>_snps.vcf' or '<output>_indels.vcf' depending on their type.
    The first input file provides the header template.
    """
    global options, args

    # Be sure to get files bgzipped and tabix indexed. Compression and
    # indexing are redone unconditionally on every run.
    # NOTE(review): the paths are interpolated unquoted into a shell
    # command line; paths containing spaces or shell metacharacters will
    # break (or be interpreted by the shell).
    for vcf_file in options.input_vcf:
        command_line = "bgzip -c " + vcf_file + " > " + vcf_file + ".gz"
        retcode = subprocess.check_output(command_line,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
        print(retcode)
        command_line = "tabix -f -p vcf " + vcf_file + ".gz"
        retcode = subprocess.check_output(command_line,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
        print(retcode)

    # First vcf file will be the template file for outputting parameters
    template_vcf = vcf.Reader(open(options.input_vcf[0], 'r'))

    # Add essential fields in both formats and infos (header information)
    template_vcf.formats['FREQ'] = VcfFormat('FREQ', 1, 'String',
                                             'Variant allele frequency')
    template_vcf.infos['SFREQ'] = VcfInfo(
        'SFREQ', 1, 'Float', 'Maximum variant allele frequency of all samples')
    template_vcf.infos['SDP'] = VcfInfo(
        'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

    # Create a list of sorted variant-sites containing chr and position.
    # A set de-duplicates in constant time per record instead of scanning
    # a growing list.
    seen_sites = set()
    for vcf_file in options.input_vcf:
        with open(vcf_file, 'r') as handle:
            for record in vcf.Reader(handle):
                seen_sites.add((get_chromosome_number(record.CHROM),
                                record.POS))
    variant_sites = sorted(seen_sites)

    # Open all files for random access
    input_vcf = []
    for index, vcf_file in enumerate(options.input_vcf):
        reader = vcf.Reader(open(vcf_file + '.gz', 'r'))
        input_vcf.append(reader)
        # Perform tests and checks
        if index > 0 and reader.samples != template_vcf.samples:
            print("INFO: not same sample list in %s" % vcf_file)

        # Add necessary FORMAT or INFO fields definitions in template
        for info in reader.infos:
            if info not in template_vcf.infos:
                template_vcf.infos[info] = reader.infos[info]
        for myformat in reader.formats:
            if myformat not in template_vcf.formats:
                template_vcf.formats[myformat] = reader.formats[myformat]

    # Open output handles
    output_vcf = vcf.Writer(open(options.output_vcf, 'w'),
                            template_vcf,
                            lineterminator='\n')
    output_indels_vcf = vcf.Writer(open(options.output_vcf + '_indels.vcf',
                                        'w'),
                                   template_vcf,
                                   lineterminator='\n')
    output_snps_vcf = vcf.Writer(open(options.output_vcf + '_snps.vcf', 'w'),
                                 template_vcf,
                                 lineterminator='\n')

    # Now parse each variant-site and fetch information from vcfs:
    for my_variant_site in variant_sites:
        records = []
        for my_vcf in input_vcf:
            try:
                for record in my_vcf.fetch(
                        chromosome_number2str(my_variant_site[0]),
                        my_variant_site[1], my_variant_site[1]):
                    # vcf.fetch returns also next position if described, must be therefore removed
                    if record.POS == my_variant_site[1]:
                        records.append(record)
            except KeyError:
                # This exception is raised when the primary key is not found in one of the files. No actions required
                pass
        if not records:
            # Nothing could be fetched for this site: there is nothing to
            # merge or write (previously an IndexError on records[0]).
            continue

        # master_records are those records for being output to merged vcf. A master record will be created for each
        # group of variants from a same variant site that can be merged
        master_records = [records[0]]
        for record in records[1:]:
            add_to_master = False
            already_added = False
            for master_record in master_records:
                if master_record != record:
                    add_to_master = True
                elif master_record.merge(record):
                    # Successfully folded into an existing master record
                    already_added = True
                else:
                    add_to_master = True
            if add_to_master and not already_added:
                master_records.append(record)

        for master_record in master_records:
            output_vcf.write_record(master_record)
            if master_record.is_snp:
                output_snps_vcf.write_record(master_record)
            elif master_record.is_indel:
                output_indels_vcf.write_record(master_record)

    # Flush and release the three output files.
    output_vcf.close()
    output_indels_vcf.close()
    output_snps_vcf.close()
Пример #16
0
# Script entry point: reads a VCF from stdin and sets up a writer on stdout
# that shares the (extended) header of the reader.
if __name__ == '__main__':
    # Both reference FASTA paths are mandatory command-line options.
    parser = argparse.ArgumentParser(description=' vcfAnnoSTR')
    parser.add_argument('--ref',
                        '-r',
                        required=True,
                        help="Fasta file for the genomic reference")
    parser.add_argument('--anc_ref',
                        '-a',
                        required=True,
                        help="Fasta file for the ancester genomic reference")
    args = parser.parse_args()

    vcf_reader = vcf.Reader(sys.stdin)

    # Declare the two INFO fields in the header before the writer is built
    # from the reader, so the output header contains them.
    vcf_reader.infos['ALT_Codon'] = VcfInfo('ALT_Codon', 1, 'String',
                                            'Alt codon')
    vcf_reader.infos['ANC_Codon'] = VcfInfo('ANC_Codon', 1, 'String',
                                            'Ancester allele codon')

    writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

    for Record in vcf_reader:

        # Only records whose LoF_filter INFO value contains 'ANC_ALLELE'
        # and whose VARTYPE is 'SNP' are processed further.
        if Record.INFO.get('LoF_filter', False):
            if 'ANC_ALLELE' in Record.INFO['LoF_filter']:
                if Record.INFO['VARTYPE'] == 'SNP':

                    # Only the first '|'-separated entry of CDS_position
                    # and STRAND is used.
                    phase = int(Record.INFO['CDS_position'].split('|')[0]) % 3
                    strand = Record.INFO['STRAND'].split('|')[0]

                    # use samtools faidx to extract
Пример #17
0
        fields = line.split(' ')
        all_entries.append(fields)

    # initialize pfam_dict
    pfam_dict = {}
    for entry in all_entries:
        pfam_dict[entry[1]] = []

    # put each entry into the slot
    for entry in all_entries:
        pfam_dict[entry[1]].append(
            (int(entry[4]), int(entry[5]), entry[3], entry[7]))

    vcf_reader = vcf.Reader(sys.stdin)

    vcf_reader.infos['AA_pos'] = VcfInfo('AA_pos', 1, 'Integer',
                                         'Amino acid change position')
    vcf_reader.infos['Pfam'] = VcfInfo('Pfam', 0, 'Flag',
                                       'whether Gene in pfam')
    vcf_reader.infos['InDom'] = VcfInfo('InDom', 0, 'Flag',
                                        'whether in pfam domain')
    vcf_reader.infos['PfamDom'] = VcfInfo('PfamDom', 1, 'String',
                                          'info of pfam domain')
    vcf_reader.infos['AftDom'] = VcfInfo('AftDom', 0, 'Flag',
                                         'whether after pfam domain')

    writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

    numera = re.compile('[0-9]+')
    for Record in vcf_reader:
        # if coding change
        if Record.INFO.get('AAChange', False):
Пример #18
0
def parallel_annotate(opts, trans_provided_no_acc, process_num):
    """
    Annotate a VCF with HGVS strings using several worker processes.

    Records are read from opts.file_in, turned into (VariantRecord, g)
    pairs per ALT allele and put on a bounded input queue. `process_num`
    worker processes run `process_record` on that queue and push their
    results to a second queue. Results are consumed here and written to
    opts.file_out as they arrive, with the INFO fields 'HGVS' and
    'HGVS_Normalise' added to the corresponding record.

    :param opts: options object; this function reads opts.annotation,
        opts.file_in and opts.file_out and passes opts on to the workers.
    :param trans_provided_no_acc: passed through unchanged to
        process_record.
    :param process_num: number of worker processes to start; also scales
        the capacity of the input queue (100 * process_num).
    """
    # Parallel annotation
    chrome_dic = generate_chrome_dic(opts.annotation)
    # Create the queues and initialise the bookkeeping state
    records, results = Queue(100 * process_num), Queue()
    input_finished = False
    output_finished = False
    # Records handed to the workers, keyed by a running id, waiting for
    # their result to come back.
    wait_records = dict()
    records_id = count()
    processes = list()
    # Start several processes that listen on the queue and annotate
    for i in range(process_num):
        p = Process(target=process_record,
                    args=(records, results, opts, trans_provided_no_acc))
        processes.append(p)
        p.start()
    # Read the VCF information and write the new VCF
    vcf_reader = vcf.Reader(filename=opts.file_in)
    vcf_reader.infos['HGVS'] = VcfInfo('HGVS',
                                       vcf_field_counts['A'],
                                       'String',
                                       'VCF record alleles in HGVS syntax',
                                       version=None,
                                       source=None)
    vcf_reader.infos['HGVS_Normalise'] = VcfInfo(
        'HGVS_Normalise',
        vcf_field_counts['A'],
        'String',
        'VCF record alleles in HGVS syntax (Normalised)',
        version=None,
        source=None)
    vcf_writer = vcf.Writer(open(opts.file_out, 'w'), vcf_reader)
    while True:
        # Phase 1: feed the input queue until it is full or the VCF is
        # exhausted.
        while not records.full() and not input_finished:
            try:
                record = next(vcf_reader)
                chrome = str(record.CHROM)
                start = record.affected_start
                stop = record.affected_end
                record_id = next(records_id)
                wait_records[record_id] = record
                record_infos = list()
                for alt in record.ALT:
                    if record.is_snp:
                        var_type = 'snv'
                        ref = record.REF
                        call = str(alt)
                    else:
                        if len(record.REF) == 1 and len(str(alt)) > 1:
                            # Single-base REF, longer ALT: drop the first
                            # ALT base and treat the rest as inserted.
                            var_type = 'ins'
                            ref = '.'
                            call = str(alt)[1:]
                        elif len(record.REF) > 1 and len(str(alt)) == 1:
                            # Longer REF, single-base ALT: drop the first
                            # REF base and treat the rest as deleted.
                            var_type = 'del'
                            ref = record.REF[1:]
                            call = '.'
                        else:
                            var_type = 'delins'
                            if record.REF[0] == str(alt)[0]:
                                # Shared first base is stripped from both.
                                ref = record.REF[1:]
                                call = str(alt)[1:]
                            else:
                                ref = record.REF
                                call = str(alt)
                                # NOTE(review): `start` is assigned once per
                                # record above, so this adjusted value also
                                # applies to any later ALT of the same
                                # record - confirm that is intended.
                                start = record.affected_start - 1
                    record_parser = VariantRecord(chrome, start, stop, ref,
                                                  call, var_type)
                    g = generate_g(record_parser, chrome_dic)
                    record_infos.append((record_parser, g))
                records.put((record_id, record_infos))
            except StopIteration:
                # End of input: signal the workers with a sentinel.
                # NOTE(review): a single 'END' is queued for process_num
                # workers; presumably process_record re-queues it - verify.
                input_finished = True
                records.put('END')
                break
        # Phase 2: once no worker is alive any more, append a sentinel to
        # the result queue so the drain loop below can terminate.
        processes_status = list()
        for p in processes:
            processes_status.append(p.is_alive())
        if True not in processes_status:
            results.put('END')
        # Phase 3: drain whatever results are available without blocking.
        while True:
            try:
                result = results.get(False)
            except queue.Empty:
                break
            if result != 'END':
                record_id, record_hgvs, record_hgvs_normalise = result[
                    0], result[1], result[2]
                # Records are written in the order their results arrive.
                record_write = wait_records.pop(record_id)
                record_write.add_info('HGVS', record_hgvs)
                record_write.add_info('HGVS_Normalise', record_hgvs_normalise)
                vcf_writer.write_record(record_write)
            else:
                output_finished = True
                break
        if output_finished:
            break
    vcf_writer.close()
                info_item = info_item.strip()
                if not info_item.startswith('splice=') and info_item not in [
                        '', '.'
                ]:
                    new_INFO.append(info_item.strip())
            new_INFO.append(splice_predict.print_vcf(effect))

            new_line = '\t'.join(elems[:7] + [';'.join(new_INFO)] + elems[8:])
            return new_line

        # write_header
        vcf_reader = vcf.Reader(vcf_input)
        vcf_info_desc = 'Splice effect. Format: Transcript|Effect|MaxEntScan-wild|MaxEntScan-mut|MaxEntScan-closest|dist'
        vcf_reader.infos['splice'] = VcfInfo(id='splice',
                                             num=1,
                                             type='String',
                                             desc=vcf_info_desc,
                                             source='spliceAnnotator',
                                             version=__version__)
        vcf.Writer(vcf_output, vcf_reader)

        # distribute annotation
        pool = Pool(processes=args.np)
        for line in vcf_input:
            if line.startswith('#'):
                pass
            else:
                pool.apply_async(process, args=(line, ), callback=log_result)
        pool.close()
        pool.join()
        flush(result_list)
Пример #20
0
                    help="Filepath to reference FASTA file")
parser.add_argument("--in-vcf",
                    required=True,
                    help="Filepath to vcf file to be analyzed")
parser.add_argument("--out-vcf",
                    required=True,
                    help="Filepath to vcf file to be output")

args = parser.parse_args()
ref_path = args.reference
reference = Fasta(ref_path, sequence_always_upper=True, read_ahead=1000)
in_vcf_path = args.in_vcf
in_vcf_handle = open(in_vcf_path)
in_vcf = vcf.Reader(in_vcf_handle)
in_vcf.infos['HRUN'] = VcfInfo(
    'HRUN', 1, 'Integer',
    'Homopolymer length to the right of report indel position', "get_hrun",
    "1.0")
out_vcf_path = args.out_vcf
out_vcf_handle = open(out_vcf_path, 'w')
out_vcf = vcf.Writer(out_vcf_handle, in_vcf)
for record in in_vcf:
    chrom = record.CHROM
    pos = record.POS - 1
    ref = record.REF
    calc_hrun = False
    for alt in record.ALT:
        if len(ref) != len(alt):
            calc_hrun = True
    if calc_hrun:
        window = 50
        hrun = 1
Пример #21
0
###################################################################

#override the snpEff EFF fields in both header and info, split the fields and add them in INFO for easier conversion to table

###################################################################

import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

# Script entry point: reads a VCF from stdin and (re)declares INFO header
# entries on the reader. Existing entries with the same ID are replaced.
if __name__ == '__main__':

    vcf_reader = vcf.Reader(sys.stdin)

    # Every field below is declared with Number=1.
    vcf_reader.infos['AC'] = VcfInfo('AC', 1, 'Integer',
                                     'Allele count in genotypes')
    vcf_reader.infos['AF'] = VcfInfo('AF', 1, 'Float', 'Allele Frequency')
    vcf_reader.infos['MLEAC'] = VcfInfo(
        'MLEAC', 1, 'Integer',
        'Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC)'
    )
    vcf_reader.infos['MLEAF'] = VcfInfo(
        'MLEAF', 1, 'Float',
        'Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF)'
    )
    vcf_reader.infos['ALT_idx'] = VcfInfo('ALT_idx', 1, 'String',
                                          'index for the alternative alleles')
    vcf_reader.infos['length'] = VcfInfo(
        'length', 1, 'Integer', 'length(ALT) - length(REF) for each ALT')
    vcf_reader.infos['VARTYPE'] = VcfInfo('VARTYPE', 1, 'String',
                                          'variant types')
    def addTSSInfo(self, vcfInputFile):
        """
        Annotate every record of a VCF with a TSS-overlap flag.

        Each record's start and end are lifted over with the
        'hg38ToHg19.over.chain.gz' chain file, the FANTOM5 SPARQL endpoint
        is queried for the lifted interval, and the INFO field 'TSSOL' is
        set to a one-element list holding the overlap decision. Records
        that cannot be lifted over are written without 'TSSOL'. All
        records are written to 'output.vcf' in the working directory, and
        running totals are printed after every record.

        :param vcfInputFile: path of the VCF file to annotate.
        """
        vcf_reader = vcf.Reader(open(vcfInputFile, 'r'))
        vcf_reader.infos['TSSOL'] = VcfInfo(
            'TSSOL', vcf_field_counts['A'], 'String',
            'Info indicates whether the variant overlapping with the'
            ' transcription start site(TSS)')

        vcf_writer = vcf.VCFWriter(open('output.vcf', 'w'), vcf_reader)

        query = SPARQLQueries.sparqlQueries()

        totalVar = 0
        tssOLVar = 0

        lo = LiftOver('hg38ToHg19.over.chain.gz')

        for record in vcf_reader:
            variantStart = record.start
            variantEnd = record.end
            variantChromosome = record.CHROM
            variantSubType = record.var_subtype
            isOverlapping = False

            # Adding chr prefix to the chromosome
            if "chr" not in variantChromosome:
                variantChromosome = "chr" + str(record.CHROM)

            # liftover from hg38 to hg19
            startHits = lo.convert_coordinate(variantChromosome, variantStart)

            print(variantStart)
            print(variantEnd)

            # The lookup may yield None or an empty list; both mean there
            # is no usable hg19 position. The end coordinate is only
            # looked up when the start could be converted.
            endHits = None
            if startHits:
                endHits = lo.convert_coordinate(variantChromosome, variantEnd)

            if startHits and endHits:
                startHit = startHits[-1]
                variantChromosomehg19 = startHit[0]
                variantStarthg19 = startHit[1]
                variantEndhg19 = endHits[-1][1]

                # SPARQL query
                result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql',
                                      variantStarthg19, variantEndhg19,
                                      variantChromosomehg19)

                for row in result:
                    cageStart = sparql.unpack_row(row)[1]

                    if variantSubType == 'ins':
                        isOverlapping = variantStart > cageStart
                    else:
                        isOverlapping = cageStart > 0
                    if isOverlapping:
                        tssOLVar = tssOLVar + 1
                    # NOTE(review): only the first result row is ever
                    # examined (the loop always stops here, as it did
                    # before) - confirm this is intended.
                    break

                totalVar = totalVar + 1
                record.add_info('TSSOL', [isOverlapping])
            else:
                # record.ID may be None, so it is converted before being
                # appended to the message.
                print("No liftover found for this pos = " + str(record.ID))

            vcf_writer.write_record(record)

            print("No of variants = " + str(totalVar))
            print("No of tss overlapping variants = " + str(tssOLVar))

        # Flush and release 'output.vcf'.
        vcf_writer.close()
Пример #23
0
#print(input)
#Ensure the bam and csv file match
#if input==bam:
#        print("working with vcf: "+input + " and bam: " +bam)
#else:
#   print( "bam:"+bam +" does not match vcf: " +input)
#   sys.exit(1)

#Put header on true false and exp csv files

# Open the VCF given as the first command-line argument.
in_var = vcf.Reader(open(sys.argv[1], 'r'))
## update infos ##
# Declare three per-variant Float INFO fields (Number=1) in the header.
# Only id/num/type/desc are passed; source and version are left out.
in_var.infos['MapQ'] = VcfInfo(
    id='MapQ',
    num=1,
    type='Float',
    desc="The average MapQ of the reads containing the called variant"
)  #,source=None, version=None)
in_var.infos['Read_pos'] = VcfInfo(
    id='Read_pos',
    num=1,
    type='Float',
    desc="The average read cycle that called the given variant"
)  #,source=None, version=None)
in_var.infos['Phred'] = VcfInfo(
    id='Phred',
    num=1,
    type='Float',
    desc="The average Phred score of the called variant"
)  #,source=None, version=None)
Пример #24
0
def run_std_filter(args):
    """
    Apply the standard tumor/normal filter to a two-sample VCF.

    A record is kept when the tumor variant fraction exceeds
    args.tnr times the normal variant fraction and the QUAL, DP, VD and
    variant-fraction thresholds (args.mq, args.dp, args.ad, args.vf) are
    met; with args.filter_germline set, records whose STATUS INFO value
    does not contain "Somatic" are dropped. Kept records are written to
    '<input basename>_STDfilter.vcf' (tumor sample first) and summarised
    in '<input basename>_STDfilter.txt', both placed in args.outdir when
    given. Every record gets the INFO value set=VarDict.

    :param args: parsed options; reads inputVcf, outdir, tsampleName,
        filter_germline, tnr, mq, dp, ad, vf, verbose and refFasta.
    :return: path of the normalised, bgzipped and tabix-indexed VCF as
        returned by cmo_util.normalize_vcf.

    Exits with status 1 if the VCF does not have exactly two samples or
    a record has no genotype for the normal sample.
    """

    def _int_or_zero(call, field):
        # A None value for the field is counted as 0.
        value = call[field]
        return int(value) if value is not None else 0

    vcf_out = os.path.basename(args.inputVcf)
    vcf_out = os.path.splitext(vcf_out)[0]

    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out)

    txt_out = vcf_out + '_STDfilter.txt'
    vcf_out = vcf_out + '_STDfilter.vcf'

    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['set'] = VcfInfo(
        'set', '.', 'String', 'The variant callers that reported this event',
        'mskcc/basicfiltering', 'v0.2.1')
    vcf_reader.formats['DP'] = VcfFormat('DP', '1', 'Integer',
                                         'Total read depth at this site')
    vcf_reader.formats['AD'] = VcfFormat(
        'AD', 'R', 'Integer',
        'Allelic depths for the ref and alt alleles in the order listed')

    allsamples = list(vcf_reader.samples)

    if len(allsamples) != 2:
        if args.verbose:
            logger.critical(
                'The VCF does not have two genotype columns. Please input a proper vcf with Tumor/Normal columns'
            )
        sys.exit(1)

    # If the caller reported the normal genotype column before the tumor, swap those around
    if_swap_sample = False
    if allsamples[1] == args.tsampleName:
        if_swap_sample = True
        vcf_reader.samples[0] = allsamples[1]
        vcf_reader.samples[1] = allsamples[0]

    nsampleName = vcf_reader.samples[1]

    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    txt_fh = open(txt_out, "wb")
    try:
        # Iterate through rows and filter mutations
        for record in vcf_reader:
            tcall = record.genotype(args.tsampleName)

            keep_based_on_status = True
            if "Somatic" not in record.INFO['STATUS'] and args.filter_germline:
                keep_based_on_status = False

            tmq = _int_or_zero(tcall, 'QUAL')
            tdp = _int_or_zero(tcall, 'DP')
            tad = _int_or_zero(tcall, 'VD')
            tvf = tad / float(tdp) if tdp != 0 else 0

            ncall = record.genotype(nsampleName)
            if not ncall:
                logger.critical(
                    "filter_vardict: There are no genotype values for Normal. We will exit."
                )
                sys.exit(1)

            nmq = _int_or_zero(ncall, 'QUAL')
            ndp = _int_or_zero(ncall, 'DP')
            nad = _int_or_zero(ncall, 'VD')
            # Float division, matching the tumor fraction above, so the
            # result does not depend on the interpreter's '/' semantics.
            nvf = nad / float(ndp) if ndp != 0 else 0
            nvfRF = int(args.tnr) * nvf

            record.add_info('set', 'VarDict')

            if if_swap_sample:
                # Keep the genotype columns in the same order as the
                # swapped header: tumor first, normal second.
                nrm = record.samples[0]
                tum = record.samples[1]
                record.samples[0] = tum
                record.samples[1] = nrm

            if tvf > nvfRF:
                if (keep_based_on_status and tmq >= int(args.mq)
                        and nmq >= int(args.mq) and tdp >= int(args.dp)
                        and tad >= int(args.ad) and tvf >= float(args.vf)):
                    vcf_writer.write_record(record)
                    out_line = str.encode(args.tsampleName + "\t" +
                                          record.CHROM + "\t" +
                                          str(record.POS) + "\t" +
                                          str(record.REF) + "\t" +
                                          str(record.ALT[0]) + "\t" + "." +
                                          "\n")
                    txt_fh.write(out_line)
    finally:
        # Also runs on the sys.exit(1) path so neither handle is leaked.
        vcf_writer.close()
        txt_fh.close()

    # Normalize the events in the VCF, produce a bgzipped VCF, then tabix index it
    norm_gz_vcf = cmo_util.normalize_vcf(vcf_out, args.refFasta)
    cmo_util.tabix_file(norm_gz_vcf)
    return norm_gz_vcf
Пример #25
0
def RunStdFilter(args):
    """
    Filter a MuTect VCF using the judgements of its companion text file.

    Rows of args.inputTxt judged "KEEP" are kept. Rejected rows are
    rescued when all of their failure reasons are in the accepted list,
    the depth / alt-count / variant-fraction thresholds (args.dp,
    args.ad, args.vf) are met, and either the tumor variant fraction
    exceeds args.tnr times the normal one or the position is a hotspot
    (checked against args.hotspotVcf when given). Kept and rescued rows
    are listed in '<txt basename>_STDfilter.txt'; the matching VCF
    records are written to '<vcf basename>_STDfilter.vcf' with FILTER set
    to PASS and the INFO field FAILURE_REASON added.

    :param args: parsed options; reads inputVcf, inputTxt, outdir,
        tsampleName, tnr, dp, ad, vf and hotspotVcf.
    :return: path of the filtered VCF.
    """
    vcf_out = os.path.basename(args.inputVcf)
    vcf_out = os.path.splitext(vcf_out)[0]
    txt_out = os.path.basename(args.inputTxt)
    txt_out = os.path.splitext(txt_out)[0]
    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out + "_STDfilter.vcf")
        txt_out = os.path.join(args.outdir, txt_out + "_STDfilter.txt")
    else:
        vcf_out = vcf_out + "_STDfilter.vcf"
        txt_out = txt_out + "_STDfilter.txt"
    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['FAILURE_REASON'] = VcfInfo(
        'FAILURE_REASON', '1', 'String',
        'Failure Reason from MuTect text File')
    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    txtDF = pd.read_table(args.inputTxt, skiprows=1, low_memory=False)
    txt_fh = open(txt_out, "wb")

    def write_row(chrom, pos, ref_allele, alt_allele, reason):
        # The handle is binary: encode text before writing so this works
        # on Python 3 (a str already being bytes is written unchanged).
        line = (args.tsampleName + "\t" + str(chrom) + "\t" + str(pos) +
                "\t" + str(ref_allele) + "\t" + str(alt_allele) + "\t" +
                str(reason) + "\n")
        if not isinstance(line, bytes):
            line = line.encode('utf-8')
        txt_fh.write(line)

    # Failure reasons that do not prevent a rejected call from being rescued
    accepted_tags = [
        "alt_allele_in_normal", "nearby_gap_events", "triallelic_site",
        "possible_contamination", "clustered_read_position"
    ]

    # Dictionary to store records to keep
    keepDict = {}

    try:
        for index, row in txtDF.iterrows():
            chrom = row.loc['contig']  # Get Chromosome
            pos = row.loc['position']  # Get Position
            ref_allele = row.loc['ref_allele']
            alt_allele = row.loc['alt_allele']
            trd = int(row.loc['t_ref_count'])
            tad = int(row.loc['t_alt_count'])
            tdp = trd + tad
            tvf = tad / float(tdp) if tdp != 0 else 0
            nrd = int(row.loc['n_ref_count'])
            nad = int(row.loc['n_alt_count'])
            ndp = nrd + nad
            nvf = nad / float(ndp) if ndp != 0 else 0
            judgement = row.loc['judgement']  # Get REJECT or PASS
            failure_reason = row.loc['failure_reasons']  # Get Reject Reason
            nvfRF = int(args.tnr) * nvf
            if args.hotspotVcf:
                hotspotFlag = checkHotspot(args.hotspotVcf, chrom, pos)
            else:
                hotspotFlag = False

            # This will help in filtering VCF
            key_for_tracking = str(chrom) + ":" + str(pos) + ":" + str(
                ref_allele) + ":" + str(alt_allele)

            if judgement == "KEEP":
                if key_for_tracking in keepDict:
                    print("MutectStdFilter:There is a repeat ",
                          key_for_tracking)
                else:
                    keepDict[key_for_tracking] = judgement
                    write_row(chrom, pos, ref_allele, alt_allele, judgement)
                continue

            # Rejected call: every failure reason must be an accepted one
            failure_tags = failure_reason.split(",")
            if not all(tag in accepted_tags for tag in failure_tags):
                continue

            # Rescue when the tumor fraction clears the scaled normal
            # fraction or the site is a hotspot, and the thresholds hold.
            if (tvf > nvfRF or hotspotFlag) and (tdp >= int(args.dp)
                                                 and tad >= int(args.ad)
                                                 and tvf >= float(args.vf)):
                if key_for_tracking in keepDict:
                    print("MutectStdFilter:There is a repeat ",
                          key_for_tracking)
                else:
                    keepDict[key_for_tracking] = failure_reason
                # As before, a rescued row is listed even when it is a
                # repeat of an earlier key.
                write_row(chrom, pos, ref_allele, alt_allele, failure_reason)
    finally:
        txt_fh.close()

    for record in vcf_reader:
        key_for_tracking = str(record.CHROM) + ":" + str(
            record.POS) + ":" + str(record.REF) + ":" + str(record.ALT[0])
        if key_for_tracking not in keepDict:
            continue
        failure_reason = keepDict.get(key_for_tracking)
        if failure_reason == "KEEP":
            failure_reason = "None"
        record.add_info('FAILURE_REASON', failure_reason)
        # Every kept or rescued record is emitted as PASS.
        if record.FILTER != "PASS":
            record.FILTER = "PASS"
        vcf_writer.write_record(record)
    vcf_writer.close()
    return (vcf_out)
Пример #26
0
###################################################################

#override the snpEff EFF fields in both header and info, split the fields and add them in INFO for easier conversion to table

###################################################################

import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

if __name__ == '__main__':

    vcf_reader = vcf.Reader(sys.stdin)

    vcf_reader.infos['EFF'] = VcfInfo('EFF', 1, 'String', 'Effect of mutation')
    vcf_reader.infos['Impact'] = VcfInfo('Impact', 1, 'String',
                                         'Likely impact of mutation')
    vcf_reader.infos['FunClass'] = VcfInfo('FunClass', 1, 'String', 'Class')
    vcf_reader.infos['CodonChange'] = VcfInfo('CodonChange', 1, 'String',
                                              'Nucleotide Change')
    vcf_reader.infos['AAChange'] = VcfInfo('AAChange', 1, 'String',
                                           'Protein Change')
    #vcf_reader.infos['AAChange.p'] = VcfInfo('AAChange.p', 1, 'String', 'Protein Change')
    #vcf_reader.infos['AAChange.c'] = VcfInfo('AAChange.c', 1, 'String', 'Protein Change')
    vcf_reader.infos['AALength'] = VcfInfo('AALength', 1, 'Integer',
                                           'Protein Length')
    vcf_reader.infos['Gene'] = VcfInfo('Gene', 1, 'String', 'Gene')
    vcf_reader.infos['BioType'] = VcfInfo('BioType', 1, 'String', 'BioType')
    vcf_reader.infos['Coding'] = VcfInfo('Coding', 1, 'String', 'Coding')
    vcf_reader.infos['Transcript'] = VcfInfo('Transcript', 1, 'String',