Example #1
 def get_svs(self):
     """Get svs objects from sample vcf."""
     sv_list = [SV(rec) for rec in VariantFile(self.vcf).fetch()]
     sv_name_dict = {sv.name: sv for sv in sv_list}
     return sv_name_dict
Example #2
def match_database(args):
    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files
    db = VariantFile(args.database)
    sample = VariantFile(args.sample)

    format_meta = []
    for fmt, meta in db.header.formats.items():
        if fmt not in sample.header.formats:
            format_meta.append(meta.name)
            sample.header.formats.add(meta.name + '_FOUND',
                                      number='.',
                                      type=meta.type,
                                      description='Allele(s) found: ' +
                                      meta.description)
            sample.header.formats.add(meta.name + '_NOTFOUND',
                                      number='.',
                                      type=meta.type,
                                      description='Allele(s) not found: ' +
                                      meta.description)
            sample.header.formats.add(
                meta.name + '_NOCALL',
                number='.',
                type=meta.type,
                description='Allele(s) with uncertain presence: ' +
                meta.description)

    info_meta = []
    for info, meta in db.header.info.items():
        if info not in sample.header.info:
            info_meta.append(meta.name)
            sample.header.info.add(meta.name + '_FOUND',
                                   number='.',
                                   type=meta.type,
                                   description='Allele(s) found: ' +
                                   meta.description)
            sample.header.info.add(meta.name + '_NOTFOUND',
                                   number='.',
                                   type=meta.type,
                                   description='Allele(s) not found: ' +
                                   meta.description)
            sample.header.info.add(
                meta.name + '_NOCALL',
                number='.',
                type=meta.type,
                description='Allele(s) with uncertain presence: ' +
                meta.description)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, [sample, db],
                                                      [args.name, None], args):
            # Create superloci by taking the union of overlapping loci across all of the locus streams
            loci = [
                sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key)
                for l in loci
            ]
            superloci = union(loci,
                              interval_func=attrgetter('min_start',
                                                       'max_stop'))

            # Proceed by superlocus
            for _, _, (superlocus, alleles) in superloci:
                alleles.sort(key=NormalizedLocus.natural_order_key)
                superlocus.sort(key=NormalizedLocus.natural_order_key)

                for allele in alleles:
                    super_allele = [
                        locus for locus in superlocus
                        if locus.extremes_intersect(allele)
                    ]

                    # Remove all reference calls from the superlocus.
                    # This is primarily done to remove long leading and trailing reference regions.
                    # Interstitial reference regions will be added back, based on how gaps are handled.
                    super_non_ref = [
                        locus for locus in super_allele if not locus.is_ref()
                    ]

                    if args.debug:
                        super_start, super_stop = get_superlocus_bounds(
                            [[allele], super_non_ref])
                        print('-' * 80, file=sys.stderr)
                        print('{}:[{:d}-{:d}):'.format(chrom, super_start,
                                                       super_stop),
                              file=sys.stderr)
                        print(file=sys.stderr)

                        print('  ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(
                            allele.record.id, allele.contig, allele.start,
                            allele.stop, allele.alleles[0] or '-',
                            allele.alleles[1] or '-'),
                              file=sys.stderr)
                        print(file=sys.stderr)

                        for i, locus in enumerate(super_non_ref, 1):
                            lref = locus.alleles[0] or '-'
                            indices = locus.allele_indices
                            if indices.count(None) == len(indices):
                                geno = 'nocall'
                            elif indices.count(0) == len(indices):
                                geno = 'refcall'
                            else:
                                sep = '|' if locus.phased else '/'
                                geno = sep.join(locus.alleles[a] or '-'
                                                if a is not None else '.'
                                                for a in indices)
                            print('  VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.
                                  format(i, locus.contig, locus.start,
                                         locus.stop, lref, geno),
                                  file=sys.stderr)

                    # Search superlocus for allele
                    match_zygosity = find_allele(ref,
                                                 allele,
                                                 super_non_ref,
                                                 debug=args.debug)

                    if args.debug:
                        print(file=sys.stderr)
                        print('    MATCH={}'.format(match_zygosity),
                              file=sys.stderr)
                        print(file=sys.stderr)

                    # Annotate results of search
                    if match_zygosity is None:
                        suffix = '_NOCALL'
                    elif match_zygosity == 0:
                        suffix = '_NOTFOUND'
                    else:
                        suffix = '_FOUND'

                    # Number of times to repeat the copied metadata
                    times = match_zygosity if suffix == '_FOUND' else 1

                    for locus in super_allele:
                        annotate_info(locus, allele, info_meta, suffix, times)
                        annotate_format(locus, allele, format_meta, suffix,
                                        times)

                for locus in sorted(superlocus,
                                    key=NormalizedLocus.record_order_key):
                    out.write(locus.record)
Example #3
def run_process(opts, inputvcf):
    db_file = opts.database
    outputvcf = opts.output
    minhomopolyx = int(opts.minhomopolyx)
    minrepeatcount = int(opts.minrepeatcount)
    maxvaf = float(opts.maxvaf)
    indelmaxdp = int(opts.indelmaxdp)
    indelmaxao = int(opts.indelmaxao)
    indelmaxvaf = float(opts.indelmaxvaf)
    snvmaxdp = int(opts.snvmaxdp)

    # Get Lowconf Database (obj1 : standard, obj2 : range)
    lowconfobj1, lowconfobj2 = lowconfdb2obj(db_file)

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.info, "LOW_CONFIDENCE"):
        vcf_in.header.info.add("LOW_CONFIDENCE", ".", "String",
                               "Low Confidence Type")

    # Add FILTER to Header
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters, "homopolymer"):
        vcf_in.header.filters.add("homopolymer", None, None,
                                  "Homopolymer Sequence Region")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "repeat_sequence"):
        vcf_in.header.filters.add("repeat_sequence", None, None,
                                  "Repeat Sequence Region")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "sequencing_error"):
        vcf_in.header.filters.add("sequencing_error", None, None,
                                  "Sequencing Error Low Confidence Region")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "mapping_error"):
        vcf_in.header.filters.add("mapping_error", None, None,
                                  "Mapping Error Low Confidence Region")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "snp_candidate"):
        vcf_in.header.filters.add("snp_candidate", None, None,
                                  "SNP Candidates")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "strand_biased"):
        vcf_in.header.filters.add("strand_biased", None, None,
                                  "Strand Biased (Freebayes)")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "lowcoverage_indel"):
        vcf_in.header.filters.add("lowcoverage_indel", None, None,
                                  "Low Coverage (DP,AO,VAF) Indels")
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                        "lowcoverage_snv"):
        vcf_in.header.filters.add("lowcoverage_snv", None, None,
                                  "Low Coverage (DP) SNVs")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        vaf = float(record.samples[0]["NGB_VAF"][0])
        ao = int(record.samples[0]["NGB_AO"][0])
        dp = int(record.samples[0]["NGB_DP"])
        vtype = record.info["TYPE"][0]
        reflen = len(record.ref)
        altlen = len(record.alts[0])
        """
        if "ngb_cv_rcv_sig_description" in record.info:
            tmpcv = record.info["ngb_cv_rcv_sig_description"][0]
            cv = tmpcv.split("|")
        else:
            cv = list()
        """

        seqerror_info_list = list()
        strandbiased_info_list = list()
        homopolymer_info_list = list()
        repeat_info_list = list()
        saf_format_list = list()
        sar_format_list = list()
        lowcov_indel_list = list()
        lowcov_snv_list = list()
        for i, alt in enumerate(alts):
            # Get Lowconf info
            lowconf = ""
            id1 = chrom + '-' + str(pos) + '-' + ref + '-' + alt
            if id1 in lowconfobj1:
                lowconf = lowconfobj1[id1]
            else:
                lowconf = ""

            # Get Lowconf Info from range database
            for lowconfdata in lowconfobj2:
                if chrom == lowconfdata["chrom"] and pos in range(
                        int(lowconfdata["start"]),
                        int(lowconfdata["end"]) + 1):
                    lowconf = lowconfdata["type"]
            seqerror_info_list.append(lowconf)

            # Get Strand Biased Information
            strandbiased = ""
            # (Freebayes)
            if "SAF" in record.info:
                if record.info["SAF"][i] == 0 or record.info["SAR"][
                        i] == 0 or record.info["RPR"][i] < 1 or record.info[
                            "RPL"][i] < 1:
                    strandbiased = "strand_biased"
                else:
                    strandbiased = ""
            """
            # Mutect
            elif "F1R2" in record.format:
                alt_f1r2 = record.samples[0]['F1R2'][i+1]
                alt_f2r1 = record.samples[0]['F2R1'][i+1]
                if alt_f1r2 == 0 or alt_f2r1 == 0:
                    strandbiased = "strand_biased"
                else:
                    strandbiased = ""
                saf_format_list.append(alt_f1r2)
                sar_format_list.append(alt_f2r1)
            """
            strandbiased_info_list.append(strandbiased)

            # Homopolymer & Repeat Sequence Filtering (VAF, CV)
            homopolymerinfo = ""
            repeatinfo = ""
            #if vaf < maxvaf and ("Pathogenic" not in cv) and ("Likely_pathogenic" not in cv):
            if vaf < maxvaf:
                # Get Homopolymer Info
                if "HOMOPOLYX" in record.info:
                    if int(record.info["HOMOPOLYX"][0]) >= minhomopolyx:
                        homopolymerinfo = "homopolymer"
                    else:
                        homopolymerinfo = ""
                # Get Repeat Info
                if "REPEAT_COUNT" in record.info:
                    if int(record.info["REPEAT_COUNT"][0]) >= minrepeatcount:
                        repeatinfo = "repeat_sequence"
                    else:
                        repeatinfo = ""
            homopolymer_info_list.append(homopolymerinfo)
            repeat_info_list.append(repeatinfo)

            # Indel Filtering
            lowcovindelinfo = ""
            if (altlen != reflen) and (vtype == "ins" or vtype == "del"
                                       or vtype == "complex"):
                if vaf < indelmaxvaf or ao < indelmaxao or dp < indelmaxdp:
                    lowcovindelinfo = "lowcoverage_indel"
                else:
                    lowcovindelinfo = ""
            else:
                lowcovindelinfo = ""
            lowcov_indel_list.append(lowcovindelinfo)

            # SNV Filtering
            lowcovsnvinfo = ""
            if (altlen == reflen) and (vtype == "snp" or vtype == "complex"):
                if dp < snvmaxdp:
                    lowcovsnvinfo = "lowcoverage_snv"
                else:
                    lowcovsnvinfo = ""
            else:
                lowcovsnvinfo = ""
            lowcov_snv_list.append(lowcovsnvinfo)

        lowconf_info_list = list()
        for i, itema in enumerate(seqerror_info_list):
            itemb = strandbiased_info_list[i]
            itemc = homopolymer_info_list[i]
            itemd = repeat_info_list[i]
            iteme = lowcov_indel_list[i]
            itemf = lowcov_snv_list[i]
            itemm = ""

            if itema != '':
                itemm += itema + "|"
            if itemb != '':
                itemm += itemb + "|"
            if itemc != '':
                itemm += itemc + "|"
            if itemd != '':
                itemm += itemd + "|"
            if iteme != '':
                itemm += iteme + "|"
            if itemf != '':
                itemm += itemf + "|"

            if itemm != '':
                itemn = itemm[0:-1]
            else:
                itemn = ''

            if itemn != '':
                lowconf_info_list.append(itemn)

        if lowconf_info_list != []:
            info_value = ','.join(str(e) for e in lowconf_info_list)
            record.info['LOW_CONFIDENCE'] = info_value

        # Add FILTER
        lowconf_infolist = list()
        if 'LOW_CONFIDENCE' in record.info:
            for lowconf_info in record.info['LOW_CONFIDENCE']:
                lowconf_infolist += lowconf_info.split("|")
        lowconf_infolist = list(set(lowconf_infolist))
        for lowconf_info in lowconf_infolist:
            record.filter.add(lowconf_info)

        # PASS FILTER
        if list(record.filter) == []:
            record.filter.add("PASS")

        # Remove Filter
        for rf in remove_filter_list:
            if rf in list(record.filter):
                record.filter.__delitem__(rf)

        # Write VCF
        vcf_out.write(record)
Example #4
 def fetch(self, chrm, pos_start, pos_end, return_samples=False):
     vcf_file = "%s.%s.vcf.gz" % (self.pop_vcf_stem, chrm)
     vcf_open = VariantFile(vcf_file, drop_samples=(not return_samples))
     return vcf_open.fetch(chrm, pos_start, pos_end)
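
A minimal stand-alone sketch of how the method above might be used; the stem path, chromosome name and coordinates are assumptions, and the per-chromosome VCF must be bgzipped and tabix-indexed for the region fetch to work.

from pysam import VariantFile

pop_vcf_stem = "/data/1000G/ALL"   # hypothetical stem; files are named ALL.chr22.vcf.gz, etc.
chrm = "chr22"
vcf_open = VariantFile("%s.%s.vcf.gz" % (pop_vcf_stem, chrm), drop_samples=True)
for rec in vcf_open.fetch(chrm, 16050000, 16060000):   # requires a .tbi/.csi index
    print(rec.chrom, rec.pos, rec.ref, rec.alts)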
Example #5
#!/group/ctan/anaconda3/envs/snakemake/bin/python

import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples= ["AC","BD","Commander","EC2.1","EC2.2","EC7.1","EC7.2","Fleet","Hindmarsh","La_Trobe","Scope","Vlamingh","W1","WI4304","X1","barke","bowman","haruna_Nijo","igri","spontaneum_B1k-04-12"]
smps = [samples[3],samples[4],samples[5],samples[6]]

ibcf = VariantFile(sys.argv[1])
#obcf = VariantFile(sys.argv[2],'w',header=ibcf.header)
ofile = open(sys.argv[2],"w")
hd = "\t".join(["#chr","pos","len","ref","ref_num","alt","alt_num")
ofile.write(hd)
for one in ibcf.fetch("chr3H"):
    record = samvcf(one)
    if record.flt and record.diff_repeat(smps):
        extra = []
        for s in smps:
            extra.append(str(sum(one.samples[s]['GT'])))            # genotype as allele-dosage sum
            extra.append(",".join(map(str, one.samples[s]['AD'])))  # allele depths
        opt = record.opt + extra
        ofile.write("\t".join(opt) + "\n")
Example #6
#!/usr/bin/env python3
from pysam import VariantFile
import sys

vcf_in = VariantFile(sys.argv[1], 'r')
vcf_out = VariantFile('-', 'w', header=vcf_in.header)
cp = (0, 0)
for rec in vcf_in.fetch():
    if (rec.chrom, rec.pos) != cp:
        vcf_out.write(rec)
    cp = (rec.chrom, rec.pos)
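
A hypothetical invocation for the deduplication script above (file names are assumptions); because only consecutive records are compared, the input should be coordinate-sorted.

# python dedup_by_position.py sorted_input.vcf.gz > deduplicated.vcf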
Example #7
async def import_data(file_id, filepath, core=None, reference_id=2):
    import ipdb

    import os
    import datetime
    import sqlalchemy
    import subprocess
    import multiprocessing as mp
    import reprlib
    import gzip
    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tools
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count the variants to parse in the vcf file
        """
        bashCommand = 'grep -v "^#" ' + str(filename) + ' | wc -l'
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        cmd_out = process.communicate()[0]
        return int(cmd_out.decode('utf8'))

    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF headers in pysam.
            EDIT : in fact the problem seems to be that pysam does not support some kinds of compression,
            so this command is still used to rezip the vcf in a supported format.
        """
        bashCommand = "grep -v '^##GVCFBlock' {} | gzip --best > /var/regovar/downloads/tmp_workaround".format(
            filename)
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        process.communicate()  # wait for the filtered/rezipped file before moving it
        bashCommand = "mv /var/regovar/downloads/tmp_workaround  {} ".format(
            filename)
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        process.communicate()  # wait for the move to finish before parsing the file

    def prepare_vcf_parsing(filename):
        """
            Parse vcf headers and return information about which data shall be parsed
            and stored in the database
        """
        # Extract headers
        debug_clear_header(filename)

        headers = {}
        samples = []
        _op = open
        if filename.endswith('gz') or filename.endswith('zip'):
            _op = gzip.open
        with _op(filename) as f:
            for line in f:
                if _op != open:
                    line = line.decode()
                if line.startswith('##'):
                    l = line[2:].strip()
                    l = [l[0:l.index('=')], l[l.index('=') + 1:]]
                    if l[0] not in headers.keys():
                        if l[0] == 'INFO':
                            headers[l[0]] = {}
                        else:
                            headers[l[0]] = []
                    if l[0] == 'INFO':
                        data = l[1][1:-1].split(',')
                        info_id = data[0][3:]
                        info_type = data[2][5:]
                        info_desc = data[3][13:-1]
                        headers['INFO'].update({
                            info_id: {
                                'type': info_type,
                                'description': info_desc
                            }
                        })
                    else:
                        headers[l[0]].append(l[1])
                elif line.startswith('#'):
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        # Check for VEP
        vep = {'vep': False}
        if 'VEP' in headers.keys() and 'CSQ' in headers['INFO'].keys():
            d = headers['INFO']['CSQ']['description'].split('Format:')
            vep = {
                'vep': {
                    'version': headers['VEP'][0].split(' ')[0],
                    'flag': 'CSQ',
                    'name': 'VEP',
                    'db_type': 'transcript',
                    'db_pk_field': 'Feature',
                    'description': d[0].strip(),
                    'columns': d[1].strip().split('|'),
                }
            }
            if 'Feature' not in vep['vep']['columns']:
                vep = {'vep': False}

        # Check for SnpEff
        snpeff = {'snpeff': False}
        if 'SnpEffVersion' in headers.keys():
            if 'ANN' in headers['INFO'].keys():
                # TODO
                pass
            elif 'EFF' in headers['INFO'].keys():
                d = headers['INFO']['EFF']['description'].split('\'')
                snpeff = {
                    'snpeff': {
                        'version':
                        headers['SnpEffVersion'][0].strip().strip('"').split(
                            ' ')[0],
                        'flag':
                        'EFF',
                        'name':
                        'SnpEff',
                        'db_type':
                        'transcript',
                        'db_pk_field':
                        'Transcript_ID',
                        'columns':
                        [c.strip() for c in d[1].strip().split('|')],
                        'description':
                        d[0].strip(),
                    }
                }
                if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                    snpeff = {'snpeff': False}

        # Retrieve extension
        file_type = os.path.split(filename)[1].split('.')[-1]
        if not 'vcf' in file_type:
            file_type += os.path.split(filename)[1].split('.')[-2] + "."

        # Return result
        result = {
            'vcf_version': headers['fileformat'][0],
            'name': os.path.split(filename)[1],
            'count': count_vcf_row(filename),
            'size': os.path.getsize(filename),
            'type': file_type,
            'samples': samples,
            'annotations': {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result

    def normalise_annotation_name(name):
        """
            Tool to convert the name of an annotation tool/db/field/version into the corresponding valid name for the database
        """
        if name[0].isdigit():
            name = '_' + name

        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''

        return ''.join(check_char(c) for c in name)
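        # Worked example (comment added for illustration):
        # normalise_annotation_name("1000G.AF") prefixes the leading digit with '_'
        # and maps '.' to '_', giving "_1000g_af".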

    def create_annotation_db(reference_id, reference_name, table_name,
                             vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = {
                'name': col_name,
                'type': 'string',
                'name_ui': col
            }  # By default, create a table with only text field. Type can be changed by user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(
            table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(
            table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(
                table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                table_name,
                normalise_annotation_name(
                    vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(  # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in the jointure as this condition is already done by a previous query when updating working table with annotations
            db_uid, reference_id, table_name,
            vcf_annotation_metadata['version'],
            vcf_annotation_metadata['name'],
            vcf_annotation_metadata['description'], 30,
            vcf_annotation_metadata['db_type'], pk_uid)

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(
                db_uid, idx, normalise_annotation_name(f), f)
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )
        return db_uid, db_map

    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
        """

        reference = Model.execute(
            "SELECT table_suffix FROM reference WHERE id={}".format(
                reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(
            vcf_annotation_metadata['flag'],
            vcf_annotation_metadata['version'], reference))

        # Get database schema (if available)
        table_cols = {}
        db_uid = Model.execute(
            "SELECT uid FROM annotation_database WHERE name='{}'".format(
                table_name)).first()

        if db_uid is None:
            # No table in db for these annotation : create new table
            db_uid, table_cols = create_annotation_db(reference_id, reference,
                                                      table_name,
                                                      vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute(
                    "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                    .format(db_uid)):
                table_cols[col.name] = {
                    'name': col.name,
                    'type': col.type,
                    'name_ui': col.name_ui
                }
        # Get diff between columns in vcf and columns in DB, and update DB schema
        diff = []
        for col in vcf_annotation_metadata['columns']:
            if normalise_annotation_name(col) not in table_cols.keys():
                diff.append(col)
        if len(diff) > 0:
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name = normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col)
                table_cols[name] = {
                    'name': name,
                    'type': 'string',
                    'name_ui': col
                }

            # execute query
            Model.execute(query)
        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute(
            "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".
            format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({
            'table': table_name,
            'db_uid': db_uid,
            'db_pk_field_uid': db_pk_field_uid
        })
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[
                normalise_annotation_name(col)]
        return vcf_annotation_metadata

    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format
        """
        chrm = chrm.upper()
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]

        if chrm == "X":
            chrm = 23
        elif chrm == "Y":
            chrm = 24
        elif chrm == "M":
            chrm = 25
        else:
            try:
                chrm = int(chrm)
            except Exception as error:
                # TODO log /report error
                chrm = None
        return chrm
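        # Worked example (comment added for illustration): "chr17" -> 17, "chrX" -> 23,
        # "chrM" -> 25; anything not mappable to an integer (e.g. "chrUn_gl000220") -> None.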

    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database format
             - Assuming that positions in VCF are 1-based (0-based in Database)
             - trimming ref and alt to get a minimal alt (and updating the position accordingly)
        """
        # input positions coming from VCF are 1-based;
        # to be consistent with UCSC databases we convert them into 0-based
        pos -= 1

        if (ref == alt):
            return None, None, None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
            pos += 1
        if len(ref) == len(alt):
            while ref[-1:] == alt[-1:]:
                ref = ref[0:-1]
                alt = alt[0:-1]
        return pos, ref, alt
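        # Worked example (comment added for illustration): the insertion
        # normalize(100, 'A', 'AT') trims the shared leading base and 0-bases the
        # position, returning (100, '', 'T'); the deletion normalize(100, 'AT', 'A')
        # returns (100, 'T', '').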

    def normalize_gt(infos):
        """
            Normalize GT sample information from VCF format into Database format
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL':
            if infos['GT'][0] == infos['GT'][1]:
                # Homozygous ref
                if infos['GT'][0] in [None, 0]:
                    return 0
                # Homozygous alt
                return '1'
            else:
                if 0 in infos['GT']:
                    # Hetero ref
                    return '2'
                else:
                    return '3'
            log("unknow : " + str(infos['GT']))
        return -1
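        # Worked example (comment added for illustration): GT 0/0 -> 0 (hom ref),
        # 1/1 -> '1' (hom alt), 0/1 -> '2' (het with ref), 1/2 -> '3' (het without ref),
        # missing GT -> -1.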

    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if ('|' in alt):
            return alt.split('|')
        else:
            return alt.split('/')

    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data
        """
        if (key in infos):
            if infos[key] is None: return 'NULL'
            return infos[key]
        return 'NULL'

    def is_transition(ref, alt):
        """
            Return true if the variant is a transition; false otherwise
        """
        tr = ref + alt
        if len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC'):
            return True
        return False
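        # Worked example (comment added for illustration): is_transition('A', 'G') and
        # is_transition('C', 'T') are True; is_transition('A', 'C') is False, as is any
        # variant whose ref is longer than one base.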

    def escape_value_for_sql(value):
        if type(value) is str:
            value = value.replace('%', '%%')
            value = value.replace("'", "''")

        return value

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Third-party code from vtools.  Bin index calculation
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    #
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by Jim Kent) to
    # take in a genomic coordinate range and return a set of genomic "bins" that your range
    # intersects.  I found a Java implementation on-line (I need to find the URL) and I
    # simply manually converted the Java code into Python code.

    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based and the end
    # coordinates are 1-based!!!!!!

    # BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681; #  (4096 + 512 + 64 + 8 + 1 + 0)

    _BINOFFSETS = (
        512 + 64 + 8 +
        1,  # = 585, min val for level 0 bins (128kb binsize)    
        64 + 8 + 1,  # =  73, min val for level 1 bins (1Mb binsize) 
        8 + 1,  # =   9, min val for level 2 bins (8Mb binsize)  
        1,  # =   1, min val for level 3 bins (64Mb binsize)  
        0)  # =   0, only val for level 4 bin (512Mb binsize)

    #    1:   0000 0000 0000 0001    1<<0
    #    8:   0000 0000 0000 1000    1<<3
    #   64:   0000 0000 0100 0000    1<<6
    #  512:   0000 0010 0000 0000    1<<9

    _BINFIRSTSHIFT = 17
    # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3
    # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)

    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is 1-based.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                for bin in range(startBin + offset, endBin + offset):
                    bins.append(bin)
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                if startBin + offset > bin:
                    bin = startBin + offset
            else:
                for i in range(startBin + offset, endBin + offset):
                    if i > bin:
                        bin = i
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bin
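    # Worked example (comment added for illustration): a 1 kb interval such as
    # [0, 1000) fits inside the first 128 kb bin at every level, so
    # getUcscBins(0, 1000) returns [585, 73, 9, 1, 0] and getMaxUcscBin(0, 1000)
    # returns 585.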

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Import
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def transaction_end(job_id, result):
        job_in_progress.remove(job_id)
        if isinstance(result, Exception) or result is None:
            core.notify_all({
                'msg': 'import_vcf_end',
                'data': {
                    'file_id': file_id,
                    'msg': 'Error occurred: ' + str(result)
                }
            })

    start_0 = datetime.datetime.now()
    job_in_progress = []

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix = "_" + Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(
                reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)

    if filepath.endswith(".vcf") or filepath.endswith(".vcf.gz"):
        start = datetime.datetime.now()

        # Create vcf parser
        vcf_reader = VariantFile(filepath)

        # get samples in the VCF
        samples = {
            i: Model.get_or_create(Model.session(), Model.Sample, name=i)[0]
            for i in list((vcf_reader.header.samples))
        }

        if len(samples.keys()) == 0:
            war("VCF files without sample cannot be imported in the database.")
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf_end',
                    'data': {
                        'file_id':
                        file_id,
                        'msg':
                        "VCF files without sample cannot be imported in the database."
                    }
                })
            return

        if core is not None:
            core.notify_all({
                'msg': 'import_vcf_start',
                'data': {
                    'file_id':
                    file_id,
                    'samples': [{
                        'id': samples[s].id,
                        'name': samples[s].name
                    } for s in samples.keys()]
                }
            })

        # Associate sample to the file
        Model.execute(
            "INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;"
            .format(','.join([
                "({0}, {1})".format(samples[sid].id, file_id)
                for sid in samples
            ])))

        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        table = "variant" + db_ref_suffix
        log("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}"
            .format(filepath, records_count, len(samples.keys()),
                    reprlib.repr([s for s in samples.keys()]), start))
        # bar = Bar('\tparsing  : ', max=records_count, suffix='%(percent).1f%% - %(elapsed_td)s')

        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO  NOTHING;"  # TODO : on conflict, shall update fields with value in the VCF to complete database annotation with (maybe) new fields
        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0
        for r in vcf_reader:
            records_current += 1
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf',
                    'data': {
                        'file_id':
                        file_id,
                        'progress_total':
                        records_count,
                        'progress_current':
                        records_current,
                        'progress_percent':
                        round(records_current / max(1, records_count) * 100, 2)
                    }
                })

            chrm = normalize_chr(str(r.chrom))
            samples_array = ','.join([str(samples[s].id) for s in r.samples])
            for sn in r.samples:
                s = r.samples.get(sn)
                if (len(s.alleles) > 0):
                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[0])
                    if pos is not None and alt != ref:
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(
                            table, chrm, pos, ref, alt,
                            is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(
                            samples[sn].id, bin, chrm, pos, ref, alt,
                            normalize_gt(s), get_info(s, 'DP'))
                        count += 1

                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[1])
                    if pos is not None and alt != ref:
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(
                            table, chrm, pos, ref, alt,
                            is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(
                            samples[sn].id, bin, chrm, pos, ref, alt,
                            normalize_gt(s), get_info(s, 'DP'))
                        count += 1

                    # Import custom annotation for the variant
                    for ann_name, metadata in vcf_metadata[
                            'annotations'].items():
                        if metadata:
                            # By transcript (r.info is a list of annotations; inside we find the transcript and allele information needed to save data for the current variant)
                            for info in r.info[metadata['flag']]:
                                data = info.split('|')
                                q_fields = []
                                q_values = []
                                allele = ""
                                trx_pk = "NULL"
                                for col_pos, col_name in enumerate(
                                        metadata['columns']):
                                    q_fields.append(
                                        metadata['db_map'][col_name]['name'])
                                    val = escape_value_for_sql(data[col_pos])

                                    if col_name == 'Allele':
                                        allele = val.strip().strip("-")
                                    if col_name == metadata['db_pk_field']:
                                        trx_pk = val.strip()

                                    q_values.append(
                                        '\'{}\''.format(val) if val != ''
                                        and val is not None else 'NULL')

                                pos, ref, alt = normalize(
                                    r.pos, r.ref, s.alleles[0])
                                # print(pos, ref, alt, allele)
                                if pos is not None and alt == allele:
                                    # print("ok")
                                    sql_query3 += sql_pattern3.format(
                                        metadata['table'], ','.join(q_fields),
                                        ','.join(q_values), bin, chrm, pos,
                                        ref, alt, trx_pk)
                                    count += 1
                                pos, ref, alt = normalize(
                                    r.pos, r.ref, s.alleles[1])
                                # print(pos, ref, alt, allele)
                                if pos is not None and alt == allele:
                                    # print("ok")
                                    sql_query3 += sql_pattern3.format(
                                        metadata['table'], ','.join(q_fields),
                                        ','.join(q_values), bin, chrm, pos,
                                        ref, alt, trx_pk)
                                    count += 1

                    # split big requests into chunks to avoid SQL out-of-memory transactions
                    if count >= 10000:
                        count = 0
                        # Model.execute_async(transaction1 + transaction2 + transaction3, transaction_end)
                        transaction = sql_query1 + sql_query2 + sql_query3
                        log("VCF import : Execute async query (as coroutine)")
                        await Model.execute_aio(transaction)
                        # job_id = Model.execute_bw(transaction, transaction_end)
                        # job_in_progress.append(job_id)
                        # log("VCF import : Execute async query, new job_id : {}. Jobs running [{}]".format(job_id, ','.join([job_in_progress])))
                        # Reset query buffers
                        sql_query1 = ""
                        sql_query2 = ""
                        sql_query3 = ""

        # Loop done, execute last pending query
        log("VCF import : Execute last async query (as coroutine)")
        transaction = sql_query1 + sql_query2 + sql_query3
        await Model.execute_aio(transaction)
        log("VCF import : Done")

    end = datetime.datetime.now()
    if core is not None:
        core.notify_all({
            'msg': 'import_vcf_end',
            'data': {
                'file_id':
                file_id,
                'msg':
                'Import done without error.',
                'samples': [{
                    'id': samples[s].id,
                    'name': samples[s].name
                } for s in samples.keys()]
            }
        })
Example #8
def _mergeAndAddGT(snvvcf, indvcf, outfile):
	from pysam import VariantFile
	snv = VariantFile(snvvcf)
	ind = VariantFile(indvcf)
	
	snv.header.info.add('TYPE', 1, 'String', 'Type of somatic mutation')
	ind.header.info.add('TYPE', 1, 'String', 'Type of somatic mutation')
	snv.header.info.add('QSI', 1, 'Integer', 'Quality score for any somatic variant, ie. for the ALT haplotype to be present at a significantly different frequency in the tumor and normal')
	snv.header.info.add('TQSI', 1, 'Integer', 'Data tier used to compute QSI')
	snv.header.info.add('QSI_NT', 1, 'Integer', 'Quality score reflecting the joint probability of a somatic variant and NT')
	snv.header.info.add('TQSI_NT', 1, 'Integer', 'Data tier used to compute QSI_NT')
	snv.header.info.add('IC', 1, 'Integer', 'Number of times RU repeats in the indel allele')
	snv.header.info.add('IHP', 1, 'Integer', 'Largest reference interrupted homopolymer length intersecting with the indel')
	snv.header.info.add('OVERLAP', 0, 'Flag', 'Somatic indel possibly overlaps a second indel.')
	snv.header.info.add('RC', 1, 'Integer', 'Number of times RU repeats in the reference allele')
	snv.header.info.add('RU', 1, 'String', 'Smallest repeating sequence unit in inserted or deleted sequence')
	snv.header.formats.add('GT', 1, 'String', 'Possible genotype')
	ind.header.formats.add('GT', 1, 'String', 'Possible genotype')
	snv.header.formats.add('BCN50', 1, 'Float', 'Fraction of filtered reads within 50 bases of the indel.')
	snv.header.formats.add('DP2', 1, 'Integer', 'Read depth for tier2')
	snv.header.formats.add('DP50', 1, 'Float', 'Average tier1 read depth within 50 bases')
	snv.header.formats.add('FDP50', 1, 'Float', 'Average tier1 number of basecalls filtered from original read depth within 50 bases')
	snv.header.formats.add('SUBDP50', 1, 'Float', 'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases')
	snv.header.formats.add('TAR', 2, 'Integer', 'Reads strongly supporting alternate allele for tiers 1,2')
	snv.header.formats.add('TIR', 2, 'Integer', 'Reads strongly supporting indel allele for tiers 1,2')
	snv.header.formats.add('TOR', 2, 'Integer', 'Other reads (weak support or insufficient indel breakpoint overlap) for tiers 1,2')

	contigs = list(snv.header.contigs.keys())
	out = open(outfile, 'w')
	#Can't change sample names with VariantFile
	#out = VariantFile(outfile, 'w', header = snv.header)
	headers = str(snv.header).splitlines()
	cnames  = headers[-1].split("\t")

	cnames [-2] = nprefix
	cnames [-1] = tprefix
	headers[-1] = "\t".join(cnames)
	out.write("\n".join(headers) + "\n")
	
	r1 = r2 = None
	indel_gts = {
		'ref': (0, 0),
		'het': (0, 1),
		'hom': (1, 1)
	}
	while True:
		if not r1:
			try:
				r1 = next(snv)
				r1.info['TYPE'] = 'SNV'
				alleles = (r1.ref, ) + r1.alts
				gts = r1.info['SGT'].split('->')
				try:
					r1.samples['NORMAL']['GT'] = tuple(sorted(alleles.index(gt) for gt in list(gts[0])))
					r1.samples['TUMOR']['GT'] = tuple(sorted(alleles.index(gt) for gt in list(gts[1])))
				except ValueError:
					r1 = None
					continue
			except StopIteration:
				r1 = None
		if not r2:
			try:
				r2 = next(ind)
				r2.info['TYPE'] = 'INDEL'
				gts = r2.info['SGT'].split('->')
				r2.samples['NORMAL']['GT'] = indel_gts[gts[0]]
				r2.samples['TUMOR']['GT']  = indel_gts[gts[1]]
			except StopIteration:
				r2 = None
		
		if r1 and r2:
			if (contigs.index(r1.chrom), r1.pos) < (contigs.index(r2.chrom), r2.pos):
				out.write(str(r1))
				r1 = None
			else:
				out.write(str(r2))
				r2 = None
		elif r1:
			out.write(str(r1))
			r1 = None
		elif r2:
			out.write(str(r2))
			r2 = None
		else:
			break
	out.close()
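
A hypothetical call for the helper above; it expects Strelka-style somatic SNV and indel VCFs (SGT, QSI, TIR fields), and nprefix/tprefix are module-level sample names used to relabel the NORMAL/TUMOR columns. All names below are assumptions.

nprefix, tprefix = "patient1_normal", "patient1_tumor"
_mergeAndAddGT("somatic.snvs.vcf.gz", "somatic.indels.vcf.gz", "somatic.merged.vcf")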
Example #9
        methylated = mc8[6]  # Number Gs
        unmethylated = mc8[4]  # Number As
    return (methylated, unmethylated)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="Takes a list of input files? Or Idrectory...TBD")
    parser.add_argument("--input_file", default="./101.bcf")
    parser.add_argument("--output_dir", default="./extract_output/")
    parser.add_argument("--merge_strands", action="store_true")

    args = parser.parse_args()

    infile = VariantFile("101.bcf", threads=4)
    csv_out_name = args.input_file.replace('.bcf', '.csv')
    ofile = open(csv_out_name, "w")

    # Column names for output
    writer = csv.writer(ofile)
    writer.writerow([
        "chr", "pos", "reference", "call", "methylated", "unmethylated",
        "strand"
    ])

    # The things in rec.format
    # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX
    # 480 minutes per one bcf--unacceptable!!!
    # 7 minutes for chrom 22--using 4 threads
    # 7 minutes for chrom 22--using 8 threads
Example #10
#!/bin/python3.6
import sys
from pysam import VariantFile

vcf_in = VariantFile(sys.argv[1])  # doesn't matter if bgzipped or not; the format is detected automatically

new_header = vcf_in.header
# import pdb; pdb.set_trace()
new_header.info.add("DP", "1", "Integer", "Sum of AD fields")
new_header.info.add("AF", "1", "Float", "Alt AD / sum(AD)")
# start new vcf with the new_header
vcf_out = VariantFile(sys.argv[2], 'w', header=new_header)

for record in vcf_in.fetch():
    dp = sum(record.samples[0].get("AD"))
    record.info["DP"] = dp
    af = record.samples[0].get("AD")[1]/dp
    record.info["AF"] = af
    vcf_out.write(record)
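
A defensive variant of the loop above, sketched under the assumption that some records may carry an AD that sums to zero; the script as written would raise ZeroDivisionError on such a record.

for record in vcf_in.fetch():
    ad = record.samples[0].get("AD")
    dp = sum(ad)
    record.info["DP"] = dp
    record.info["AF"] = ad[1] / dp if dp else 0.0   # avoid division by zero
    vcf_out.write(record)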
Example #11
def run_process(opts, inputvcf):
    outputvcf = opts.output

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    vcf_in.header.info.add(
        "TYPE", "A", "String",
        "The type of allele, either snp, ins, del, or complex.")

    # Add FORMAT to Header
    vcf_in.header.formats.add(
        "NGB_DP", "1", "Integer",
        "Approximate read depth; some reads may have been filtered")
    vcf_in.header.formats.add("NGB_AO", "A", "Integer",
                              "Alternate allele observation count")
    vcf_in.header.formats.add("NGB_RO", "1", "Integer",
                              "Reference allele observation count")
    vcf_in.header.formats.add(
        "NGB_VAF", "A", "Float",
        "Allele fractions of alternate alleles in the tumor")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        variant_type_list = list()
        ngb_dp_list = list()
        ngb_ao_list = list()
        ngb_ro_list = list()
        ngb_vaf_list = list()
        tmp_dp = sum(record.samples[0]['AD'])
        tmp_ro = record.samples[0]['AD'][0]
        for n, alt in enumerate(alts):
            # Get Variant TYPE (freebayes format)
            ret = ngb_functions.pairdiff(ref, alt)
            vartype = ret['variant_type']
            variant_type_list.append(vartype)

            # Get DP,AO,RO,VAF
            tmp_vaf = float(record.samples[0]['AD'][(n + 1)]) / float(tmp_dp)
            tmp_ao = int(record.samples[0]['AD'][(n + 1)])
            ngb_dp_list.append(tmp_dp)
            ngb_ao_list.append(tmp_ao)
            ngb_vaf_list.append(tmp_vaf)

        if variant_type_list != []:
            #info_value = ','.join(str(e) for e in variant_type_list)
            record.info['TYPE'] = variant_type_list
        if ngb_dp_list != []:
            record.samples[0]["NGB_DP"] = ngb_dp_list[0]
            record.samples[0]["NGB_AO"] = tuple(ngb_ao_list)
            record.samples[0]["NGB_RO"] = tmp_ro
            record.samples[0]["NGB_VAF"] = tuple(ngb_vaf_list)

        # Write VCF
        vcf_out.write(record)
Example #12
 def test_read_variant_vcf(self):
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCF)
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(abs((k - np.zeros(50)).max()), 0.0)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      sorted(['sample_%d' % x
                              for x in range(1, 51)]))
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)
     # not providing samples
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      set(), [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     # providing burden
     burden_regions = deque([])
     load_burden(B, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)
     # providing burden
     burden_regions = deque([])
     load_burden(BM, burden_regions)
     # last one has multiple regions
     burden_regions.reverse()
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS3')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)
     # uncompressed option - no effect
     infile = VariantFile(VCF)
     t = read_variant(infile, p.head(5), 'vcf',
                      False, [], True,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_2', 'sample_3',
                       'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)
     # different type
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'kmers',
                          False, [], True,
                          p.head(5).index, [])
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'Rtab',
                          False, [], False,
                          p.head(5).index, [])
     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'vcf',
                          False, [], False,
                          p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertEqual(k, None)
     self.assertEqual(var_name, None)
     self.assertEqual(kstrains, None)
     self.assertEqual(nkstrains, None)
     self.assertEqual(af, None)
     self.assertEqual(missing, None)
     # different file
     infile = gzip.open(KMER)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])
     infile = open(PRES)
     with self.assertRaises(AttributeError):
         t = read_variant(infile, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])
     # burden with missing genotypes in last read variant
     # issue #90
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     burden_regions = deque([])
     load_burden(BMISSING, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(abs((k -
                      np.array([1, 1, 0, 0, 0])).max()) < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      ['sample_1', 'sample_2'])
     self.assertEqual(nkstrains,
                      ['sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.4)
     self.assertEqual(missing, 0)
     # check that missing variants are properly missed
     # issue #120
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     variant = next(infile)
     total = 0
     missing = 0
     samples = set()
     for sample, call in variant.samples.items():
         if sample not in p.index:
             continue
         for haplotype in call.get('GT', [None]):
             if haplotype is None or haplotype == '.':
                 missing += 1
             total += 1
             samples.add(sample)
     pysam_missing = missing / float(total)
     infile = VariantFile(VCFMISSING)
     t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(pysam_missing, missing)
Example #13
0
    def main(self, args):
        command.Command.main(self, args)
        self.validate(args)
        for i in [1, 2]:
            attr = "pop%d" % i
            pid, ary = getattr(args, attr)
            if len(ary) == 1 and ary[0][0] == "@":
                setattr(args, attr, SampleList(
                    pid, open(ary[0][1:], "rt").read().strip().split("\n")))
        pop_d = dict([args.pop1, args.pop2])
        for pid in pop_d:
            if pop_d[pid]:
                c = Counter(pop_d[pid])
                if max(c.values()) > 1:
                    raise RuntimeError(
                        "Population %s has duplicated samples: %s" %
                        (pid, [item for item in c.items() if item[1] > 1]))
        dist = [[], []]
        if not args.d:
            first_sid = args.pop1.samples[0]
            args.d = [first_sid] * 2
        args.d = [args.d[0] + ":0", args.d[1] + ":1"]
        all_samples = set(args.pop1.samples) | set(args.pop2.samples)
        for sid_i in args.d:
            sid, i = sid_i.split(":")
            i = int(i)
            if sid not in all_samples:
                raise RuntimeError("%s is not in the sample list" % sid)
            if sid in args.pop1.samples:
                d = dist[0]
            else:
                assert sid in args.pop2.samples
                d = dist[1]
            d.append((sid, i))
        undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
                  for p, d in zip((args.pop1, args.pop2), dist)]
        npop = 1

        def print_pop(i):
            logger.info("Population %d:" % i)
            logger.info("Distinguished lineages: " +
                        ", ".join("%s:%d" % t for t in dist[i - 1]))
            logger.info("Undistinguished lineages: " +
                        ", ".join("%s:%d" % t for t in undist[i - 1]))
        print_pop(1)
        if args.pop2.pid is not None:
            npop = 2
            common = set(args.pop1.samples) & set(args.pop2.samples)
            if common:
                logger.error("Populations 1 and 2 should be disjoint, "
                             "but both contain " + ", ".join(common))
                sys.exit(1)
            print_pop(2)

        # Start parsing
        vcf = VariantFile(args.vcf)
        with optional_gzip(args.out, "wt") as out:
            samples = list(vcf.header.samples)
            dist = dist[:npop]
            undist = undist[:npop]
            if not set([dd[0] for d in dist for dd in d]) <= set(samples):
                raise RuntimeError("Distinguished lineages not found in data?")
            missing = [s for u in undist for s, _ in u if s not in samples]
            if missing:
                msg = "The following samples were not found in the data: %s. " % ", ".join(
                    missing)
                if args.ignore_missing:
                    logger.warn(msg)
                else:
                    msg += "If you want to continue without these samples, use --ignore-missing."
                    raise RuntimeError(msg)
            undist = [[t for t in u if t[0] not in missing] for u in undist]

            # Write header
            pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
            out.write("# SMC++ ")
            json.dump({"version": version, "pids": pids,
                       "undist": undist, "dist": dist}, out)
            out.write("\n")
            na = list(map(len, dist))
            nb = list(map(len, undist))

            # function to convert a VCF record to our format:
            # <span, dist gt, # undist gt, # undist, [...]>
            def rec2gt(rec):
                ref = rec.alleles[0]
                da = [[rec.samples[d].alleles[i]
                       for d, i in di] for di in dist]
                a = [sum([x != ref for x in d])
                     if None not in d else -1 for d in da]
                bs = [[rec.samples[d].alleles[i] != ref
                       for d, i in un
                       if rec.samples[d].alleles[i] is not None]
                      for un in undist]
                b = [sum(_) for _ in bs]
                nb = [len(_) for _ in bs]
                # Fold non-polymorphic (in subsample) sites
                if np.array_equal(b, nb) and np.array_equal(a, na):
                    a = [0] * len(a)
                    b = [0] * len(b)
                return list(sum(zip(a, b, nb), tuple()))

            try:
                region_iterator = vcf.fetch(contig=args.contig)
            except ValueError as e:
                logger.error("VCF reader threw an error: %s", e)
                logger.error("Make sure the VCF is indexed:")
                logger.error("")
                logger.error("    $ tabix %s", args.vcf)
                logger.error("")
                sys.exit(1)

            contig_length = args.length or vcf.header.contigs[args.contig].length
            if contig_length is None:
                logger.error("Failed to acquire contig length from VCF header. See the --length option.")
                sys.exit(1)
            if args.mask:
                mask_iterator = TabixFile(
                    args.mask).fetch(reference=args.contig)
                args.missing_cutoff = np.inf
            else:
                mask_iterator = iter([])
                if args.missing_cutoff is None:
                    args.missing_cutoff = np.inf
            mask_iterator = (x.split("\t") for x in mask_iterator)
            mask_iterator = ((x[0], int(x[1]), int(x[2]))
                             for x in mask_iterator)
            snps_only = (
                rec for rec in region_iterator if
                len(rec.alleles) <= 2 and
                all(len(a) == 1 for a in rec.alleles)
                )

            def interleaved():
                cmask = next(mask_iterator, None)
                csnp = next(snps_only, None)
                while cmask or csnp:
                    if cmask is None:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp is None:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        if csnp.pos < cmask[1]:
                            yield "snp", csnp
                            csnp = next(snps_only, None)
                        elif csnp.pos <= cmask[2]:
                            while csnp is not None and csnp.pos <= cmask[2]:
                                csnp = next(snps_only, None)
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)
                        else:
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)

            abnb_miss = [-1, 0, 0] * len(nb)
            abnb_nonseg = sum([[0, 0, x] for x in nb], [])
            multiples = set()
            with RepeatingWriter(out) as rw, \
                    tqdm.tqdm(total=contig_length, unit='bases', unit_scale=True) as bar:
                def write(x):
                    if not write.first or not args.drop_first_last:
                        rw.write(x)
                    write.first = False
                write.first = True
                last_pos = 0
                for ty, rec in interleaved():
                    if ty == "mask":
                        span = rec[1] - last_pos
                        write([span] + abnb_nonseg)
                        write([rec[2] - rec[1] + 1] + abnb_miss)
                        last_pos = rec[2]
                        continue
                    bar.update(rec.pos - last_pos)
                    abnb = rec2gt(rec)
                    if rec.pos == last_pos:
                        multiples.add(rec.pos)
                        continue
                    span = rec.pos - last_pos - 1
                    if 1 <= span <= args.missing_cutoff:
                        write([span] + abnb_nonseg)
                    elif span > args.missing_cutoff:
                        write([span] + abnb_miss)
                    write([1] + abnb)
                    last_pos = rec.pos
                if not args.drop_first_last:
                    write([contig_length - last_pos] + abnb_nonseg)
            if multiples:
                # FIXME: what to do with multiple records at same site
                logger.warn(
                    "Multiple entries found at %d positions; skipped all but the first", len(multiples))
Example #14
0
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pysam import VariantFile

quals = [record.qual for record in VariantFile(everclear.input[0])]
plt.hist(quals)

plt.savefig(everclear.output[0])
from labels import SVRecord_generic
from collections import defaultdict
import brewer2mpl
import matplotlib.patches as patches

windowsizes = []
windowsizes_by_caller = {}
windowsizes_by_SVType = defaultdict(list)
SVCount_bytype = defaultdict(int)

callers = []
lost_SVs = 0
total_SVs = 0

for vcf_file in os.listdir('../MinorResearchInternship/VCF'):
    vcf_in = VariantFile('../MinorResearchInternship/VCF/' + vcf_file, 'r')
    caller = re.findall(r'^\w*', vcf_file)
    callers += [caller[0]]
    windowsizes_by_caller[caller[0]] = {
        "CI_sizes": {
            "Start": {
                "DEL": [],
                "INS": [],
                "BND": [],
                "INV": [],
                "DUP": []
            },
            "End": {
                "DEL": [],
                "INS": [],
                "BND": [],
Example #16
0
def Main():
    parser = argparse.ArgumentParser(
        description="loading vcf and interaction files")
    parser.add_argument("interactionfile",
                        help="Interaction calls from HiCap method")
    parser.add_argument(
        "vcfile", help="Variant calls from either HiCap or sequencing samples")
    parser.add_argument("-o",
                        "--output",
                        help="output of interaction files",
                        action='store',
                        default=None)
    args = parser.parse_args()
    Vcfin = VariantFile(args.vcfile)

    result_title = [
        "RefSeqName", "TranscriptName", "Feature_ID", "Feature_Chr",
        "Feature_Start", "Feature_End", "Annotation", "Strand",
        "Interactor_Chr", "Interactor_Start", "Interactor_End", "Distance",
        "SNPs", "SNP_ID", "Ind_count", "Swed_Freq", "TAV2431", "TAV2515",
        "TAV2709", "BAV2375", "BAV2424", "BAV2714"
    ]

    with open(args.output, "w") as output_file:
        output_file.write("\t".join(result_title) + "\n")

    with open(args.interactionfile, 'r') as f:
        next(f)

        for line in f:
            line = line.strip().split("\t")
            all_fields = line[0], line[1], line[2], line[3], line[4], line[
                5], line[6], line[7], line[8], line[9], line[10], line[11]
            chr = ((line[8])[3:], line[9], line[10])

            TAV2431 = [line[12], line[13]]
            TAV2515 = [line[15], line[16]]
            TAV2709 = [line[18], line[19]]
            BAV2375 = [line[21], line[22]]
            BAV2424 = [line[24], line[25]]
            BAV2714 = [line[27], line[28]]

            interaction_sample = [
                TAV2431, TAV2515, TAV2709, BAV2375, BAV2424, BAV2714
            ]
            interaction_binary = int2binary(interaction_sample)

            sample_list = [3, 4, 5, 0, 1, 2]
            for rec in Vcfin.fetch(chr[0], int(chr[1]), int(chr[2])):
                genotype_binary = []
                for test in rec.samples.values():

                    genotype = "/".join([str(x) for x in test["GT"]])
                    if genotype == "None/None":
                        continue
                    elif genotype == "0/1" or genotype == "1/1":
                        genotype_binary.append("1")
                    elif genotype == "0/0":
                        genotype_binary.append("0")

                    swed_freq = "0"
                    for f, v in rec.info.items():
                        if pattern.match(f):
                            swed_freq = v

                    if rec.id is None:
                        rec.id = "X"

                sorted_genotype = [
                    x for _, x in sorted(zip(sample_list, genotype_binary))
                ]
                zip_array = list(zip(interaction_binary, sorted_genotype))

                count = 0
                for a, b in zip_array:
                    if a == b:
                        count = count + 1

                if count == 6:
                    allele = "|".join(rec.alleles)
                    count_int_allele = 0
                    for a, b in zip_array:
                        if (a, b) == ('1', '1'):
                            count_int_allele = count_int_allele + 1

                    changed_freq = "".join(str(x) for x in swed_freq)
                    unzip_array = ["|".join(x) for x in zip_array]
                    snp = (line[8], rec.start, rec.stop, allele,
                           rec.filter.keys()[0])
                    str_snp = "_".join(str(x) for x in snp)

                    result = "\t".join(
                        all_fields
                    ), str_snp, rec.id, count_int_allele, changed_freq, "\t".join(
                        unzip_array)
                    combined_result = "\t".join(str(x) for x in result)

                    with open(args.output, "a") as output_file:
                        output_file.write(combined_result + "\n")
Example #17
0
def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates)."""
    refs = Fastafile(expanduser(args.reference))
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)

    match_status_map = {True: '=', False: 'X', None: '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars,
                                                  [args.name1, args.name2],
                                                  args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [
            sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key)
            for l in loci
        ]
        superloci = union(loci,
                          interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-' * 80)
            print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
            print()

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.ref or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(
                        locus.alleles[a] or '-' if a is not None else '.'
                        for a in indices)
                    print(
                        f'  NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}'
                    )
            print()

            match, match_type = superlocus_equal(ref,
                                                 super_start,
                                                 super_stop,
                                                 super1,
                                                 super2,
                                                 debug=args.debug)
            match_status = match_status_map[match]

            print(f'    MATCH={match_status} TYPE={match_type}')
            print()

            write_match(out_vars[0], super1, args.name1, match_status,
                        match_type)
            write_match(out_vars[1], super2, args.name2, match_status,
                        match_type)

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print(f'  VCF{i:d}: {locus.record}', end='')
            print()

    for out_var in out_vars:
        if out_var is not None:
            out_var.close()
Example #18
0
import pysam
import sys

from pysam import VariantFile

bcf_in = VariantFile('-')  # auto-detect input format
'''
print('\naaa\n')
print(dir(bcf_in.header))
for k, v in bcf_in.header.formats.items():
    print('{}\t{}'.format(k, v))
    print('\t{}'.format(dir(v)))
    print('\t{}'.format(v.name))
    print('\t{}'.format(v.number))
    print('\t{}'.format(v.type))
    print('\t{}'.format(v.record))
    print('\t{}'.format(v.id))
    print('\t{}'.format(v.description)) 
print('\nbbb\n')
print(bcf_in.header.formats)
'''
bcf_in.header.add_line(
    '##FORMAT=<ID=NonHomrefQ,Number=1,Type=Integer,Description=\"Likelihood of the homozygous-reference genotype\">'
)
bcf_out = VariantFile('-', 'w', header=bcf_in.header)

sample = bcf_in.header.samples[0]

for rec in bcf_in:
    assert rec.samples[sample]['GL4'][0] is not None, \
        'The record {} is invalid!'.format(rec)
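The loop above is truncated, and the way NonHomrefQ is actually derived from GL4 is not shown. Purely as a hedged, self-contained sketch of the mechanics (assign a new per-sample FORMAT value and stream records from stdin to stdout), it might continue along these lines; the -10 * GL scaling is an illustrative stand-in, not the original computation.
from pysam import VariantFile

# Hedged sketch only: reproduces the stdin-to-stdout streaming setup above
# and fills in a placeholder NonHomrefQ value for the first sample.
bcf_in = VariantFile('-')
bcf_in.header.add_line(
    '##FORMAT=<ID=NonHomrefQ,Number=1,Type=Integer,Description='
    '"Likelihood of the homozygous-reference genotype">')
bcf_out = VariantFile('-', 'w', header=bcf_in.header)
sample = bcf_in.header.samples[0]

for rec in bcf_in:
    gl0 = rec.samples[sample]['GL4'][0]  # assumed present, as asserted above
    rec.samples[sample]['NonHomrefQ'] = int(round(-10 * gl0))  # illustrative
    bcf_out.write(rec)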
Example #19
0
 def setup(cls, source):
     curr = cls(source)
     curr.f = VariantFile(curr.source)
     return curr
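The factory above is only a few lines; a hedged usage sketch (the wrapper class name and the file path are hypothetical) shows how such a setup() classmethod is typically consumed.
from pysam import VariantFile

# Hypothetical wrapper: the class stores the source path and setup()
# attaches an open VariantFile to the new instance.
class VcfSource:
    def __init__(self, source):
        self.source = source
        self.f = None

    @classmethod
    def setup(cls, source):
        curr = cls(source)
        curr.f = VariantFile(curr.source)
        return curr

reader = VcfSource.setup("calls.vcf.gz")  # hypothetical path
for rec in reader.f:
    print(rec.chrom, rec.pos)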
Example #20
0
def add_freqs():
    vcf_path = sys.argv[2]
    fai_path = sys.argv[3]
    min_samples = int(sys.argv[4]) if len(sys.argv) == 5 else 0

    vcf = VariantFile(vcf_path, 'r', drop_samples=False)

    ref_name, ref_len = open(fai_path).readlines()[0].strip('\n').split(
        '\t')[0:2]
    new_contig = f"##contig=<ID={ref_name},length={ref_len}>"

    vcf.header.add_line(new_contig)
    vcf.header.add_line(
        "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1)\">"
    )
    print('\n'.join(str(vcf.header).split('\n')[:-1]))

    removed_variants = 0
    removed_some_alt = 0
    tot_removed_alleles = 0
    tot_removed_genotypes = 0
    next_milestone = 1000
    for record in vcf:
        if record.pos <= 50 or record.pos >= int(ref_len) - 50:
            continue
        if record.pos > next_milestone:
            print("Reached position", record.pos, file=sys.stderr)
            next_milestone += 1000

        n_gts = defaultdict(int)
        low_samples = defaultdict(list)
        for sample in record.samples.values():
            curr_gt = sample.allele_indices[0]
            n_gts[curr_gt] += 1
            if curr_gt != 0 and n_gts[curr_gt] < min_samples:
                low_samples[curr_gt].append(sample)

        to_delete = set([gt for gt in n_gts if n_gts[gt] < min_samples])
        if len(to_delete) > 0:
            if len(n_gts) - len(to_delete) == 1:
                removed_variants += 1
                continue
            tot_removed_alleles += len(to_delete)
            removed_some_alt += 1

            for gt in to_delete:
                for sample in low_samples[gt]:
                    sample.allele_indices = (0, )
                n_gts[0] += n_gts[gt]
                tot_removed_genotypes += n_gts[gt]
                n_gts[gt] = 0

        tot_alleles = len(n_gts)
        tot_samples = sum(n_gts.values())
        for gt in n_gts:
            n_gts[gt] /= tot_samples
            n_gts[gt] = round(n_gts[gt], 6)
        alt_freqs = [n_gts[i] for i in n_gts if i > 0]
        record.chrom = ref_name
        record.info["AF"] = alt_freqs
        print(record, end='', flush=False)
    print("removed_variants=", removed_variants, file=sys.stderr)
    print("removed_some_alt=", removed_some_alt, file=sys.stderr)
    print("tot_removed_alleles=", tot_removed_alleles, file=sys.stderr)
    print("tot_removed_genotypes=", tot_removed_genotypes, file=sys.stderr)
Example #21
0
def force_calling(bam_path, ivcf_path, output_path, sigs_dir,
                  max_cluster_bias_dict, threshold_gloab_dict, gt_round,
                  threads):
    logging.info('Check the parameter -Ivcf: OK.')
    logging.info('Enable to perform force calling.')
    #print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    sv_dict = dict()
    #'''
    for sv_type in ["DEL", "DUP"]:
        sv_dict[sv_type] = parse_sigs(sv_type, sigs_dir)
    sv_dict['INS'] = parse_inssigs(sigs_dir)
    sv_dict['INV'] = parse_invsigs(sigs_dir)
    sv_dict['TRA'] = parse_trasigs(sigs_dir)
    #'''
    vcf_reader = VariantFile(ivcf_path, 'r')
    row_count = 0
    for record in vcf_reader.fetch():
        row_count += 1
    idx = -1
    #gt_list = Manager().list([[] for x in range(row_count)])
    gt_list = list()
    result = []
    process_pool = Pool(processes=threads)
    vcf_reader = VariantFile(ivcf_path, 'r')
    for record in vcf_reader.fetch():
        idx += 1
        sv_type, chrom, sv_chr2, pos, sv_end, sv_strand = parse_record(record)
        if sv_type not in ["DEL", "INS", "DUP", "INV", "TRA"]:
            continue
        search_id_list = []
        if sv_type == 'TRA' and 'TRA' in sv_dict and chrom in sv_dict[
                'TRA'] and sv_chr2 in sv_dict['TRA'][chrom]:
            search_id_list = sv_dict['TRA'][chrom][sv_chr2]
        elif sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict['INV']:
            if sv_strand in sv_dict['INV'][chrom]:
                search_id_list = sv_dict['INV'][chrom][sv_strand]
            else:
                for strand_iter in sv_dict['INV'][chrom]:
                    sv_strand = strand_iter
                    search_id_list = sv_dict['INV'][chrom][strand_iter]
                    break
        elif sv_type != 'TRA' and sv_type != 'INV' and sv_type in sv_dict and chrom in sv_dict[
                sv_type]:
            search_id_list = sv_dict[sv_type][chrom]
        max_cluster_bias = 0
        if sv_type == 'INS' or sv_type == 'DEL':
            read_id_list, max_cluster_bias, indel_seq, CIPOS, CILEN = find_in_indel_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos,
                sv_end, threshold_gloab_dict[sv_type])
        else:
            read_id_list, max_cluster_bias = find_in_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos,
                sv_end)
            CIPOS = '.,.'
            CILEN = '.,.'
        if sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict[
                'INV'] and len(read_id_list) == 0:
            for strand_iter in sv_dict['INV'][chrom]:
                if strand_iter != sv_strand:
                    search_id_list = sv_dict['INV'][chrom][strand_iter]
                    read_id_list, max_cluster_bias = find_in_list(
                        sv_type, search_id_list,
                        max_cluster_bias_dict[sv_type], pos, sv_end)
                    if len(read_id_list) != 0:
                        sv_strand = strand_iter
                        break
        #print(read_id_list)
        if sv_type == 'INS':
            max_cluster_bias = max(1000, max_cluster_bias)
        else:
            max_cluster_bias = max(max_cluster_bias_dict[sv_type],
                                   max_cluster_bias)
        para = Para(record, CIPOS, CILEN)
        '''
        if sv_type == 'INS':
            fx_para = [([bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'INS')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'DEL':
            fx_para = [([bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'DEL')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'INV':
            fx_para = [([bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'INV')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'DUP':
            fx_para = [([bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'DUP')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'TRA':
            fx_para = [([bam_path, pos, sv_end, chrom, sv_chr2, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'TRA')]
            gt_list.append(call_gt_wrapper(fx_para))
        '''
        #'''
        if sv_type == 'INS':
            fx_para = [([
                bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, indel_seq, 'INS')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'DEL':
            fx_para = [([
                bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, '<DEL>', 'DEL')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'INV':
            fx_para = [([
                bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias,
                gt_round
            ], idx, row_count, para, sv_strand, '<INV>', 'INV')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'DUP':
            fx_para = [([
                bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias,
                gt_round
            ], idx, row_count, para, sv_strand, '<DUP>', 'DUP')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'TRA':
            fx_para = [([
                bam_path, pos, sv_end, chrom, sv_chr2, read_id_list,
                max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, '<TRA>', 'TRA')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        #'''
    process_pool.close()
    process_pool.join()

    semi_result = list()
    for item in gt_list:
        try:
            semi_result.append(item.get()[0])
        except:
            pass
    logging.info('Finished force calling.')
    return semi_result
Example #22
0
"""Extract reference (FASTA) and sample names from the VCF file."""

import argparse
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("vcf_file",
                    help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument("summary", help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    proc_error = "Input VCF file does not exist or could not be correctly opened."
    print(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        fasta_name = os.path.basename(header_records["reference"])
    except KeyError:
        fasta_name = ""
        print(
            warning(
Example #23
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs',
                        type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size',
                        type=int,
                        default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites',
                        action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        for line in args.contigs:
            contig, length = line.split()[:2]
            header.add_line(contig_line.format(**locals()))
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename('svtk',
                                                   'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add samples from VCF being standardized
    for sample in vcf.header.samples:
        header.add_sample(sample)

    # Tag source in header
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout, args.prefix,
                                          args.min_size,
                                          args.include_reference_sites,
                                          args.call_null_sites)

    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()
Example #24
0
def match_replicates(args):
    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]

    out_vars = [None, None]

    if args.out1:
        in_vars[0].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
        in_vars[0].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
        out_vars[0] = VariantFile(args.out1, 'w', header=in_vars[0].header)

    if args.out2:
        in_vars[1].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
        in_vars[1].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
        out_vars[1] = VariantFile(args.out2, 'w', header=in_vars[1].header)

    match_status_map = {True : '=', False : 'X', None : '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
        superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-'*80)
            print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop))
            print()

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.alleles[0] or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
                    print('  NORM{:d}: [{:5d}-{:5d}) ref={} geno={}'.format(i, lstart, lstop, lref, geno))
            print()

            match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
            match_status = match_status_map[match]

            print('    MATCH={} TYPE={}'.format(match_status, match_type))
            print()

            # The hard work is done.  The rest is just output and formatting...

            if out_vars[0]:
                for locus in sorted(super1, key=NormalizedLocus.record_order_key):
                    locus.record.samples[args.name1]['BD'] = match_status
                    locus.record.samples[args.name1]['BK'] = match_type
                    out_vars[0].write(locus.record)

            if out_vars[1]:
                for locus in sorted(super2, key=NormalizedLocus.record_order_key):
                    locus.record.samples[args.name2]['BD'] = match_status
                    locus.record.samples[args.name2]['BK'] = match_type
                    out_vars[1].write(locus.record)

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print('  VCF{:d}: {}'.format(i, locus.record), end='')
            print()

    for out_var in out_vars:
        if out_var is not None:
            out_var.close()
Example #25
0
#=========================================================================#
# Script: plot-quals.py                                                   #
#-------------------------------------------------------------------------#
# Generates a histogram of the quality scores based on the variant calls  #
# in calls/all.vcf.                                                       #
#=========================================================================#
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pysam import VariantFile

quals = [record.qual for record in VariantFile(snakemake.input[0])]
plt.hist(quals)

plt.savefig(snakemake.output[0])
Example #26
0
def main():
    args = process_input()

    chrom_vcf = args.chrom_vcf
    min_r2 = args.min_r2
    min_maf = args.min_maf
    out_prefix = args.out_prefix
    r2_field_name = args.r2_field_name
    maf_field_name = args.maf_field_name
    new_ids = args.new_ids

    ####
    # Read new ids in dictionary
    ####

    new_ids_dict = dict()
    if new_ids is not None:
        with open(new_ids, "r") as f:
            for line in f:
                old_id, new_id = line.rstrip().split("\t")
                new_ids_dict[old_id] = new_id
        print "Ids {0} ids to remap".format(len(new_ids_dict))

    out_vcf_list = "{0}.vcf_list.tsv".format(out_prefix)
    out_vcf_list_handle = open(out_vcf_list, "w")

    for chrom, vcf in chrom_vcf.iteritems():
        chrom_match = re.match("(chr)?(.+)", chrom)
        if chrom_match is not None:
            chrom = chrom_match.group(2)
        else:
            raise ValueError(
                "Chomosome name {0} not formatted correctly!".format(chrom))

        out_vcf_name = "{0}.chr{1}.vcf".format(out_prefix, chrom)
        out_vcf_name_gz = "{0}.chr{1}.vcf.gz".format(out_prefix, chrom)
        out_vcf_name_gz_tbi = "{0}.chr{1}.vcf.gz.tbi".format(out_prefix, chrom)

        print "Processing chr{0} {1}...".format(chrom, vcf)
        in_vcf_handle = VariantFile(vcf)
        pass_filter = in_vcf_handle.header.filters["PASS"]

        out_vcf_list_handle.write("{0}\t{1}".format(chrom, out_vcf_name_gz))
        out_vcf_list_handle.write("\n")

        ####
        # It appears that writing to a BCF is the only method that works in this version of pysam
        ####

        #'wb' for BCF
        #
        #out_vcf_handle = VariantFile(out_vcf_name,'wb',header=in_vcf_handle.header)
        #out_vcf_handle = pysam.libcbgzf.BGZFile(out_vcf_name,"wb")
        #out_vcf_handle.write(str(in_vcf_handle.header))

        #cmd = "bgzip -c > {0}".format(out_vcf_name)
        #print cmd

        out_vcf_handle = open(out_vcf_name, "w")

        print "Relabeling and writing header..."
        relabeled_ids = 0
        old_header_lines = str(in_vcf_handle.header).split("\n")
        for line in old_header_lines:

            if line == "":
                continue

            if re.match("^#CHROM.+", line):
                cols = line.split("\t")
                for i in range(9, len(cols)):
                    if cols[i] in new_ids_dict:
                        relabeled_ids += 1
                        cols[i] = new_ids_dict[cols[i]]
                #merge new columns
                new_line = "\t".join(cols)
                out_vcf_handle.write(new_line)
            else:
                out_vcf_handle.write(line)

            #write new line
            out_vcf_handle.write("\n")

        print "Relabeled {0} ids".format(relabeled_ids)

        rec_count = 0

        for rec in in_vcf_handle:
            rec_count += 1
            if rec_count % 50000 == 0:
                print "Line: {0:d} {1}:{2:d}".format(rec_count, rec.chrom,
                                                     rec.pos)
            r2 = rec.info[r2_field_name]
            maf = rec.info[maf_field_name]
            if r2 > min_r2 and maf > min_maf:
                #clear filters
                rec.filter.clear()
                #set filter to be pass
                rec.filter.add("PASS")
                #new lines are already there
                out_vcf_handle.write(str(rec))

        #print "Running bgzip on "
        ##execute bgzip
        #bgz_handle = Popen(["bgzip", out_vcf_name])
        #bgz_handle.wait()

        in_vcf_handle.close()
        out_vcf_handle.close()

        print "Writing tabix index for {0}...".format(out_vcf_name,
                                                      preset="vcf")
        #seems to only compress files
        pysam.tabix_index(out_vcf_name, preset="vcf")

        if not os.path.isfile(out_vcf_name_gz_tbi):
            pysam.tabix_index(out_vcf_name_gz, preset="vcf")

        if os.path.isfile(out_vcf_name):
            os.remove(out_vcf_name)

    out_vcf_list_handle.close()
    print "Finished writing {0}".format(out_vcf_list)
    print "Complete!"
Example #27
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist',
                        type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('-r',
                        '--region',
                        default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d',
                        '--dist',
                        type=int,
                        default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f',
                        '--frac',
                        type=float,
                        default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x',
                        '--blacklist',
                        metavar='BED.GZ',
                        type=TabixFile,
                        default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    parser.add_argument('-z',
                        '--svsize',
                        type=int,
                        default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [0]')
    parser.add_argument('-p',
                        '--prefix',
                        default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t',
                        '--svtypes',
                        default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--ignore-svtypes',
                        action='store_true',
                        default=False,
                        help='Ignore svtypes when clustering.')
    parser.add_argument('-o',
                        '--sample-overlap',
                        type=float,
                        default=0.0,
                        help='Minimum sample overlap for two variants to be '
                        'clustered together.')
    parser.add_argument('--preserve-ids',
                        action='store_true',
                        default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    parser.add_argument('--preserve-genotypes',
                        action='store_true',
                        default=False,
                        help='In a set of clustered variants, report best '
                        '(highest GQ) non-reference genotype when available.')
    parser.add_argument('--preserve-header',
                        action='store_true',
                        default=False,
                        help='Use header from clustering VCFs')
    parser.add_argument(
        '--skip-merge',
        action='store_true',
        default=False,
        help='Do not merge clustered records. Adds CLUSTER info fields.')
    parser.add_argument(
        '--merge-only',
        action='store_true',
        default=False,
        help=
        'When run on a vcf generated with --skip-merge, only merges records '
        'with identical CLUSTER fields.')
    parser.add_argument(
        '--single-end',
        action='store_true',
        default=False,
        help='Require only one end to be within the minimum distance.')
    #  parser.add_argument('--cluster-bed', type=argparse.FileType('w'),
    #                      help='Bed of constituent calls in each cluster')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.skip_merge and args.merge_only:
        raise ValueError('Cannot use both --skip-merge and --merge-only')

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')
    match_svtypes = not args.ignore_svtypes

    do_merge = not args.skip_merge
    do_cluster = not args.merge_only
    svc = VCFCluster(vcfs,
                     dist=args.dist,
                     blacklist=args.blacklist,
                     frac=args.frac,
                     svtypes=svtypes,
                     region=args.region,
                     match_svtypes=match_svtypes,
                     preserve_ids=args.preserve_ids,
                     preserve_genotypes=args.preserve_genotypes,
                     sample_overlap=args.sample_overlap,
                     preserve_header=args.preserve_header,
                     do_cluster=do_cluster,
                     do_merge=do_merge,
                     single_end=args.single_end)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    fout = VariantFile(fout, mode='w', header=svc.header)

    for i, cluster in enumerate(svc.cluster()):
        if args.prefix:
            cluster_id = [args.prefix]
        else:
            cluster_id = ['SV']
        if args.region:
            chrom = args.region.split(':')[0]
            cluster_id.append(chrom)
        if do_merge and do_cluster:
            cluster_index = i
        else:
            cluster_index = cluster[0].info['CLUSTER']
        cluster_id.append(str(cluster_index + 1))
        cluster_id = '_'.join(cluster_id)

        for record in cluster:
            # Name record
            if do_merge:
                name = cluster_id
            else:
                name = record.id

            record.id = name
            fout.write(record)

            # Size filter (CTX have size -1)
            if -1 < record.info['SVLEN'] < args.svsize:
                continue

            #  if args.cluster_bed is not None:
            #  flatten_pos(cluster, record.ID, args.cluster_bed)

    fout.close()
Example #28
0
    def create_matrix(self, src_file_name):
        '''
        Build a single LD matrix.
        '''

        #Read the source table, extract the rsIDs from it and build a
        #dictionary in which variant positions and IDs are grouped by
        #chromosome.
        data_by_chrs = create_src_dict(self.src_dir_path, src_file_name,
                                       self.meta_lines_quan,
                                       self.intgen_convdb_path)

        #All results derived from a single source file are placed
        #into one second-level directory.
        src_file_base = src_file_name.rsplit('.', maxsplit=1)[0]
        trg_dir_path = os.path.join(self.trg_top_dir_path,
                                    f'{src_file_base}_LD_matr')

        #One matrix is created per chromosome.
        for chrom in data_by_chrs:

            #Check whether at least two variants belong to the current
            #chromosome. Only then is it worth actually creating the
            #second-level output directory.
            if len(data_by_chrs[chrom]) < 2:
                continue
            if not os.path.exists(trg_dir_path):
                os.mkdir(trg_dir_path)

            #To make it easier to visually assess the effect of physical
            #distance on LD, rsIDs are sorted by genomic position.
            data_by_chrs[chrom].sort(key=lambda row: row[0])
            poss_srtd, rs_ids_srtd = [], []
            for row in data_by_chrs[chrom]:
                poss_srtd.append(row[0])
                rs_ids_srtd.append(row[1])

            #The number of rsIDs is needed right away to size the matrix,
            #and later to lay out its tabular version.
            vars_quan = len(rs_ids_srtd)

            #The text or graphical matrix is backed by a two-dimensional
            #array with this structure:
            '''
                         0    0    0    ...
                        val   0    0    ...
                        val  val   0    ...
                        ...  ...  ...   ...
                        '''

            #Build a template square two-dimensional array filled with zeros.
            #The zeros may later be replaced with LD values.
            ld_two_dim = [[0 for col_index in range(vars_quan)]
                          for row_index in range(vars_quan)]

            #If a heatmap is going to be drawn, the same template is needed
            #for a matrix of accompanying information.
            if self.matrix_type in ['heatmap', 'both']:
                info_two_dim = copy.deepcopy(ld_two_dim)

            #LD calculation and variant annotation require 1000 Genomes
            #data. Open the tabix-indexed 1000 Genomes archive for the
            #current chromosome with pysam, which provides fast random
            #access to rows of the archive.
            with VariantFile(
                    os.path.join(self.intgen_dir_path,
                                 f'{chrom}.vcf.gz')) as intgen_vcf_opened:

                #Iterate over row and column indices of the initially
                #zero matrices.
                for row_index in range(vars_quan):
                    for col_index in range(vars_quan):

                        #In principle, the matrix could be a square made of
                        #two identical right triangles separated by a diagonal
                        #of zero cells. It is more sensible to keep only one
                        #of those triangles, so values are computed only for
                        #cells whose row index is greater than the column
                        #index.
                        if row_index <= col_index:
                            continue

                        #Pull the phased genotypes of the current variant pair
                        #from 1000 Genomes, restricted to the selected samples.
                        #Split the genotype pairs into individual alleles, as
                        #the LD calculator requires, and extract the 1000
                        #Genomes annotations of each variant of the pair.
                        y_var_genotypes, x_var_genotypes = [], []
                        y_var_row = data_by_chrs[chrom][row_index]
                        for intgen_rec in intgen_vcf_opened.fetch(
                                chrom, y_var_row[0] - 1, y_var_row[0]):
                            if intgen_rec.id != y_var_row[1]:
                                continue
                            y_var_alleles = intgen_rec.ref + '/' + intgen_rec.alts[
                                0]
                            y_var_type = intgen_rec.info['VT'][0]
                            for sample_name in self.sample_names:
                                try:
                                    y_var_genotypes += intgen_rec.samples[
                                        sample_name]['GT']
                                except KeyError:
                                    continue
                            break
                        x_var_row = data_by_chrs[chrom][col_index]
                        for intgen_rec in intgen_vcf_opened.fetch(
                                chrom, x_var_row[0] - 1, x_var_row[0]):
                            if intgen_rec.id != x_var_row[1]:
                                continue
                            x_var_alleles = intgen_rec.ref + '/' + intgen_rec.alts[
                                0]
                            x_var_type = intgen_rec.info['VT'][0]
                            for sample_name in self.sample_names:
                                try:
                                    x_var_genotypes += intgen_rec.samples[
                                        sample_name]['GT']
                                except KeyError:
                                    continue
                            break

                        #Call the offline calculator to get a dictionary
                        #with r2, D' and the alternate allele frequencies
                        #of the variant pair for the populations and
                        #genders chosen by the researcher.
                        trg_vals = calc_ld(y_var_genotypes, x_var_genotypes)

                        #Each element of the visualized matrix gets annotated:
                        #in parallel with the array of LD values, an array of
                        #additional information about each variant pair grows.
                        if self.matrix_type in ['heatmap', 'both']:
                            info_two_dim[row_index][col_index] = f'''
r2: {trg_vals["r_square"]}<br>
D': {trg_vals["d_prime"]}<br>
abs_dist: {abs(poss_srtd[col_index] - poss_srtd[row_index])}<br><br>
{rs_ids_srtd[col_index]}.hg38_pos: {poss_srtd[col_index]}<br>
{rs_ids_srtd[row_index]}.hg38_pos: {poss_srtd[row_index]}<br><br>
{rs_ids_srtd[col_index]}.alleles: {x_var_alleles}<br>
{rs_ids_srtd[row_index]}.alleles: {y_var_alleles}<br><br>
{rs_ids_srtd[col_index]}.type: {x_var_type}<br>
{rs_ids_srtd[row_index]}.type: {y_var_type}<br><br>
{rs_ids_srtd[col_index]}.alt_freq: {trg_vals['var_2_alt_freq']}<br>
{rs_ids_srtd[row_index]}.alt_freq: {trg_vals['var_1_alt_freq']}
'''

                        #The researcher may have set a lower LD threshold.
                        #This block is deliberately placed after the annotation
                        #accumulation block: on the diagrams, cells with
                        #sub-threshold LD are colored as zeros, but hovering
                        #over them still shows the real LD values, which are
                        #taken from the annotation array. If these blocks were
                        #in the reverse order, annotations of sub-threshold LD
                        #would not be kept, since the filtering block ends
                        #with continue.
                        if self.ld_low_thres is not None:
                            if trg_vals[self.ld_measure] < self.ld_low_thres:
                                continue

                        #If the LD value was not discarded as sub-threshold,
                        #it goes into the LD matrix: the 0-cell is replaced
                        #with the obtained value of the chosen LD measure.
                        ld_two_dim[row_index][col_index] = trg_vals[
                            self.ld_measure]

            #A hopefully informative base name for the output file.
            #Which extension gets appended to it later depends on
            #the format chosen by the researcher.
            trg_file_base = f'{src_file_base}_chr{chrom}_{self.ld_measure[0]}'

            #Visualize the matrix with plotly.
            if self.matrix_type in ['heatmap', 'both']:

                #The researcher allowed text to be shown on the
                #diagram: rsIDs as axis labels and LD values
                #inside the squares of the heatmap itself.
                if self.disp_letters:

                    #Create the annotated heatmap object. For what it is
                    #made of, see the ld_triangle readme. Here I will only
                    #note that create_annotated_heatmap is a high-level
                    #plotly function that does most of this work.
                    ld_heatmap = ff.create_annotated_heatmap(
                        ld_two_dim,
                        x=rs_ids_srtd,
                        y=rs_ids_srtd,
                        hovertext=info_two_dim,
                        hoverinfo='text',
                        xgap=1,
                        ygap=1,
                        colorscale=self.color_pal,
                        showscale=False)

                    #Optional customization of the font size of the axis
                    #labels and of the numbers inside the squares.
                    if self.font_size is not None:
                        ld_heatmap.layout.xaxis.tickfont.size = self.font_size
                        ld_heatmap.layout.yaxis.tickfont.size = self.font_size
                        for ann_num in range(len(
                                ld_heatmap.layout.annotations)):
                            ld_heatmap['layout']['annotations'][ann_num][
                                'font']['size'] = self.font_size

                #The researcher preferred to put a minimum of text on
                #the heatmap. Labels usually have to be sacrificed to
                #keep them from crawling over each other in large
                #diagrams. Build an object for the meaningful part of
                #the diagram and an object with secondary settings,
                #then assemble them into the final figure. More on the
                #structure of plotly objects is in the readme.
                else:
                    trace = go.Heatmap(z=ld_two_dim,
                                       hovertext=info_two_dim,
                                       hoverinfo='text',
                                       xgap=1,
                                       ygap=1,
                                       colorscale=self.color_pal,
                                       showscale=False)
                    layout = go.Layout(xaxis_showticklabels=False,
                                       yaxis_showticklabels=False)
                    ld_heatmap = go.Figure(data=trace, layout=layout)

                #Optionally force the diagram
                #into a square shape.
                if self.square_shape:
                    ld_heatmap.update_layout(xaxis_constraintoward='left',
                                             yaxis_scaleanchor='x',
                                             yaxis_scaleratio=1,
                                             plot_bgcolor='rgba(0,0,0,0)')

                #The following settings mostly concern text other than
                #the LD values and rsIDs: the title and the footer. To
                #place the footer, a small trick was needed: the X axis
                #title is reserved for it. Among other things, the
                #diagram is flipped along Y for visual compatibility
                #with LDmatrix heatmaps.
                title = f'''
defines color: {self.ld_measure} ░
LD threshold: {self.ld_low_thres} ░
chromosome: {chrom} ░
genders: {", ".join(self.gend_names)} ░
populations: {", ".join(self.pop_names)}
'''
                ld_heatmap.update_layout(title_text=title,
                                         xaxis_side='bottom',
                                         yaxis_autorange='reversed')
                if not self.dont_disp_footer:
                    footer = '''
made by ld_triangle from <a href="https://github.com/PlatonB/ld-tools">ld-tools</a> ░
readme:
<a href="https://github.com/PlatonB/ld-tools/blob/master/README.md">ru</a>
<a href="https://github.com/PlatonB/ld-tools/blob/master/README-EN.md">en</a> ░
<a href="https://www.tinkoff.ru/rm/bykadorov.platon1/7tX2Y99140/">donate</a>
'''
                    ld_heatmap.update_layout(xaxis_title_text=footer,
                                             xaxis_title_font_size=10)

                #Write all the diagram data to JSON,
                #if the researcher needs it.
                if self.heatmap_json:
                    debug_file_name = trg_file_base + '.json'
                    ld_heatmap.write_json(os.path.join(trg_dir_path,
                                                       debug_file_name),
                                          pretty=True)

                #Save the diagram to HTML.
                html_file_name = trg_file_base + '.html'
                ld_heatmap.write_html(
                    os.path.join(trg_dir_path, html_file_name))

            #The researcher chose the option of creating
            #tabular versions of the LD matrices.
            if self.matrix_type in ['table', 'both']:

                #Create the text output file. Write into it a header with
                #the general characteristics of the matrix, an empty line
                #and two header rows: one with rsIDs, the other with
                #positions. Then write the LD rows, prepending each of
                #them with its rsID and position as well.
                tsv_file_name = trg_file_base + '.tsv'
                with open(os.path.join(trg_dir_path, tsv_file_name),
                          'w') as tsv_file_opened:
                    tab, poss_srtd = '\t', [str(pos) for pos in poss_srtd]
                    tsv_file_opened.write(
                        f'##General\tinfo:\t{self.ld_measure}\tchr{chrom}\t{tab.join(self.pop_names)}\t{tab.join(self.gend_names)}\n\n'
                    )
                    tsv_file_opened.write('rsIDs\t\t' +
                                          '\t'.join(rs_ids_srtd) + '\n')
                    tsv_file_opened.write('\tPositions\t' +
                                          '\t'.join(poss_srtd) + '\n')
                    for row_index in range(vars_quan):
                        line = '\t'.join(map(str,
                                             ld_two_dim[row_index])) + '\n'
                        tsv_file_opened.write(rs_ids_srtd[row_index] + '\t' +
                                              poss_srtd[row_index] + '\t' +
                                              line)
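The LD values above come from calc_ld, the offline calculator bundled with ld-tools, which is not shown in this snippet. As a rough illustration only, here is a minimal sketch of such a calculation, assuming both inputs are equal-length lists of 0/1 alleles taken from the same phased haplotypes in the same order; calc_ld_sketch is a hypothetical stand-in, not the actual ld-tools API, though the return keys mirror the ones used above (var_1 corresponds to the first argument, var_2 to the second).

#Minimal LD sketch for phased data (illustrative stand-in, not the
#ld-tools implementation): haplotype frequencies are counted directly,
#then D, r2 and D' are derived from them.
def calc_ld_sketch(var_1_alleles, var_2_alleles):
    n = len(var_1_alleles)
    p_1 = sum(var_1_alleles) / n                          #alt frequency of variant 1
    p_2 = sum(var_2_alleles) / n                          #alt frequency of variant 2
    p_12 = sum(a == b == 1 for a, b
               in zip(var_1_alleles, var_2_alleles)) / n  #alt-alt haplotype frequency
    d = p_12 - p_1 * p_2                                  #linkage disequilibrium coefficient
    denom = p_1 * (1 - p_1) * p_2 * (1 - p_2)
    r_square = d ** 2 / denom if denom else 0
    if d >= 0:
        d_max = min(p_1 * (1 - p_2), (1 - p_1) * p_2)
    else:
        d_max = min(p_1 * p_2, (1 - p_1) * (1 - p_2))
    d_prime = abs(d) / d_max if d_max else 0
    return {'r_square': round(r_square, 4),
            'd_prime': round(d_prime, 4),
            'var_1_alt_freq': round(p_1, 4),
            'var_2_alt_freq': round(p_2, 4)}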
Example #29
0
#!/bin/python3.6
import sys
from pysam import VariantFile
import subprocess

vcf_in = VariantFile(sys.argv[1])
new_header = vcf_in.header
vcf_out = VariantFile(sys.argv[2], 'w', header=new_header)
sv_out = sys.argv[2] + '.svtypeDEL.txt'
indelArteFile = sys.argv[3]

for record in vcf_in.fetch():
    # import pdb; pdb.set_trace()
    try:
        if record.info["SVTYPE"] == 'DEL':
            with open(sv_out, 'a+') as svtype_out:
                svtype_out.write(str(record))
    except KeyError:
        if len(record.ref) != len(record.alts[0]):  # if InDel
            if (
                "mutect2" in record.info["CALLERS"] or "vardict" in record.info["CALLERS"]
            ):  # Support by either Vardict or Manta, ok.
                # Check if indel artefact
                # import pdb; pdb.set_trace()
                write = 1
                cmdIndelArte = 'grep -w ' + str(record.pos) + ' ' + indelArteFile
                artefactLines = (
                    subprocess.run(cmdIndelArte, stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8').strip()
                )
                for artefactLine in artefactLines.split("\n"):
                    if (
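The artefact check in this example shells out to grep -w once per record, which gets slow on large inputs. A plain-Python equivalent is sketched below, assuming the artefact file is small enough to keep in memory; load_artefact_tokens is a hypothetical helper, not part of the original script, and mimics grep's whole-word matching by splitting every line into word tokens.

#Illustrative alternative to running grep per record: read the artefact
#file once into a set of whole-word tokens (hypothetical helper, not
#part of the original script).
import re

def load_artefact_tokens(path):
    tokens = set()
    with open(path) as artefact_file:
        for line in artefact_file:
            tokens.update(re.findall(r'\w+', line))  #roughly what grep -w treats as words
    return tokens

#Usage inside the record loop would then reduce to a set lookup:
#    artefact_tokens = load_artefact_tokens(indelArteFile)   (once, before the loop)
#    if str(record.pos) in artefact_tokens: ...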
Example #30
0
def vcf_to_ref(outfile,
               vcf_file,
               rec_file,
               pop2sample,
               random_read_samples=[],
               pos_id="Physical_Pos",
               map_ids=["AA_Map"],
               default_map="AA_Map",
               rec_rate=1e-8,
               chroms=None,
               bed=None,
               lax_alleles=False):

    pprint(pop2sample)

    #  get chromosomes
    with VariantFile(vcf_file.format(CHROM='1')) as vcf:
        if chroms is None:
            chroms = [i for i in vcf.header.contigs]
        else:
            chroms = parse_chroms(chroms)
        log_.info("chroms found: %s", chroms)

        sample2pop = defaultdict(list)
        for pop, v in pop2sample.items():
            for sample in v:
                if sample in vcf.header.samples:
                    sample2pop[sample].append(pop)

    samples = sample2pop.keys()
    pops = set(pop for s, v in sample2pop.items() for pop in v)
    pprint(sample2pop)
    pprint(pops)

    map_ids = ['map'] + map_ids

    data_cols = [f"{p}_{e}" for p in pops for e in EXT]

    with lzma.open(outfile, "wt") as ref:
        ref.write("chrom,pos,ref,alt,")
        if rec_file is None:
            ref.write("map,")
        else:
            ref.write(",".join(map_ids))
            ref.write(",")
        ref.write(",".join(data_cols))
        ref.write("\n")
        for chrom in chroms:

            # set up rec file
            if rec_file is not None:
                rec = pd.read_csv(rec_file.format(CHROM=chrom), sep=" ")
                if "chrom" in rec:
                    rec = rec[rec.chrom == chrom]

                rec['map'] = rec[default_map]
                rec_file_cols = list((pos_id, *map_ids))
                rec = rec[rec_file_cols]

                rec_iter = rec.iterrows()
                R0 = next(rec_iter)[1]
                R1 = next(rec_iter)[1]

            #skip chrom if empty
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                try:
                    V = next(vcf)
                except StopIteration:
                    continue
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                vcf.subset_samples(samples)
                for row in vcf.fetch(chrom):

                    alt_ix = 0

                    if len(row.alleles) <= 1 or len(row.alleles) > 3:
                        continue

                    if len(row.alleles) == 3:
                        alleles = [
                            i for v in row.samples.values() for i in v["GT"]
                        ]
                        if 3 in alleles:
                            continue
                        elif 1 in alleles and 2 in alleles:
                            continue
                        elif 1 not in alleles and 2 not in alleles:
                            continue
                        elif 1 in alleles:
                            alt_ix = 0
                        elif 2 in alleles:
                            alt_ix = 1
                        else:
                            raise ValueError(f"weird alleles {row.alleles}")
                        log_.debug(
                            f"{row.chrom}, {row.pos}, {row.alleles}, {Counter(alleles)}"
                        )

                    if row.alts[alt_ix] not in "ACGT" and not lax_alleles:
                        continue

                    D = defaultdict(int)
                    # rec stuff
                    if rec_file is None:
                        map_ = row.pos * rec_rate
                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},{map_},"
                        )
                    else:
                        if R1 is None:
                            map_ = R0[map_ids]
                        elif row.pos <= R0[pos_id]:
                            map_ = R0[map_ids]
                        elif R0[pos_id] < row.pos <= R1[pos_id]:
                            slope = (R1[map_ids] - R0[map_ids]) / (R1[pos_id] -
                                                                   R0[pos_id])
                            map_ = R0[map_ids] + slope * (row.pos - R0[pos_id])
                        elif row.pos > R1[pos_id]:
                            try:
                                while row.pos > R1[pos_id]:
                                    R0, R1 = R1, next(rec_iter)[1]
                            except StopIteration:
                                R0, R1 = R1, None
                            if R1 is None:
                                map_ = R0[map_ids]
                            else:
                                slope = (R1[map_ids] - R0[map_ids]) / (
                                    R1[pos_id] - R0[pos_id])
                                map_ = R0[map_ids] + slope * (row.pos - R0[pos_id])

                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},"
                        )
                        map_str = ",".join((str(m) for m in map_))
                        ref.write(f"{map_str},")

                    sample_data = row.samples
                    for s in sample_data:
                        if s in random_read_samples:
                            allele = sample_data[s]["GT"][0]
                            if allele is not None:
                                for pop in sample2pop[s]:
                                    D[f"{pop}_{EXT[allele > 0]}"] += 1
                        else:
                            for allele in sample_data[s]["GT"]:
                                if allele is not None:
                                    for pop in sample2pop[s]:
                                        D[f"{pop}_{EXT[allele > 0]}"] += 1

                    ref.write(",".join((str(D[c]) for c in data_cols)))
                    ref.write("\n")
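The densest part of vcf_to_ref is assigning each variant a genetic map value by interpolating between the two recombination-map rows, R0 and R1, that flank its position. Below is a stripped-down sketch of that step for a single map column, with illustrative names not taken from the original module.

#Minimal sketch of linear interpolation of a genetic map value between
#two flanking recombination-map rows (names are illustrative).
def interpolate_map(pos, pos0, map0, pos1, map1):
    if pos <= pos0:
        return map0
    if pos >= pos1:
        return map1
    slope = (map1 - map0) / (pos1 - pos0)  #map units per base pair
    return map0 + slope * (pos - pos0)

#Example: a variant at 1,500,000 bp between map points at 1 Mb and 2 Mb.
print(interpolate_map(1_500_000, 1_000_000, 0.8, 2_000_000, 1.4))  #-> 1.1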