示例#1
0
def VarDict(variant_id, vardict_variants):
    """Summarize what VarDict reported for a single variant.

    Args:
        variant_id: Key used to look the variant up in ``vardict_variants``.
        vardict_variants: Mapping of variant keys to VarDict records. Each
            record is expected to expose ``filters``, ``info``,
            ``get_info_value`` and ``get_sample_value``.

    Returns:
        A 5-tuple ``(vardict_classification, msi, msilen, shift3,
        score_vardict)``.

        ``vardict_classification`` is:
          * ``1``   when the filter string is exactly ``'PASS'`` and
            ``'Somatic'`` occurs in the record's info;
          * ``0.5`` when ``'Somatic'`` occurs in the info, none of the
            disqualifying filters applies, and the filter string has at
            most two ``;``-separated entries;
          * ``0``   otherwise, including when the variant is not in
            ``vardict_variants``.

        When the variant is absent the remaining four values are ``nan``.
    """
    # Nothing reported by VarDict for this variant.
    if variant_id not in vardict_variants:
        return 0, nan, nan, nan, nan

    record = vardict_variants[variant_id]

    if 'Somatic' not in record.info:
        vardict_classification = 0

    elif record.filters == 'PASS':
        vardict_classification = 1

    else:
        flags = record.filters.split(';')

        # Filters that disqualify the call regardless of anything else.
        unconditional = ('d7', 'd5', 'DIFF0.2', 'LongAT', 'MAF0.05', 'MSI6',
                         'NM4', 'NM4.25', 'pSTD', 'SN1.5')
        rejected = any(tag in flags for tag in unconditional)

        # P0.05 only disqualifies when the SSF value is at least 0.15.
        if not rejected and 'P0.05' in flags:
            rejected = float(record.get_info_value('SSF')) >= 0.15

        # v3/v4 only disqualify when the VD sample value is below 3.
        if not rejected and ('v3' in flags or 'v4' in flags):
            rejected = int(record.get_sample_value('VD', 0)) < 3

        # A soft call also needs at most two filter entries.
        if rejected or len(flags) > 2:
            vardict_classification = 0
        else:
            vardict_classification = 0.5

    # Somatic score: SSF passed through genome.p2phred, capped at 100.
    ssf = record.get_info_value('SSF')
    if ssf:
        score_vardict = genome.p2phred(float(ssf), max_phred=100)
    else:
        score_vardict = nan

    # MSI, MSILEN, and SHIFT3 come from the helper look-ups.
    msi = find_MSI(record)
    msilen = find_MSILEN(record)
    shift3 = find_SHIFT3(record)

    return vardict_classification, msi, msilen, shift3, score_vardict
示例#2
0
def VarDict(variant_id, vardict_variants):
    """Summarize what VarDict reported for a single variant.

    Args:
        variant_id: Key used to look the variant up in ``vardict_variants``.
        vardict_variants: Mapping of variant keys to VarDict records. Each
            record is expected to expose ``filters``, ``info``,
            ``get_info_value`` and ``get_sample_value``.

    Returns:
        A 5-tuple ``(vardict_classification, msi, msilen, shift3,
        score_vardict)``.

        ``vardict_classification`` is:
          * ``1``   when the filter string is exactly ``'PASS'`` and
            ``'Somatic'`` occurs in the record's info;
          * ``0.5`` when ``'Somatic'`` occurs in the info, none of the
            disqualifying filters applies, and the filter string has at
            most two ``;``-separated entries;
          * ``0``   otherwise, including when the variant is not in
            ``vardict_variants``.

        When the variant is absent the remaining four values are ``nan``.
    """
    # Nothing reported by VarDict for this variant.
    if variant_id not in vardict_variants:
        return 0, nan, nan, nan, nan

    record = vardict_variants[variant_id]

    if 'Somatic' not in record.info:
        vardict_classification = 0

    elif record.filters == 'PASS':
        vardict_classification = 1

    else:
        flags = record.filters.split(';')

        # Filters that disqualify the call regardless of anything else.
        unconditional = ('d7', 'd5', 'DIFF0.2', 'LongAT', 'MAF0.05', 'MSI6',
                         'NM4', 'NM4.25', 'pSTD', 'SN1.5')
        rejected = any(tag in flags for tag in unconditional)

        # P0.05 only disqualifies when the SSF value is at least 0.15.
        if not rejected and 'P0.05' in flags:
            rejected = float(record.get_info_value('SSF')) >= 0.15

        # v3/v4 only disqualify when the VD sample value is below 3.
        if not rejected and ('v3' in flags or 'v4' in flags):
            rejected = int(record.get_sample_value('VD', 0)) < 3

        # A soft call also needs at most two filter entries.
        if rejected or len(flags) > 2:
            vardict_classification = 0
        else:
            vardict_classification = 0.5

    # Somatic score: SSF passed through genome.p2phred, capped at 100.
    ssf = record.get_info_value('SSF')
    if ssf:
        score_vardict = genome.p2phred(float(ssf), max_phred=100)
    else:
        score_vardict = nan

    # MSI, MSILEN, and SHIFT3 come from the helper look-ups.
    msi = find_MSI(record)
    msilen = find_MSILEN(record)
    shift3 = find_SHIFT3(record)

    return vardict_classification, msi, msilen, shift3, score_vardict
示例#3
0
def JSM(variant_id, jsm_variants):
    """Report whether JointSNVMix2 called a variant, and its score.

    Args:
        variant_id: Key used to look the variant up in ``jsm_variants``.
        jsm_variants: Mapping of variant keys to JointSNVMix2 records. Each
            record is expected to expose ``get_info_value``.

    Returns:
        A 2-tuple ``(jointsnvmix2_classification, score_jointsnvmix2)``.
        It is ``(0, nan)`` when the variant is not in ``jsm_variants``.
        Otherwise the classification is ``1`` and the score is
        ``1 - AAAB - AABB`` passed through ``genome.p2phred`` with
        ``max_phred=50``.
    """
    # Not called by JointSNVMix2 at this variant.
    if variant_id not in jsm_variants:
        return 0, nan

    record = jsm_variants[variant_id]

    # Read AAAB before AABB, matching the INFO look-up order.
    p_aaab = float(record.get_info_value('AAAB'))
    p_aabb = float(record.get_info_value('AABB'))

    score_jointsnvmix2 = genome.p2phred(1 - p_aaab - p_aabb, max_phred=50)

    return 1, score_jointsnvmix2
示例#4
0
def JSM(variant_id, jsm_variants):
    """Report whether JointSNVMix2 called a variant, and its score.

    Args:
        variant_id: Key used to look the variant up in ``jsm_variants``.
        jsm_variants: Mapping of variant keys to JointSNVMix2 records. Each
            record is expected to expose ``get_info_value``.

    Returns:
        A 2-tuple ``(jointsnvmix2_classification, score_jointsnvmix2)``.
        It is ``(0, nan)`` when the variant is not in ``jsm_variants``.
        Otherwise the classification is ``1`` and the score is
        ``1 - AAAB - AABB`` passed through ``genome.p2phred`` with
        ``max_phred=50``.
    """
    # Not called by JointSNVMix2 at this variant.
    if variant_id not in jsm_variants:
        return 0, nan

    record = jsm_variants[variant_id]

    # Read AAAB before AABB, matching the INFO look-up order.
    p_aaab = float(record.get_info_value('AAAB'))
    p_aabb = float(record.get_info_value('AABB'))

    score_jointsnvmix2 = genome.p2phred(1 - p_aaab - p_aabb, max_phred=50)

    return 1, score_jointsnvmix2
示例#5
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            nbam_fn=None,
            tbam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            jsm=None,
            sniper=None,
            vardict=None,
            muse=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            tnscope=None,
            platypus=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

        # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 10 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if jsm:
            jsm = genome.open_textfile(jsm)
            jsm_line = genome.skip_vcf_header(jsm)

        if sniper:
            sniper = genome.open_textfile(sniper)
            sniper_line = genome.skip_vcf_header(sniper)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if muse:
            muse = genome.open_textfile(muse)
            muse_line = genome.skip_vcf_header(muse)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        if tnscope:
            tnscope = genome.open_textfile(tnscope)
            tnscope_line = genome.skip_vcf_header(tnscope)

        if platypus:
            platypus = genome.open_textfile(platypus)
            platypus_line = genome.skip_vcf_header(platypus)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)

                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:
                    got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(
                        my_coordinate, jsm_line, jsm, chrom_seq)
                if sniper:
                    got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(
                        my_coordinate, sniper_line, sniper, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:
                    got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(
                        my_coordinate, muse_line, muse, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:
                    got_tnscope, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(
                        my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus:
                    got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(
                        my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan

                    if varscan:
                        varscan_classification = annotate_caller.VarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan

                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(
                            variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan

                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(
                            variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan

                    if muse:
                        muse_classification = annotate_caller.MuSE(
                            variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan

                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(
                            variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan

                    if platypus:
                        platypus_classification = annotate_caller.countPASS(
                            variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                        nBamFeatures = sequencing_features.from_bam(
                            nbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)
                        tBamFeatures = sequencing_features.from_bam(
                            tbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures[
                            'ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures[
                            'alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures[
                            'ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures[
                            'alt_rev']
                        sor = sequencing_features.somaticOddRatio(
                            n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan'2 SCC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred(
                                stats.fisher_exact(
                                    ((t_alt, n_alt), (t_ref, n_ref)),
                                    alternative='greater')[1])
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM                   = my_coordinate[0],                                                    \
                        POS                     = my_coordinate[1],                                                    \
                        ID                      = my_identifiers,                                                      \
                        REF                     = ref_base,                                                            \
                        ALT                     = first_alt,                                                           \
                        if_MuTect               = mutect_classification,                                               \
                        if_VarScan2             = varscan_classification,                                              \
                        if_JointSNVMix2         = jointsnvmix2_classification,                                         \
                        if_SomaticSniper        = sniper_classification,                                               \
                        if_VarDict              = vardict_classification,                                              \
                        MuSE_Tier               = muse_classification,                                                 \
                        if_LoFreq               = lofreq_classification,                                               \
                        if_Scalpel              = scalpel_classification,                                              \
                        if_Strelka              = strelka_classification,                                              \
                        if_TNscope              = tnscope_classification,                                              \
                        if_Platypus             = platypus_classification,                                             \
                        Strelka_Score           = somatic_evs,                                                         \
                        Strelka_QSS             = qss,                                                                 \
                        Strelka_TQSS            = tqss,                                                                \
                        VarScan2_Score          = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        SNVMix2_Score           = rescale(score_jointsnvmix2,  'phred', p_scale, 1001),                \
                        Sniper_Score            = rescale(score_somaticsniper, 'phred', p_scale, 1001),                \
                        VarDict_Score           = rescale(score_vardict,       'phred', p_scale, 1001),                \
                        if_dbsnp                = if_dbsnp,                                                            \
                        COMMON                  = if_common,                                                           \
                        if_COSMIC               = if_cosmic,                                                           \
                        COSMIC_CNT              = num_cases,                                                           \
                        Consistent_Mates        = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates      = tBamFeatures['inconsistent_mates'],                                  \
                        N_DP                    = nBamFeatures['dp'],                                                  \
                        nBAM_REF_MQ             = '%g' % nBamFeatures['ref_mq'],                                       \
                        nBAM_ALT_MQ             = '%g' % nBamFeatures['alt_mq'],                                       \
                        nBAM_Z_Ranksums_MQ      = '%g' % nBamFeatures['z_ranksums_mq'],                                \
                        nBAM_REF_BQ             = '%g' % nBamFeatures['ref_bq'],                                       \
                        nBAM_ALT_BQ             = '%g' % nBamFeatures['alt_bq'],                                       \
                        nBAM_Z_Ranksums_BQ      = '%g' % nBamFeatures['z_ranksums_bq'],                                \
                        nBAM_REF_NM             = '%g' % nBamFeatures['ref_NM'],                                       \
                        nBAM_ALT_NM             = '%g' % nBamFeatures['alt_NM'],                                       \
                        nBAM_NM_Diff            = '%g' % nBamFeatures['NM_Diff'],                                      \
                        nBAM_REF_Concordant     = nBamFeatures['ref_concordant_reads'],                                \
                        nBAM_REF_Discordant     = nBamFeatures['ref_discordant_reads'],                                \
                        nBAM_ALT_Concordant     = nBamFeatures['alt_concordant_reads'],                                \
                        nBAM_ALT_Discordant     = nBamFeatures['alt_discordant_reads'],                                \
                        nBAM_Concordance_FET    = rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        N_REF_FOR               = nBamFeatures['ref_for'],                                             \
                        N_REF_REV               = nBamFeatures['ref_rev'],                                             \
                        N_ALT_FOR               = nBamFeatures['alt_for'],                                             \
                        N_ALT_REV               = nBamFeatures['alt_rev'],                                             \
                        nBAM_StrandBias_FET     = rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        nBAM_Z_Ranksums_EndPos  = '%g' % nBamFeatures['z_ranksums_endpos'],                            \
                        nBAM_REF_Clipped_Reads  = nBamFeatures['ref_SC_reads'],                                        \
                        nBAM_ALT_Clipped_Reads  = nBamFeatures['alt_SC_reads'],                                        \
                        nBAM_Clipping_FET       = rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        nBAM_MQ0                = nBamFeatures['MQ0'],                                                 \
                        nBAM_Other_Reads        = nBamFeatures['noise_read_count'],                                    \
                        nBAM_Poor_Reads         = nBamFeatures['poor_read_count'],                                     \
                        nBAM_REF_InDel_3bp      = nBamFeatures['ref_indel_3bp'],                                       \
                        nBAM_REF_InDel_2bp      = nBamFeatures['ref_indel_2bp'],                                       \
                        nBAM_REF_InDel_1bp      = nBamFeatures['ref_indel_1bp'],                                       \
                        nBAM_ALT_InDel_3bp      = nBamFeatures['alt_indel_3bp'],                                       \
                        nBAM_ALT_InDel_2bp      = nBamFeatures['alt_indel_2bp'],                                       \
                        nBAM_ALT_InDel_1bp      = nBamFeatures['alt_indel_1bp'],                                       \
                        M2_NLOD                 = nlod,                                                                \
                        M2_TLOD                 = tlod,                                                                \
                        M2_STR                  = tandem,                                                              \
                        M2_ECNT                 = ecnt,                                                                \
                        SOR                     = sor,                                                                 \
                        MSI                     = msi,                                                                 \
                        MSILEN                  = msilen,                                                              \
                        SHIFT3                  = shift3,                                                              \
                        MaxHomopolymer_Length   = homopolymer_length,                                                  \
                        SiteHomopolymer_Length  = site_homopolymer_length,                                             \
                        T_DP                    = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ             = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ             = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_Z_Ranksums_MQ      = '%g' % tBamFeatures['z_ranksums_mq'],                                \
                        tBAM_REF_BQ             = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ             = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_Z_Ranksums_BQ      = '%g' % tBamFeatures['z_ranksums_bq'],                                \
                        tBAM_REF_NM             = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM             = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff            = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant     = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant     = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant     = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant     = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET    = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR               = tBamFeatures['ref_for'],                                             \
                        T_REF_REV               = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR               = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV               = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET     = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_Z_Ranksums_EndPos  = '%g' % tBamFeatures['z_ranksums_endpos'],                            \
                        tBAM_REF_Clipped_Reads  = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads  = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET       = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads        = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads         = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp      = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp      = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp      = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp      = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp      = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp      = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length            = indel_length,                                                        \
                        TrueVariant_or_False    = judgement )

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = (ref_fa, nbam, tbam, truth, cosmic, dbsnp, mutect,
                        varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
                        strelka, tnscope, platypus)
        [opened_file.close() for opened_file in opened_files if opened_file]
示例#6
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            bam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            vardict=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):
    """Annotate candidate sites of one BAM file and write them as a TSV.

    The sites come from exactly one of ``is_vcf``, ``is_bed`` or ``is_pos``
    (checked in that order); when none is given, every contig listed in
    ``ref_fa + '.fai'`` is evaluated.  For each site the caller VCF files,
    the optional truth/dbSNP/COSMIC VCF files, the BAM file and the reference
    FASTA are queried, and one line per variant is written to ``outfile``
    using the module-level ``out_header`` template.

    Args:
        is_vcf: Path of the input VCF file.  Multi-allelic records and
            consecutive records sharing a coordinate are split into one
            variant per ALT allele.  The file must be sorted in the contig
            order of the ``.fai`` index.
        is_bed: Path of a tab-separated BED file (start column is shifted by
            +1 before being handed to ``genomic_coordinates``).
        is_pos: Path of a tab-separated ``chrom<TAB>position`` file.
        bam_fn: Path of the alignment file opened with ``pysam``.
        truth: Optional ground-truth VCF; a variant found in it is labelled 1
            and tagged ``TruePositive``, otherwise 0 and ``FalsePositive``.
        cosmic: Optional COSMIC VCF; overrides COSMIC info taken from the
            input VCF.
        dbsnp: Optional dbSNP VCF; overrides dbSNP info taken from the input
            VCF.
        mutect, varscan, vardict, lofreq, scalpel, strelka: Optional caller
            VCF paths.  Callers that are not supplied yield NaN columns.
        dedup: Accepted for interface compatibility; not used by this
            function.
        min_mq: Forwarded to ``sequencing_features.from_bam``.
        min_bq: Forwarded to ``sequencing_features.from_bam``.
        min_caller: A variant is written only when the summed caller
            classifications reach this value.
        ref_fa: Path of the reference FASTA (required); ``ref_fa + '.fai'``
            must exist.
        p_scale: ``'phred'`` or ``'fraction'`` (case-insensitive) to rescale
            p-value-like columns; anything else disables rescaling.
        outfile: Path of the TSV file to write (required).

    Raises:
        Exception: If the input VCF is not sorted according to the contig
            order of the ``.fai`` index.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import ExitStack

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not: only the two known scales are honoured.
    if p_scale is not None and p_scale.lower() in ('phred', 'fraction'):
        p_scale = p_scale.lower()
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    # Placeholder for every value that cannot be determined:
    nan = float('nan')

    ## Running
    # "cleanup" closes every handle registered below when the block exits,
    # including when an exception interrupts the conversion.
    with ExitStack() as cleanup, \
            genome.open_textfile(mysites) as my_sites, \
            open(outfile, 'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        cleanup.callback(bam.close)

        # From here on ref_fa is the opened FASTA handle, not the path.
        ref_fa = pysam.FastaFile(ref_fa)
        cleanup.callback(ref_fa.close)

        if truth:
            truth = genome.open_textfile(truth)
            cleanup.callback(truth.close)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cleanup.callback(cosmic.close)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            cleanup.callback(dbsnp.close)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            cleanup.callback(mutect.close)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            cleanup.callback(varscan.close)
            varscan_line = genome.skip_vcf_header(varscan)

        if vardict:
            vardict = genome.open_textfile(vardict)
            cleanup.callback(vardict.close)
            vardict_line = genome.skip_vcf_header(vardict)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            cleanup.callback(lofreq.close)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            cleanup.callback(scalpel.close)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            cleanup.callback(strelka.close)
            strelka_line = genome.skip_vcf_header(strelka)

        # Get through all the headers:
        while my_line.startswith(('#', 'track=')):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line: the template with its braces stripped is the column header.
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                # This inner loop is also what advances my_line for VCF input.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block of code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    # Parallel lists, one entry per variant at this coordinate.
                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []
                    dbsnp_flags = []
                    cosmic_flags = []
                    common_flags = []
                    cosmic_counts = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        # They are stored per variant so that variants sharing a
                        # coordinate do not inherit each other's annotations.
                        dbsnp_flags.append(
                            1 if re.search(r'rs[0-9]+',
                                           variant_i.identifier) else 0)
                        cosmic_flags.append(
                            1 if re.search(r'COS[MN][0-9]+',
                                           variant_i.identifier) else 0)
                        common_flags.append(
                            1 if variant_i.get_info_value('COMMON') == '1'
                            else 0)
                        cnt_value = variant_i.get_info_value('CNT')
                        cosmic_counts.append(cnt_value if cnt_value else nan)

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = set(
                                variant_i.identifier.split(';'))

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                        # Annotations belonging to this very variant:
                        if_dbsnp = dbsnp_flags[ith_call]
                        if_cosmic = cosmic_flags[ith_call]
                        if_common = common_flags[ith_call]
                        num_cases = cosmic_counts[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                        # No ID column to start from; begin with an empty set
                        # so truth/dbSNP/COSMIC tags can still be collected.
                        my_identifiers = set()

                    # Reset num_caller to 0 for each variant in the same coordinate
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        # t_pmean, t_pstd and t_qstd are returned by the helper
                        # but have no column in this output.
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(my_coordinate[0],
                                                      my_coordinate[1],
                                                      my_coordinate[1] + 81)

                        # Windows of 20 bases or fewer get NaN instead of a score.
                        if len(seq_span_80bp) > 20:
                            LC_spanning = sequencing_features.subLC(
                                seq_span_80bp, 20)
                        else:
                            LC_spanning = math.nan

                        if len(seq_left_80bp) > 20:
                            left_LC = sequencing_features.subLC(
                                seq_left_80bp, 20)
                        else:
                            left_LC = math.nan

                        if len(seq_right_80bp) > 20:
                            right_LC = sequencing_features.subLC(
                                seq_right_80bp, 20)
                        else:
                            right_LC = math.nan

                        # NOTE(review): min() with a NaN operand depends on the
                        # argument order (NaN on the left wins, NaN on the right
                        # is ignored) -- confirm which outcome is intended.
                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format(
                            CHROM                      = my_coordinate[0],
                            POS                        = my_coordinate[1],
                            ID                         = my_identifiers,
                            REF                        = ref_base,
                            ALT                        = first_alt,
                            if_MuTect                  = mutect_classification,
                            if_Strelka                 = strelka_classification,
                            if_VarScan2                = varscan_classification,
                            if_VarDict                 = vardict_classification,
                            if_LoFreq                  = lofreq_classification,
                            if_Scalpel                 = scalpel_classification,
                            VarScan2_Score             = rescale(score_varscan2, 'phred', p_scale, 1001),
                            if_dbsnp                   = if_dbsnp,
                            COMMON                     = if_common,
                            if_COSMIC                  = if_cosmic,
                            COSMIC_CNT                 = num_cases,
                            Consistent_Mates           = tBamFeatures['consistent_mates'],
                            Inconsistent_Mates         = tBamFeatures['inconsistent_mates'],
                            Seq_Complexity_Span        = LC_spanning_phred,
                            Seq_Complexity_Adj         = LC_adjacent_phred,
                            M2_TLOD                    = tlod,
                            M2_ECNT                    = ecnt,
                            MSI                        = msi,
                            MSILEN                     = msilen,
                            SHIFT3                     = shift3,
                            MaxHomopolymer_Length      = homopolymer_length,
                            SiteHomopolymer_Length     = site_homopolymer_length,
                            T_DP                       = tBamFeatures['dp'],
                            tBAM_REF_MQ                = '%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ                = '%g' % tBamFeatures['alt_mq'],
                            tBAM_p_MannWhitneyU_MQ     = '%g' % tBamFeatures['p_mannwhitneyu_mq'],
                            tBAM_REF_BQ                = '%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ                = '%g' % tBamFeatures['alt_bq'],
                            tBAM_p_MannWhitneyU_BQ     = '%g' % tBamFeatures['p_mannwhitneyu_bq'],
                            tBAM_REF_NM                = '%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM                = '%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff               = '%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant        = tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant        = tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant        = tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant        = tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET       = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR                  = tBamFeatures['ref_for'],
                            T_REF_REV                  = tBamFeatures['ref_rev'],
                            T_ALT_FOR                  = tBamFeatures['alt_for'],
                            T_ALT_REV                  = tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET        = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'],
                            tBAM_REF_Clipped_Reads     = tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads     = tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET          = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0                   = tBamFeatures['MQ0'],
                            tBAM_Other_Reads           = tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads            = tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp         = tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp         = tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp         = tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp         = tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp         = tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp         = tBamFeatures['alt_indel_1bp'],
                            InDel_Length               = indel_length,
                            TrueVariant_or_False       = judgement)

                        # Write the record to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            # (VCF input was already advanced while grouping same-coordinate records.)
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  All registered handles are closed by "cleanup" on exit  ##########
示例#7
0
def tsv2vcf(tsv_fn, vcf_fn, tools, pass_score=0.5, lowqual_score=0.1, hom_threshold=0.85, het_threshold=0.01, single_mode=False, paired_mode=True, normal_sample_name='NORMAL', tumor_sample_name='TUMOR', print_reject=True, phred_scaled=True):
    """Convert a per-variant TSV table into a VCF file.

    The first line of the TSV is a header; every following non-empty line is
    one variant. Reading stops at the first line that is empty after
    stripping trailing whitespace.

    Args:
        tsv_fn: Path of the tab-separated input file.
        vcf_fn: Path of the VCF file to write (opened with mode 'w').
        tools: Tool names; each must be a key of the internal ``tools_code``
            table. Their one-letter codes, in this order, form the INFO key
            that lists the per-tool calls.
        pass_score: Rows whose SCORE is at least this value are written with
            FILTER ``PASS`` and a ``SOMATIC`` INFO flag.
        lowqual_score: Rows whose SCORE is at least this value (but below
            ``pass_score``) are written with FILTER ``LowQual``.
        hom_threshold: Passed through to ``dp4_to_gt``.
        het_threshold: Passed through to ``dp4_to_gt``.
        single_mode: If true, only the tumor sample column is written, the
            normal-sample TSV columns are not looked up, and ``AF`` is added
            to INFO.
        paired_mode: Consulted only when ``single_mode`` is false; if true,
            the normal and tumor sample columns are written (in that order).
        normal_sample_name: Sample name used in the ``#CHROM`` header line.
        tumor_sample_name: Sample name used in the ``#CHROM`` header line.
        print_reject: If true, rows that are neither PASS nor LowQual are
            written with FILTER ``REJECT``; otherwise they are dropped.
        phred_scaled: If true, QUAL is ``p2phred(1 - score, max_phred=255)``;
            otherwise QUAL is the raw score.

    When the TSV has no ``SCORE`` column, QUAL is written as 0 and the FILTER
    is decided from the number of tools that called the variant: PASS when
    more than half of the tools did, LowQual when at least one tool and at
    least 0.33 of the tools did.

    Raises:
        AssertionError: If a name in ``tools`` is not a known tool.
        ValueError: If a required column is missing from the TSV header.
        KeyError: If a requested tool has no corresponding TSV column
            (raised when the first data row is processed).
    """

    tools_code = {'CGA':           'M',
                  'MuTect':        'M',
                  'MuTect2':       'M',
                  'VarScan2':      'V',
                  'JointSNVMix2':  'J',
                  'SomaticSniper': 'S',
                  'VarDict':       'D',
                  'MuSE':          'U',
                  'LoFreq':        'L',
                  'Scalpel':       'P',
                  'Strelka':       'K',
                  'TNscope':       'T',
                  'Platypus':      'Y'}

    # TSV header item that carries each tool's call, mapped to the tool's code.
    column2toolcode = {'if_MuTect':        'M',
                       'if_VarScan2':      'V',
                       'if_JointSNVMix2':  'J',
                       'if_SomaticSniper': 'S',
                       'if_VarDict':       'D',
                       'MuSE_Tier':        'U',
                       'if_LoFreq':        'L',
                       'if_Scalpel':       'P',
                       'if_Strelka':       'K',
                       'if_TNscope':       'T',
                       'if_Platypus':      'Y'}

    field_string = 'GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:fetSB:fetCD:zMQ:zBQ:MQ0:VAF'
    sample_template = '{GT}:{DP4}:{CD4}:{refMQ}:{altMQ}:{refBQ}:{altBQ}:{refNM}:{altNM}:{fetSB}:{fetCD}:{zMQ}:{zBQ}:{MQ0}:{VAF}'

    mvjsdu = ''
    for tool_i in tools:
        assert tool_i in tools_code, 'Unknown tool: {}'.format(tool_i)
        mvjsdu = mvjsdu + tools_code[tool_i]

    total_num_tools = len(mvjsdu)
    tool_string = ', '.join( tools )

    def _columns_for(header, count_prefix, bam_prefix):
        """Return {short key: header index} for one sample's TSV columns."""
        columns = {}
        for key in ('REF_FOR', 'REF_REV', 'ALT_FOR', 'ALT_REV'):
            columns[key] = header.index(count_prefix + key)
        for key in ('REF_MQ', 'ALT_MQ', 'REF_BQ', 'ALT_BQ', 'REF_NM', 'ALT_NM',
                    'MQ0', 'StrandBias_FET', 'Concordance_FET',
                    'Z_Ranksums_BQ', 'Z_Ranksums_MQ',
                    'REF_Concordant', 'REF_Discordant',
                    'ALT_Concordant', 'ALT_Discordant'):
            columns[key] = header.index(bam_prefix + key)
        return columns

    def _value(row, index, missing):
        """Return row[index], or `missing` when the cell is the text 'nan'."""
        cell = row[index]
        return missing if cell == 'nan' else cell

    def _format_sample(row, columns):
        """Build one sample's FORMAT value string; return (string, VAF text)."""
        # Read counts default to '0' so they can be summed; every other
        # metric becomes '.' when it is 'nan' in the TSV.
        ref_for = _value(row, columns['REF_FOR'], '0')
        ref_rev = _value(row, columns['REF_REV'], '0')
        alt_for = _value(row, columns['ALT_FOR'], '0')
        alt_rev = _value(row, columns['ALT_REV'], '0')

        ref_con = _value(row, columns['REF_Concordant'], '0')
        ref_dis = _value(row, columns['REF_Discordant'], '0')
        alt_con = _value(row, columns['ALT_Concordant'], '0')
        # The alt-discordant count has its own column; the normal sample
        # must not reuse the alt-concordant column here.
        alt_dis = _value(row, columns['ALT_Discordant'], '0')

        gt = dp4_to_gt(ref_for, ref_rev, alt_for, alt_rev, hom_threshold, het_threshold)

        try:
            alt_depth = int(alt_for) + int(alt_rev)
            vaf = alt_depth / ( alt_depth + int(ref_for) + int(ref_rev) )
        except ZeroDivisionError:
            vaf = 0
        vaf = '%.3g' % vaf

        text = sample_template.format(
            GT    = gt,
            DP4   = ','.join(( ref_for, ref_rev, alt_for, alt_rev )),
            CD4   = ','.join(( ref_con, ref_dis, alt_con, alt_dis )),
            refMQ = _value(row, columns['REF_MQ'], '.'),
            altMQ = _value(row, columns['ALT_MQ'], '.'),
            refBQ = _value(row, columns['REF_BQ'], '.'),
            altBQ = _value(row, columns['ALT_BQ'], '.'),
            refNM = _value(row, columns['REF_NM'], '.'),
            altNM = _value(row, columns['ALT_NM'], '.'),
            fetSB = _value(row, columns['StrandBias_FET'], '.'),
            fetCD = _value(row, columns['Concordance_FET'], '.'),
            zMQ   = _value(row, columns['Z_Ranksums_MQ'], '.'),
            zBQ   = _value(row, columns['Z_Ranksums_BQ'], '.'),
            MQ0   = _value(row, columns['MQ0'], '.'),
            VAF   = vaf)

        return text, vaf

    with open(tsv_fn) as tsv, open(vcf_fn, 'w') as vcf:

        # First line is a header:
        tsv_header = tsv.readline().rstrip().split('\t')

        # Make the header items into indices (single/paired have different tool names).
        # If a header item is repeated, the last occurrence wins.
        toolcode2index = {}
        for n, item in enumerate(tsv_header):
            if item in column2toolcode:
                toolcode2index[ column2toolcode[item] ] = n

        ALT   = tsv_header.index('ALT')
        CHROM = tsv_header.index('CHROM')
        ID    = tsv_header.index('ID')
        POS   = tsv_header.index('POS')
        REF   = tsv_header.index('REF')

        tumor_columns = _columns_for(tsv_header, 'T_', 'tBAM_')
        if not single_mode:
            normal_columns = _columns_for(tsv_header, 'N_', 'nBAM_')

        # SCORE is optional; None marks "no score available" explicitly.
        SCORE = tsv_header.index('SCORE') if 'SCORE' in tsv_header else None

        # Create vcf headers:
        vcf.write('##fileformat=VCFv4.1\n')
        vcf.write(version_line + '\n')
        vcf.write('##FILTER=<ID=LowQual,Description="Less confident somatic mutation calls with probability value at least {}">\n'.format(lowqual_score) )
        vcf.write('##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation calls with probability value at least {}">\n'.format(pass_score) )
        vcf.write('##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation with ONCOSCORE below 2">\n')
        vcf.write('##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic mutation in primary">\n')
        vcf.write('##INFO=<ID={COMBO},Number={NUM},Type=Integer,Description="Calling decision of the {NUM} algorithms: {TOOL_STRING}">\n'.format(COMBO=mvjsdu, NUM=total_num_tools, TOOL_STRING=tool_string) )
        vcf.write('##INFO=<ID=NUM_TOOLS,Number=1,Type=Float,Description="Number of tools called it Somatic">\n')

        if single_mode:
            vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Variant Allele Fraction">\n')

        vcf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        vcf.write('##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="ref forward, ref reverse, alt forward, alt reverse">\n')
        vcf.write('##FORMAT=<ID=CD4,Number=4,Type=Integer,Description="ref concordant, ref discordant, alt concordant, alt discordant">\n')

        vcf.write('##FORMAT=<ID=refMQ,Number=1,Type=Float,Description="average mapping score for reference reads">\n')
        vcf.write('##FORMAT=<ID=altMQ,Number=1,Type=Float,Description="average mapping score for alternate reads">\n')
        vcf.write('##FORMAT=<ID=refBQ,Number=1,Type=Float,Description="average base quality score for reference reads">\n')
        vcf.write('##FORMAT=<ID=altBQ,Number=1,Type=Float,Description="average base quality score for alternate reads">\n')
        vcf.write('##FORMAT=<ID=refNM,Number=1,Type=Float,Description="average edit distance for reference reads">\n')
        vcf.write('##FORMAT=<ID=altNM,Number=1,Type=Float,Description="average edit distance for alternate reads">\n')

        vcf.write('##FORMAT=<ID=fetSB,Number=1,Type=Float,Description="Strand bias FET">\n')
        vcf.write('##FORMAT=<ID=fetCD,Number=1,Type=Float,Description="Concordance FET">\n')
        vcf.write('##FORMAT=<ID=zMQ,Number=1,Type=Float,Description="z-score rank sum of mapping quality">\n')
        vcf.write('##FORMAT=<ID=zBQ,Number=1,Type=Float,Description="z-score rank sum of base quality">\n')
        vcf.write('##FORMAT=<ID=MQ0,Number=1,Type=Integer,Description="Number of reads with mapping quality of 0">\n')
        vcf.write('##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">\n')

        if single_mode:
            vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n'.format(tumor_sample_name) )
        elif paired_mode:
            vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\t{}\n'.format(normal_sample_name, tumor_sample_name) )

        # Start writing content:
        tsv_i = tsv.readline().rstrip()

        while tsv_i:

            tsv_item = tsv_i.split('\t')

            # Without a SCORE column the record gets QUAL 0 and is classified
            # by tool votes below. A 'nan' value *inside* a SCORE column is a
            # different case: it fails both score comparisons and does not
            # trigger the vote-based rules.
            no_score = SCORE is None
            if no_score:
                score = nan
                scaled_score = 0
            else:
                score = float( tsv_item[SCORE] )
                if phred_scaled:
                    scaled_score = p2phred(1-score, max_phred = 255)
                else:
                    scaled_score = score

            # Per-tool calls: '1' = called, '.' = not available, '0' = anything else.
            # Only '1' counts toward NUM_TOOLS ('.' is not a number).
            MVJS = []
            num_tools = 0
            for tool_i in mvjsdu:

                if_Tool = tsv_item[ toolcode2index[tool_i] ]

                if if_Tool == '1':
                    num_tools = num_tools + 1
                elif if_Tool == 'nan':
                    if_Tool = '.'
                else:
                    if_Tool = '0'

                MVJS.append( if_Tool )

            info_string = '{COMBO}={MVJSD};NUM_TOOLS={NUM_TOOLS}'.format( COMBO=mvjsdu, MVJSD=','.join(MVJS), NUM_TOOLS=num_tools )

            # NORMAL
            if not single_mode:
                normal_sample_string, _ = _format_sample(tsv_item, normal_columns)

            ### TUMOR ###
            tumor_sample_string, tumor_vaf = _format_sample(tsv_item, tumor_columns)

            # Add VAF to info string if and only if there is one single sample in the VCF sample
            if single_mode:
                info_string = info_string + ';AF={}'.format(tumor_vaf)

            # Decide the FILTER label (None means the record is not written).
            if score >= pass_score or (no_score and num_tools > 0.5*total_num_tools):
                filter_label = 'PASS'
                info_string = 'SOMATIC;' + info_string
            elif score >= lowqual_score or (no_score and num_tools >= 1 and num_tools >= 0.33*total_num_tools):
                filter_label = 'LowQual'
            elif print_reject:
                filter_label = 'REJECT'
            else:
                filter_label = None

            if filter_label is not None:

                vcf_line = '\t'.join(( tsv_item[CHROM], tsv_item[POS], tsv_item[ID], tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score, filter_label, info_string, field_string ))

                if single_mode:
                    vcf_line = vcf_line + '\t' + tumor_sample_string
                elif paired_mode:
                    vcf_line = vcf_line + '\t' + normal_sample_string + '\t' + tumor_sample_string

                vcf.write( vcf_line + '\n' )

            # Next line:
            tsv_i = tsv.readline().rstrip()
示例#8
0
def tsv2vcf(tsv_fn,
            vcf_fn,
            tools,
            pass_score=0.5,
            lowqual_score=0.1,
            hom_threshold=0.85,
            het_threshold=0.01,
            single_mode=False,
            paired_mode=True,
            normal_sample_name='NORMAL',
            tumor_sample_name='TUMOR',
            print_reject=True,
            phred_scaled=True):

    tools_code = {
        'CGA': 'M',
        'MuTect': 'M',
        'MuTect2': 'M',
        'VarScan2': 'V',
        'JointSNVMix2': 'J',
        'SomaticSniper': 'S',
        'VarDict': 'D',
        'MuSE': 'U',
        'LoFreq': 'L',
        'Scalpel': 'P',
        'Strelka': 'K',
        'TNscope': 'T',
        'Platypus': 'Y'
    }

    mvjsdu = ''
    for tool_i in tools:
        assert tool_i in tools_code.keys()
        mvjsdu = mvjsdu + tools_code[tool_i]

    total_num_tools = len(mvjsdu)
    tool_string = ', '.join(tools)

    with open(tsv_fn) as tsv, open(vcf_fn, 'w') as vcf:

        # First line is a header:
        tsv_i = tsv.readline().rstrip()

        tsv_header = tsv_i.split('\t')

        # Make the header items into indices (single/paired have different tool names)
        toolcode2index = {}
        for n, item in enumerate(tsv_header):

            if 'if_MuTect' == item:
                toolcode2index['M'] = n
            elif 'if_VarScan2' == item:
                toolcode2index['V'] = n
            elif 'if_JointSNVMix2' == item:
                toolcode2index['J'] = n
            elif 'if_SomaticSniper' == item:
                toolcode2index['S'] = n
            elif 'if_VarDict' == item:
                toolcode2index['D'] = n
            elif 'MuSE_Tier' == item:
                toolcode2index['U'] = n
                MuSE_Tier = tsv_header.index('MuSE_Tier')
            elif 'if_LoFreq' == item:
                toolcode2index['L'] = n
            elif 'if_Scalpel' == item:
                toolcode2index['P'] = n
            elif 'if_Strelka' == item:
                toolcode2index['K'] = n
            elif 'if_TNscope' == item:
                toolcode2index['T'] = n
            elif 'if_Platypus' == item:
                toolcode2index['Y'] = n

        ALT = tsv_header.index('ALT')
        CHROM = tsv_header.index('CHROM')
        ID = tsv_header.index('ID')
        POS = tsv_header.index('POS')
        REF = tsv_header.index('REF')
        T_ALT_FOR = tsv_header.index('T_ALT_FOR')
        T_ALT_REV = tsv_header.index('T_ALT_REV')
        tBAM_ALT_BQ = tsv_header.index('tBAM_ALT_BQ')
        tBAM_ALT_Concordant = tsv_header.index('tBAM_ALT_Concordant')
        tBAM_ALT_Discordant = tsv_header.index('tBAM_ALT_Discordant')
        tBAM_ALT_MQ = tsv_header.index('tBAM_ALT_MQ')
        tBAM_ALT_NM = tsv_header.index('tBAM_ALT_NM')
        tBAM_Concordance_FET = tsv_header.index('tBAM_Concordance_FET')
        tBAM_MQ0 = tsv_header.index('tBAM_MQ0')
        tBAM_REF_BQ = tsv_header.index('tBAM_REF_BQ')
        tBAM_REF_Concordant = tsv_header.index('tBAM_REF_Concordant')
        tBAM_REF_Discordant = tsv_header.index('tBAM_REF_Discordant')
        tBAM_REF_MQ = tsv_header.index('tBAM_REF_MQ')
        tBAM_REF_NM = tsv_header.index('tBAM_REF_NM')
        tBAM_StrandBias_FET = tsv_header.index('tBAM_StrandBias_FET')
        tBAM_Z_Ranksums_BQ = tsv_header.index('tBAM_Z_Ranksums_BQ')
        tBAM_Z_Ranksums_MQ = tsv_header.index('tBAM_Z_Ranksums_MQ')
        T_REF_FOR = tsv_header.index('T_REF_FOR')
        T_REF_REV = tsv_header.index('T_REF_REV')

        if not single_mode:
            N_ALT_FOR = tsv_header.index('N_ALT_FOR')
            N_ALT_REV = tsv_header.index('N_ALT_REV')
            nBAM_ALT_BQ = tsv_header.index('nBAM_ALT_BQ')
            nBAM_ALT_Concordant = tsv_header.index('nBAM_ALT_Concordant')
            nBAM_ALT_MQ = tsv_header.index('nBAM_ALT_MQ')
            nBAM_ALT_NM = tsv_header.index('nBAM_ALT_NM')
            nBAM_Concordance_FET = tsv_header.index('nBAM_Concordance_FET')
            nBAM_MQ0 = tsv_header.index('nBAM_MQ0')
            nBAM_REF_BQ = tsv_header.index('nBAM_REF_BQ')
            nBAM_REF_Concordant = tsv_header.index('nBAM_REF_Concordant')
            nBAM_REF_Discordant = tsv_header.index('nBAM_REF_Discordant')
            nBAM_REF_MQ = tsv_header.index('nBAM_REF_MQ')
            nBAM_REF_NM = tsv_header.index('nBAM_REF_NM')
            nBAM_StrandBias_FET = tsv_header.index('nBAM_StrandBias_FET')
            nBAM_Z_Ranksums_BQ = tsv_header.index('nBAM_Z_Ranksums_BQ')
            nBAM_Z_Ranksums_MQ = tsv_header.index('nBAM_Z_Ranksums_MQ')
            N_REF_FOR = tsv_header.index('N_REF_FOR')
            N_REF_REV = tsv_header.index('N_REF_REV')

        try:
            SCORE = tsv_header.index('SCORE')
        except ValueError:
            pass

        # Create vcf headers:
        vcf.write('##fileformat=VCFv4.1\n')
        vcf.write(version_line + '\n')
        vcf.write(
            '##FILTER=<ID=LowQual,Description="Less confident somatic mutation calls with probability value at least {}">\n'
            .format(lowqual_score))
        vcf.write(
            '##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation calls with probability value at least {}">\n'
            .format(pass_score))
        vcf.write(
            '##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation with ONCOSCORE below 2">\n'
        )
        vcf.write(
            '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic mutation in primary">\n'
        )
        vcf.write(
            '##INFO=<ID={COMBO},Number={NUM},Type=Integer,Description="Calling decision of the {NUM} algorithms: {TOOL_STRING}">\n'
            .format(COMBO=mvjsdu, NUM=total_num_tools,
                    TOOL_STRING=tool_string))
        vcf.write(
            '##INFO=<ID=NUM_TOOLS,Number=1,Type=Float,Description="Number of tools called it Somatic">\n'
        )

        if single_mode:
            vcf.write(
                '##INFO=<ID=AF,Number=1,Type=Float,Description="Variant Allele Fraction">\n'
            )

        vcf.write(
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        vcf.write(
            '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="ref forward, ref reverse, alt forward, alt reverse">\n'
        )
        vcf.write(
            '##FORMAT=<ID=CD4,Number=4,Type=Integer,Description="ref concordant, ref discordant, alt concordant, alt discordant">\n'
        )

        vcf.write(
            '##FORMAT=<ID=refMQ,Number=1,Type=Float,Description="average mapping score for reference reads">\n'
        )
        vcf.write(
            '##FORMAT=<ID=altMQ,Number=1,Type=Float,Description="average mapping score for alternate reads">\n'
        )
        vcf.write(
            '##FORMAT=<ID=refBQ,Number=1,Type=Float,Description="average base quality score for reference reads">\n'
        )
        vcf.write(
            '##FORMAT=<ID=altBQ,Number=1,Type=Float,Description="average base quality score for alternate reads">\n'
        )
        vcf.write(
            '##FORMAT=<ID=refNM,Number=1,Type=Float,Description="average edit distance for reference reads">\n'
        )
        vcf.write(
            '##FORMAT=<ID=altNM,Number=1,Type=Float,Description="average edit distance for alternate reads">\n'
        )

        vcf.write(
            '##FORMAT=<ID=fetSB,Number=1,Type=Float,Description="Strand bias FET">\n'
        )
        vcf.write(
            '##FORMAT=<ID=fetCD,Number=1,Type=Float,Description="Concordance FET">\n'
        )
        vcf.write(
            '##FORMAT=<ID=zMQ,Number=1,Type=Float,Description="z-score rank sum of mapping quality">\n'
        )
        vcf.write(
            '##FORMAT=<ID=zBQ,Number=1,Type=Float,Description="z-score rank sum of base quality">\n'
        )
        vcf.write(
            '##FORMAT=<ID=MQ0,Number=1,Type=Integer,Description="Number of reads with mapping quality of 0">\n'
        )
        vcf.write(
            '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">\n'
        )

        if single_mode:
            vcf.write(
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n'.
                format(tumor_sample_name))
        elif paired_mode:
            vcf.write(
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\t{}\n'
                .format(normal_sample_name, tumor_sample_name))

        # Start writing content:
        tsv_i = tsv.readline().rstrip()

        while tsv_i:

            tsv_item = tsv_i.split('\t')
            try:
                score = float(tsv_item[SCORE])
            except NameError:
                score = nan

            if phred_scaled:
                scaled_score = p2phred(1 - score, max_phred=255)
            else:
                scaled_score = score

            try:
                # Non-PASS MuSE calls are made into fractions.
                if tsv_item[MuSE_Tier] != '1':
                    if_MuSE = '0'
                else:
                    if_MuSE = '1'
            except NameError:
                if_MuSE = '.'

            MVJS = []
            num_tools = 0
            for tool_i in mvjsdu:

                if_Tool = tsv_item[toolcode2index[tool_i]]

                if if_Tool == '1':
                    if_Tool = '1'

                elif if_Tool == 'nan':
                    if_Tool = '.'

                else:
                    if_Tool = '0'

                MVJS.append(if_Tool)
                num_tools = num_tools + int(if_Tool)

            MVJS = ','.join(MVJS)

            info_string = '{COMBO}={MVJSD};NUM_TOOLS={NUM_TOOLS}'.format(
                COMBO=mvjsdu, MVJSD=MVJS, NUM_TOOLS=num_tools)

            # NORMAL
            if not single_mode:
                n_ref_mq = tsv_item[
                    nBAM_REF_MQ] if tsv_item[nBAM_REF_MQ] != 'nan' else '.'
                n_alt_mq = tsv_item[
                    nBAM_ALT_MQ] if tsv_item[nBAM_ALT_MQ] != 'nan' else '.'
                n_ref_bq = tsv_item[
                    nBAM_REF_BQ] if tsv_item[nBAM_REF_BQ] != 'nan' else '.'
                n_alt_bq = tsv_item[
                    nBAM_ALT_BQ] if tsv_item[nBAM_ALT_BQ] != 'nan' else '.'
                n_ref_nm = tsv_item[
                    nBAM_REF_NM] if tsv_item[nBAM_REF_NM] != 'nan' else '.'
                n_alt_nm = tsv_item[
                    nBAM_ALT_NM] if tsv_item[nBAM_ALT_NM] != 'nan' else '.'
                n_MQ0 = tsv_item[
                    nBAM_MQ0] if tsv_item[nBAM_MQ0] != 'nan' else '.'

                n_sb = tsv_item[nBAM_StrandBias_FET] if tsv_item[
                    nBAM_StrandBias_FET] != 'nan' else '.'
                n_cd = tsv_item[nBAM_Concordance_FET] if tsv_item[
                    nBAM_Concordance_FET] != 'nan' else '.'
                n_bqb = tsv_item[nBAM_Z_Ranksums_BQ] if tsv_item[
                    nBAM_Z_Ranksums_BQ] != 'nan' else '.'
                n_mqb = tsv_item[nBAM_Z_Ranksums_MQ] if tsv_item[
                    nBAM_Z_Ranksums_MQ] != 'nan' else '.'

                n_ref_for = tsv_item[
                    N_REF_FOR] if tsv_item[N_REF_FOR] != 'nan' else '0'
                n_ref_rev = tsv_item[
                    N_REF_REV] if tsv_item[N_REF_REV] != 'nan' else '0'
                n_alt_for = tsv_item[
                    N_ALT_FOR] if tsv_item[N_ALT_FOR] != 'nan' else '0'
                n_alt_rev = tsv_item[
                    N_ALT_REV] if tsv_item[N_ALT_REV] != 'nan' else '0'

                n_ref_con = tsv_item[nBAM_REF_Concordant] if tsv_item[
                    nBAM_REF_Concordant] != 'nan' else '0'
                n_ref_dis = tsv_item[nBAM_REF_Discordant] if tsv_item[
                    nBAM_REF_Discordant] != 'nan' else '0'
                n_alt_con = tsv_item[nBAM_ALT_Concordant] if tsv_item[
                    nBAM_ALT_Concordant] != 'nan' else '0'
                n_alt_dis = tsv_item[nBAM_ALT_Concordant] if tsv_item[
                    nBAM_ALT_Concordant] != 'nan' else '0'

                # DP4toGT:
                gt = dp4_to_gt(n_ref_for, n_ref_rev, n_alt_for, n_alt_rev,
                               hom_threshold, het_threshold)

                # 4-number strings:
                dp4_string = ','.join(
                    (n_ref_for, n_ref_rev, n_alt_for, n_alt_rev))
                cd4_string = ','.join(
                    (n_ref_con, n_ref_dis, n_alt_con, n_alt_dis))

                try:
                    vaf = (int(n_alt_for) +
                           int(n_alt_rev)) / (int(n_alt_for) + int(n_alt_rev) +
                                              int(n_ref_for) + int(n_ref_rev))
                except ZeroDivisionError:
                    vaf = 0
                vaf = '%.3g' % vaf

                normal_sample_string = '{GT}:{DP4}:{CD4}:{refMQ}:{altMQ}:{refBQ}:{altBQ}:{refNM}:{altNM}:{fetSB}:{fetCD}:{zMQ}:{zBQ}:{MQ0}:{VAF}'.format(
                    GT=gt,
                    DP4=dp4_string,
                    CD4=cd4_string,
                    refMQ=n_ref_mq,
                    altMQ=n_alt_mq,
                    refBQ=n_ref_bq,
                    altBQ=n_alt_bq,
                    refNM=n_ref_nm,
                    altNM=n_alt_nm,
                    fetSB=n_sb,
                    fetCD=n_cd,
                    zMQ=n_mqb,
                    zBQ=n_bqb,
                    MQ0=n_MQ0,
                    VAF=vaf)

            ### TUMOR ###
            t_ref_mq = tsv_item[
                tBAM_REF_MQ] if tsv_item[tBAM_REF_MQ] != 'nan' else '.'
            t_alt_mq = tsv_item[
                tBAM_ALT_MQ] if tsv_item[tBAM_ALT_MQ] != 'nan' else '.'
            t_ref_bq = tsv_item[
                tBAM_REF_BQ] if tsv_item[tBAM_REF_BQ] != 'nan' else '.'
            t_alt_bq = tsv_item[
                tBAM_ALT_BQ] if tsv_item[tBAM_ALT_BQ] != 'nan' else '.'
            t_ref_nm = tsv_item[
                tBAM_REF_NM] if tsv_item[tBAM_REF_NM] != 'nan' else '.'
            t_alt_nm = tsv_item[
                tBAM_ALT_NM] if tsv_item[tBAM_ALT_NM] != 'nan' else '.'
            t_MQ0 = tsv_item[tBAM_MQ0] if tsv_item[tBAM_MQ0] != 'nan' else '.'

            t_sb = tsv_item[tBAM_StrandBias_FET] if tsv_item[
                tBAM_StrandBias_FET] != 'nan' else '.'
            t_cd = tsv_item[tBAM_Concordance_FET] if tsv_item[
                tBAM_Concordance_FET] != 'nan' else '.'
            t_bqb = tsv_item[tBAM_Z_Ranksums_BQ] if tsv_item[
                tBAM_Z_Ranksums_BQ] != 'nan' else '.'
            t_mqb = tsv_item[tBAM_Z_Ranksums_MQ] if tsv_item[
                tBAM_Z_Ranksums_MQ] != 'nan' else '.'

            t_ref_for = tsv_item[
                T_REF_FOR] if tsv_item[T_REF_FOR] != 'nan' else '0'
            t_ref_rev = tsv_item[
                T_REF_REV] if tsv_item[T_REF_REV] != 'nan' else '0'
            t_alt_for = tsv_item[
                T_ALT_FOR] if tsv_item[T_ALT_FOR] != 'nan' else '0'
            t_alt_rev = tsv_item[
                T_ALT_REV] if tsv_item[T_ALT_REV] != 'nan' else '0'

            t_ref_con = tsv_item[tBAM_REF_Concordant] if tsv_item[
                tBAM_REF_Concordant] != 'nan' else '0'
            t_ref_dis = tsv_item[tBAM_REF_Discordant] if tsv_item[
                tBAM_REF_Discordant] != 'nan' else '0'
            t_alt_con = tsv_item[tBAM_ALT_Concordant] if tsv_item[
                tBAM_ALT_Concordant] != 'nan' else '0'
            t_alt_dis = tsv_item[tBAM_ALT_Discordant] if tsv_item[
                tBAM_ALT_Discordant] != 'nan' else '0'

            # DP4toGT:
            gt = dp4_to_gt(t_ref_for, t_ref_rev, t_alt_for, t_alt_rev,
                           hom_threshold, het_threshold)

            # 4-number strings:
            dp4_string = ','.join((t_ref_for, t_ref_rev, t_alt_for, t_alt_rev))
            cd4_string = ','.join((t_ref_con, t_ref_dis, t_alt_con, t_alt_dis))

            try:
                vd = int(t_alt_for) + int(t_alt_rev)
                vaf = vd / (vd + int(t_ref_for) + int(t_ref_rev))
            except ZeroDivisionError:
                vd = 0
                vaf = 0

            vaf = '%.3g' % vaf

            # Add VAF to the INFO string if and only if there is a single sample in the VCF file
            if single_mode:
                info_string = info_string + ';AF={}'.format(vaf)

            tumor_sample_string = '{GT}:{DP4}:{CD4}:{refMQ}:{altMQ}:{refBQ}:{altBQ}:{refNM}:{altNM}:{fetSB}:{fetCD}:{zMQ}:{zBQ}:{MQ0}:{VAF}'.format(
                GT=gt,
                DP4=dp4_string,
                CD4=cd4_string,
                refMQ=t_ref_mq,
                altMQ=t_alt_mq,
                refBQ=t_ref_bq,
                altBQ=t_alt_bq,
                refNM=t_ref_nm,
                altNM=t_alt_nm,
                fetSB=t_sb,
                fetCD=t_cd,
                zMQ=t_mqb,
                zBQ=t_bqb,
                MQ0=t_MQ0,
                VAF=vaf)

            field_string = 'GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:fetSB:fetCD:zMQ:zBQ:MQ0:VAF'

            if score is nan:
                scaled_score = 0

            # PASS
            if score >= pass_score or (score is nan
                                       and num_tools > 0.5 * total_num_tools):

                vcf_line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                    tsv_item[CHROM], tsv_item[POS], tsv_item[ID],
                    tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score,
                    'PASS', 'SOMATIC;' + info_string, field_string)

                if single_mode:
                    vcf_line = vcf_line + '\t' + tumor_sample_string
                elif paired_mode:
                    vcf_line = vcf_line + '\t' + normal_sample_string + '\t' + tumor_sample_string

                vcf.write(vcf_line + '\n')

            # Low Qual
            elif score >= lowqual_score or (
                    score is nan and num_tools >= 1
                    and num_tools >= 0.33 * total_num_tools):

                vcf_line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                    tsv_item[CHROM], tsv_item[POS], tsv_item[ID],
                    tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score,
                    'LowQual', info_string, field_string)

                if single_mode:
                    vcf_line = vcf_line + '\t' + tumor_sample_string
                elif paired_mode:
                    vcf_line = vcf_line + '\t' + normal_sample_string + '\t' + tumor_sample_string

                vcf.write(vcf_line + '\n')

            # REJECT
            elif print_reject:

                vcf_line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                    tsv_item[CHROM], tsv_item[POS], tsv_item[ID],
                    tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score,
                    'REJECT', info_string, field_string)

                if single_mode:
                    vcf_line = vcf_line + '\t' + tumor_sample_string
                elif paired_mode:
                    vcf_line = vcf_line + '\t' + normal_sample_string + '\t' + tumor_sample_string

                vcf.write(vcf_line + '\n')

            # Next line:
            tsv_i = tsv.readline().rstrip()
示例#9
0
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, nbam_fn=None, tbam_fn=None, truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None, jsm=None, sniper=None, vardict=None, muse=None, lofreq=None, scalpel=None, strelka=None, tnscope=None, platypus=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None):

    # Convert contig_sequence to chrom_seq dict:
    fai_file  = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

        # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        nbam    = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        tbam    = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        ref_fa  = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header( truth )

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header( cosmic )

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header( dbsnp )

        # Incorporate callers: get through the '#' header lines
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header( mutect )

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header( varscan )

        if jsm:
            jsm = genome.open_textfile(jsm)
            jsm_line = genome.skip_vcf_header( jsm )

        if sniper:
            sniper = genome.open_textfile(sniper)
            sniper_line = genome.skip_vcf_header( sniper )

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header( vardict )

        if muse:
            muse = genome.open_textfile(muse)
            muse_line = genome.skip_vcf_header( muse )

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header( lofreq )

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header( scalpel )

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header( strelka )

        if tnscope:
            tnscope = genome.open_textfile(tnscope)
            tnscope_line = genome.skip_vcf_header( tnscope )

        if platypus:
            platypus      = genome.open_textfile(platypus)
            platypus_line = genome.skip_vcf_header( platypus )

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()


        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match( genome.pattern_chr_position, my_line )
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write( out_header.replace('{','').replace('}','')  + '\n' )

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line( my_line )

                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append( vcf_i )


                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line( my_line )

                    ########## This block of code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match( genome.pattern_chr_position, my_line )
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                        raise Exception( '{} does not seem to be properly sorted.'.format(mysites) )

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append( vcf_i )

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( bed_item[0], int(bed_item[1])+1, int(bed_item[2]) )

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( pos_item[0], int(pos_item[1]), int(pos_item[1]) )

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( fai_item[0], 1, int(fai_item[1]) )

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append( ref_base )
                        alt_bases.append( first_alt )
                        indel_lengths.append( indel_length )

                        # Extract this information if it exists in the VCF file, but it could be overwritten if dbSNP/COSMIC are supplied.
                        if_dbsnp  = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set( my_identifier_i )

                        all_my_identifiers.append( my_identifier_i )

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [None] # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:   got_mutect,   mutect_variants,   mutect_line   = genome.find_vcf_at_coordinate(my_coordinate, mutect_line,  mutect,  chrom_seq)
                if varscan:  got_varscan,  varscan_variants,  varscan_line  = genome.find_vcf_at_coordinate(my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:      got_jsm,      jsm_variants,      jsm_line      = genome.find_vcf_at_coordinate(my_coordinate, jsm_line,     jsm,     chrom_seq)
                if sniper:   got_sniper,   sniper_variants,   sniper_line   = genome.find_vcf_at_coordinate(my_coordinate, sniper_line,  sniper,  chrom_seq)
                if vardict:  got_vardict,  vardict_variants,  vardict_line  = genome.find_vcf_at_coordinate(my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:     got_muse,     muse_variants,     muse_line     = genome.find_vcf_at_coordinate(my_coordinate, muse_line,    muse,    chrom_seq)
                if lofreq:   got_lofreq,   lofreq_variants,   lofreq_line   = genome.find_vcf_at_coordinate(my_coordinate, lofreq_line,  lofreq,  chrom_seq)
                if scalpel:  got_scalpel,  scalpel_variants,  scalpel_line  = genome.find_vcf_at_coordinate(my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:  got_strelka,  strelka_variants,  strelka_line  = genome.find_vcf_at_coordinate(my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:  got_tnscope,  tnscope_variants,  tnscope_line  = genome.find_vcf_at_coordinate(my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus: got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:    got_truth,    truth_variants,    truth_line    = genome.find_vcf_at_coordinate(my_coordinate, truth_line,   truth,   chrom_seq)
                if dbsnp:    got_dbsnp,    dbsnp_variants,    dbsnp_line    = genome.find_vcf_at_coordinate(my_coordinate, dbsnp_line,   dbsnp,   chrom_seq)
                if cosmic:   got_cosmic,   cosmic_variants,   cosmic_line   = genome.find_vcf_at_coordinate(my_coordinate, cosmic_line,  cosmic,  chrom_seq)


                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate( variants_at_my_coordinate ):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ( (my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase )

                        ref_base       = ref_bases[ith_call]
                        first_alt      = alt_bases[ith_call]
                        indel_length   = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ( (my_coordinate[0], my_coordinate[1]), ref_base, first_alt )


                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan


                    if varscan:
                        varscan_classification = annotate_caller.VarScan(variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan


                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan


                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan


                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan


                    if muse:
                        muse_classification = annotate_caller.MuSE(variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan


                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan


                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan


                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan


                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan
                        
                    
                    if platypus:
                        platypus_classification = annotate_caller.countPASS(variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan


                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan


                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add( ID_i )


                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add( ID_i )


                        ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                        nBamFeatures = sequencing_features.from_bam(nbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)
                        tBamFeatures = sequencing_features.from_bam(tbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev']
                        sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan2's SSC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred( stats.fisher_exact( ((t_alt, n_alt), (t_ref, n_ref)), alternative='greater' )[1] )
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM                   = my_coordinate[0],                                                    \
                        POS                     = my_coordinate[1],                                                    \
                        ID                      = my_identifiers,                                                      \
                        REF                     = ref_base,                                                            \
                        ALT                     = first_alt,                                                           \
                        if_MuTect               = mutect_classification,                                               \
                        if_VarScan2             = varscan_classification,                                              \
                        if_JointSNVMix2         = jointsnvmix2_classification,                                         \
                        if_SomaticSniper        = sniper_classification,                                               \
                        if_VarDict              = vardict_classification,                                              \
                        MuSE_Tier               = muse_classification,                                                 \
                        if_LoFreq               = lofreq_classification,                                               \
                        if_Scalpel              = scalpel_classification,                                              \
                        if_Strelka              = strelka_classification,                                              \
                        if_TNscope              = tnscope_classification,                                              \
                        if_Platypus             = platypus_classification,                                             \
                        Strelka_Score           = somatic_evs,                                                         \
                        Strelka_QSS             = qss,                                                                 \
                        Strelka_TQSS            = tqss,                                                                \
                        VarScan2_Score          = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        SNVMix2_Score           = rescale(score_jointsnvmix2,  'phred', p_scale, 1001),                \
                        Sniper_Score            = rescale(score_somaticsniper, 'phred', p_scale, 1001),                \
                        VarDict_Score           = rescale(score_vardict,       'phred', p_scale, 1001),                \
                        if_dbsnp                = if_dbsnp,                                                            \
                        COMMON                  = if_common,                                                           \
                        if_COSMIC               = if_cosmic,                                                           \
                        COSMIC_CNT              = num_cases,                                                           \
                        Consistent_Mates        = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates      = tBamFeatures['inconsistent_mates'],                                  \
                        N_DP                    = nBamFeatures['dp'],                                                  \
                        nBAM_REF_MQ             = '%g' % nBamFeatures['ref_mq'],                                       \
                        nBAM_ALT_MQ             = '%g' % nBamFeatures['alt_mq'],                                       \
                        nBAM_Z_Ranksums_MQ      = '%g' % nBamFeatures['z_ranksums_mq'],                                \
                        nBAM_REF_BQ             = '%g' % nBamFeatures['ref_bq'],                                       \
                        nBAM_ALT_BQ             = '%g' % nBamFeatures['alt_bq'],                                       \
                        nBAM_Z_Ranksums_BQ      = '%g' % nBamFeatures['z_ranksums_bq'],                                \
                        nBAM_REF_NM             = '%g' % nBamFeatures['ref_NM'],                                       \
                        nBAM_ALT_NM             = '%g' % nBamFeatures['alt_NM'],                                       \
                        nBAM_NM_Diff            = '%g' % nBamFeatures['NM_Diff'],                                      \
                        nBAM_REF_Concordant     = nBamFeatures['ref_concordant_reads'],                                \
                        nBAM_REF_Discordant     = nBamFeatures['ref_discordant_reads'],                                \
                        nBAM_ALT_Concordant     = nBamFeatures['alt_concordant_reads'],                                \
                        nBAM_ALT_Discordant     = nBamFeatures['alt_discordant_reads'],                                \
                        nBAM_Concordance_FET    = rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        N_REF_FOR               = nBamFeatures['ref_for'],                                             \
                        N_REF_REV               = nBamFeatures['ref_rev'],                                             \
                        N_ALT_FOR               = nBamFeatures['alt_for'],                                             \
                        N_ALT_REV               = nBamFeatures['alt_rev'],                                             \
                        nBAM_StrandBias_FET     = rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        nBAM_Z_Ranksums_EndPos  = '%g' % nBamFeatures['z_ranksums_endpos'],                            \
                        nBAM_REF_Clipped_Reads  = nBamFeatures['ref_SC_reads'],                                        \
                        nBAM_ALT_Clipped_Reads  = nBamFeatures['alt_SC_reads'],                                        \
                        nBAM_Clipping_FET       = rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        nBAM_MQ0                = nBamFeatures['MQ0'],                                                 \
                        nBAM_Other_Reads        = nBamFeatures['noise_read_count'],                                    \
                        nBAM_Poor_Reads         = nBamFeatures['poor_read_count'],                                     \
                        nBAM_REF_InDel_3bp      = nBamFeatures['ref_indel_3bp'],                                       \
                        nBAM_REF_InDel_2bp      = nBamFeatures['ref_indel_2bp'],                                       \
                        nBAM_REF_InDel_1bp      = nBamFeatures['ref_indel_1bp'],                                       \
                        nBAM_ALT_InDel_3bp      = nBamFeatures['alt_indel_3bp'],                                       \
                        nBAM_ALT_InDel_2bp      = nBamFeatures['alt_indel_2bp'],                                       \
                        nBAM_ALT_InDel_1bp      = nBamFeatures['alt_indel_1bp'],                                       \
                        M2_NLOD                 = nlod,                                                                \
                        M2_TLOD                 = tlod,                                                                \
                        M2_STR                  = tandem,                                                              \
                        M2_ECNT                 = ecnt,                                                                \
                        SOR                     = sor,                                                                 \
                        MSI                     = msi,                                                                 \
                        MSILEN                  = msilen,                                                              \
                        SHIFT3                  = shift3,                                                              \
                        MaxHomopolymer_Length   = homopolymer_length,                                                  \
                        SiteHomopolymer_Length  = site_homopolymer_length,                                             \
                        T_DP                    = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ             = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ             = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_Z_Ranksums_MQ      = '%g' % tBamFeatures['z_ranksums_mq'],                                \
                        tBAM_REF_BQ             = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ             = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_Z_Ranksums_BQ      = '%g' % tBamFeatures['z_ranksums_bq'],                                \
                        tBAM_REF_NM             = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM             = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff            = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant     = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant     = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant     = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant     = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET    = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR               = tBamFeatures['ref_for'],                                             \
                        T_REF_REV               = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR               = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV               = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET     = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_Z_Ranksums_EndPos  = '%g' % tBamFeatures['z_ranksums_endpos'],                            \
                        tBAM_REF_Clipped_Reads  = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads  = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET       = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads        = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads         = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp      = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp      = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp      = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp      = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp      = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp      = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length            = indel_length,                                                        \
                        TrueVariant_or_False    = judgement )

                        # Write the formatted record, newline-terminated, to the output handle:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            # Only the non-VCF path advances manually here, pulling the next line
            # from my_sites and stripping its trailing whitespace.
            # NOTE(review): the VCF path presumably advances elsewhere in the
            # enclosing loop -- confirm against the loop header.
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        # Every optional input handle is collected here; handles that evaluate
        # as falsy are skipped.
        # NOTE(review): unopened handles are presumably None -- confirm against
        # where these variables are initialised.
        opened_files = (ref_fa, nbam, tbam, truth, cosmic, dbsnp, mutect, varscan, jsm, sniper, vardict, muse, lofreq, scalpel, strelka, tnscope, platypus)
        # Plain loop instead of a comprehension: close() is called only for its
        # side effect, so there is no reason to build a throwaway list.
        for opened_file in opened_files:
            if opened_file:
                opened_file.close()