def combine_vcf_by_gatk(vcf_file_list, output_vcf, loadJava7=False):
    """
    Merge several VCF files into one with GATK CombineVariants.

    The command runs GATK_JAR against the B37_REF reference (original note:
    only for hg19 now). The samples are REQUIRED to be different in every
    file of vcf_file_list.

    vcf_file_list -- VCF files to merge; converted to command-line arguments
                     by create_gatk_vcf_list_statements.
    output_vcf    -- path given to GATK's -o option.
    loadJava7     -- when true, LOAD_JAVA7_COMMAND is emitted on its own line
                     before the java invocation; otherwise that line is empty.

    Returns the string 'Done'.
    """
    # LOAD_JAVA7_COMMAND is only looked up when it was actually requested.
    java7_prelude = LOAD_JAVA7_COMMAND if loadJava7 else ''

    substitutions = {
        'GATKCOMMAND': GATK_JAR,
        'REF': B37_REF,
        'output_vcf': output_vcf,
        'load_java7': java7_prelude,
        'vcf_file_statement': create_gatk_vcf_list_statements(vcf_file_list),
    }

    # The trailing backslashes are Python line continuations inside the
    # literal, so the whole java invocation ends up on a single line.
    command_template = '''
    %(load_java7)s
    java -Xmx4g -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        -T CombineVariants \
        %(vcf_file_statement)s \
        -o %(output_vcf)s 
            '''

    run_subprocess_cmd(command_template % substitutions)
    return 'Done'
def fix_vcf_header_for_virmid(virmid_vcf):
    """
    Repair the VCF written by VIRMID by dropping the illegal space that
    precedes "Description=" in its "ID=gt,Number=A,Type=String" header entry.

    virmid_vcf -- path to the VCF file; a leading "~" is expanded first.

    The file is rewritten in place (sed -i); nothing is returned.
    """
    sed_template = "sed -i 's/ID=gt,Number=A,Type=String, Description=/ID=gt,Number=A,Type=String,Description=/' %s"
    expanded_path = os.path.expanduser(virmid_vcf)
    run_subprocess_cmd(sed_template % expanded_path)
def create_vcf_index(vcf_file):
    """
    Build an index for vcf_file by running "IGVTOOLS_CMD index" on it, then
    delete the igv.log file (relative path) with "rm -f".

    NOTE(review): earlier documentation says this relies on the $SACGF
    environment variable, presumably through IGVTOOLS_CMD -- confirm.
    """
    index_script = '''
    {IGVTOOLS_CMD} index {VCF_FILE}
    rm -f igv.log
    '''
    filled_script = index_script.format(IGVTOOLS_CMD=IGVTOOLS_CMD, VCF_FILE=vcf_file)
    run_subprocess_cmd(filled_script)
def validate_vcf_spec_by_gatk(infile):
    """
    Check infile with GATK ValidateVariants, VCF format tests only.

    Every optional validation type is switched off through
    "--validationTypeToExclude ALL". The command uses GATK_JAR and the
    B37_REF reference.

    infile -- path of the VCF passed to --variant.
    """
    substitutions = {
        'GATKCOMMAND': GATK_JAR,
        'REF': B37_REF,
        'infile': infile,
    }

    # The trailing backslashes are Python line continuations inside the
    # literal, so the java invocation ends up on a single line.
    command_template = '''
    java -Xmx4g -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        -T ValidateVariants \
        --validationTypeToExclude ALL \
        --variant %(infile)s 
            '''

    # Original author's note: GATK writes stdout to stderr
    run_subprocess_cmd(command_template % substitutions)
def select_variants_by_gatk(invcf, outvcf, is_freebayes_somatic_vcf=False, sample_list=None,
        intervals=None, interval_padding=False, exclude_sample=False, hugeVCF=False, num_threads=None):
    """
    Run GATK -T SelectVariants on invcf, optionally restricted to samples and/or intervals.

    GATK writes to a temporary vcf.gz created next to outvcf; bcftools then drops
    records whose ALT is '*', writes the result to outvcf, builds a tabix index
    for it, and the temporary file is removed.
    Only for hg19 now (the command uses the B37_REF reference).

    invcf -- input VCF passed to --variant.
    outvcf -- final output; must be a vcf.gz file (bcftools writes it with -O z).
    is_freebayes_somatic_vcf -- when False (the default), --removeUnusedAlternates
        and --excludeNonVariants are passed to GATK; when True they are omitted.
    sample_list -- iterable of sample names. Each becomes a --sample_name option,
        or --exclude_sample_name when exclude_sample is true.
    intervals -- a file path (e.g. a BED file), a single interval string
        ('1:100-200'), a single point position ('14:45645954'), or a list/tuple
        of such strings like ['1:100-200', '14:45645954'].
    interval_padding -- if truthy, passed as the value of --interval_padding.
    exclude_sample -- exclude, rather than select, the samples in sample_list.
    hugeVCF -- give the JVM -Xmx14g instead of -Xmx4g.
    num_threads -- if truthy, passed as the value of --num_threads.

    Returns the string 'Done'.
    """
    #TODO remove the tmp_vcf_out when GATK fix the * non-variant issue
    tmp_vcf_out = random_file_at_same_dir(outvcf, prefix='before_removing_star', extension='vcf.gz')

    sample_statement = ''
    if sample_list:
        sample_flag = '--exclude_sample_name' if exclude_sample else '--sample_name'
        sample_statement = ''.join('%s %s ' % (sample_flag, sample) for sample in sample_list)

    interval_statements = ''
    if intervals:
        # isinstance (rather than an exact type check) so tuples and list
        # subclasses are expanded too; previously they fell through to the
        # single-value branch, where %-formatting a multi-element tuple raised
        # TypeError and a list subclass was written as its Python repr.
        if isinstance(intervals, (list, tuple)):
            interval_values = intervals
        else:  # it is a file path or a single interval string
            interval_values = [intervals]
        interval_statements = ''.join(' --intervals %s ' % (interval,) for interval in interval_values)

    if interval_padding:
        interval_statements += ' --interval_padding %s ' % interval_padding

    java_Xmx = '-Xmx14g' if hugeVCF else '-Xmx4g'
    nt_statement = '--num_threads %s' % num_threads if num_threads else ''

    if is_freebayes_somatic_vcf:
        remove_unused_alternates_option = ''
    else:
        remove_unused_alternates_option = '--removeUnusedAlternates --excludeNonVariants'

    # GATK SelectVariants does normalizing selected variants
    # (The trailing backslashes below are Python line continuations inside the
    # literal, so the java invocation ends up on a single line.)
    gatk_command = '''
    java %(java_Xmx)s -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        --variant %(invcf)s \
        -o %(output_vcf)s \
        %(sample_statement)s  %(interval_statements)s \
        %(nt_statement)s \
        -T SelectVariants \
        %(remove_unused_alternates_option)s
    ''' % {
            'GATKCOMMAND': GATK_JAR,
            'java_Xmx': java_Xmx,
            'REF': B37_REF,
            'invcf': invcf,
            'output_vcf': tmp_vcf_out,
            'sample_statement': sample_statement,
            'nt_statement': nt_statement,
            'interval_statements': interval_statements,
            'remove_unused_alternates_option': remove_unused_alternates_option
            }
    run_subprocess_cmd(gatk_command)

    #TODO remove the tmp_vcf_out when GATK fix the * non-variant issue
    # Filter the '*' ALT records out of GATK's temporary output, then index the result.
    remove_star_cmd = '''
    {bcftools} view -O z --exclude "ALT='*'" {invcf} > {outvcf}
    {bcftools} index -t {outvcf}
    '''.format(bcftools=BCFTOOLS, invcf=tmp_vcf_out, outvcf=outvcf)
    run_subprocess_cmd(remove_star_cmd)
    remove_vcf_file(tmp_vcf_out)

    return 'Done'