def combine_vcf_by_gatk(vcf_file_list, output_vcf, loadJava7=False):
    """
    Merge several VCF files into one with GATK CombineVariants.

    The reference is fixed to B37_REF (original note: only hg19 for now).
    Every file in vcf_file_list is required to contain different samples.

    vcf_file_list: the VCF files to combine; turned into GATK arguments by
        create_gatk_vcf_list_statements.
    output_vcf: path given to GATK's -o option.
    loadJava7: when true, LOAD_JAVA7_COMMAND is placed before the java call.

    Returns the string 'Done'.
    """
    java7_prefix = LOAD_JAVA7_COMMAND if loadJava7 else ''

    substitutions = {
        'GATKCOMMAND': GATK_JAR,
        'REF': B37_REF,
        'output_vcf': output_vcf,
        'load_java7': java7_prefix,
        'vcf_file_statement': create_gatk_vcf_list_statements(vcf_file_list)
    }

    # The backslash-newline pairs are consumed by Python, so the java call
    # reaches the shell as a single line.
    command_template = '''
    %(load_java7)s
    java -Xmx4g -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        -T CombineVariants \
        %(vcf_file_statement)s \
        -o %(output_vcf)s
    '''

    run_subprocess_cmd(command_template % substitutions)
    return 'Done'
def fix_vcf_header_for_virmid(virmid_vcf):
    """
    Strip the illegal space from the 'gt' header line of a VIRMID VCF.

    VIRMID writes "Type=String, Description=" (with a space after the comma);
    this rewrites it to "Type=String,Description=" using `sed -i`, so the
    file at virmid_vcf is modified in place. A leading '~' in the path is
    expanded first.
    """
    target_path = os.path.expanduser(virmid_vcf)
    sed_expression = 's/ID=gt,Number=A,Type=String, Description=/ID=gt,Number=A,Type=String,Description=/'
    run_subprocess_cmd("sed -i '%s' %s" % (sed_expression, target_path))
def create_vcf_index(vcf_file):
    """
    Build an index for vcf_file by running `IGVTOOLS_CMD index`.

    The igv.log file left in the working directory is deleted afterwards.
    The original note says this relies on the $SACGF environment variable.
    """
    command_template = '''
    {IGVTOOLS_CMD} index {VCF_FILE}
    rm -f igv.log
    '''
    index_commands = command_template.format(IGVTOOLS_CMD=IGVTOOLS_CMD, VCF_FILE=vcf_file)
    run_subprocess_cmd(index_commands)
def validate_vcf_spec_by_gatk(infile):
    """
    Check infile against the VCF format using GATK ValidateVariants.

    All optional validation types are switched off
    (--validationTypeToExclude ALL), so only the format tests are performed.
    The reference passed to GATK is B37_REF.
    """
    substitutions = {
        'GATKCOMMAND': GATK_JAR,
        'REF': B37_REF,
        'infile': infile
    }

    # The backslash-newline pairs are consumed by Python, so the java call
    # reaches the shell as a single line.
    command_template = '''
    java -Xmx4g -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        -T ValidateVariants \
        --validationTypeToExclude ALL \
        --variant %(infile)s
    '''

    # GATK writes stdout to stderr
    run_subprocess_cmd(command_template % substitutions)
def select_variants_by_gatk(invcf, outvcf, is_freebayes_somatic_vcf=False, sample_list=None, intervals=None,
                            interval_padding=False, exclude_sample=False, hugeVCF=False, num_threads=None):
    """
    Run GATK -T SelectVariants, specifying the sample_list and/or intervals.

    GATK writes to a temporary vcf.gz next to outvcf; bcftools then drops the
    records whose ALT is '*', writes outvcf, and indexes it (index -t). The
    temporary file is removed at the end.

    Only for hg19 now (the reference is B37_REF).

    invcf: input VCF passed to --variant.
    outvcf: final output; must be a vcf.gz file.
    is_freebayes_somatic_vcf: when False (the default), --removeUnusedAlternates
        and --excludeNonVariants are added to the GATK call.
    sample_list: sample names to keep, or to drop when exclude_sample is true.
    intervals: a file path (e.g. a BED file), a single interval string
        ('1:100-200'), a single point position ('14:45645954'), or a list or
        tuple of such strings like ['1:100-200', '14:45645954'].
    interval_padding: value for --interval_padding; falsy means no padding.
        It is only emitted together with intervals.
    exclude_sample: use --exclude_sample_name instead of --sample_name.
    hugeVCF: give java -Xmx14g instead of -Xmx4g.
    num_threads: value for --num_threads; falsy omits the option.

    Returns the string 'Done'.
    """
    # TODO remove the tmp_vcf_out when GATK fix the * non-variant issue
    tmp_vcf_out = random_file_at_same_dir(outvcf, prefix='before_removing_star', extension='vcf.gz')

    sample_statement = ''
    if sample_list:
        sample_option = '--exclude_sample_name' if exclude_sample else '--sample_name'
        sample_statement = ''.join('%s %s ' % (sample_option, sample) for sample in sample_list)

    interval_statements = ''
    if intervals:
        # A tuple (or list subclass) must be expanded item by item as well:
        # '%s' % a_tuple would otherwise raise TypeError or garble the option.
        if isinstance(intervals, (list, tuple)):
            interval_items = intervals
        else:
            # it is a file path or a single interval string
            interval_items = [intervals]
        interval_statements = ''.join(' --intervals %s ' % (item,) for item in interval_items)

        if interval_padding:
            interval_statements += ' --interval_padding %s ' % interval_padding

    java_Xmx = '-Xmx14g' if hugeVCF else '-Xmx4g'
    nt_statement = ('--num_threads %s' % num_threads) if num_threads else ''

    if is_freebayes_somatic_vcf:
        remove_unused_alternates_option = ''
    else:
        remove_unused_alternates_option = '--removeUnusedAlternates --excludeNonVariants'

    # GATK SelectVariants does normalizing selected variants
    # The backslash-newline pairs are consumed by Python, so the java call
    # reaches the shell as a single line.
    gatk_command = '''
    java %(java_Xmx)s -jar %(GATKCOMMAND)s \
        -R %(REF)s \
        --variant %(invcf)s \
        -o %(output_vcf)s \
        %(sample_statement)s %(interval_statements)s \
        %(nt_statement)s \
        -T SelectVariants \
        %(remove_unused_alternates_option)s
    ''' % {
        'GATKCOMMAND': GATK_JAR,
        'java_Xmx': java_Xmx,
        'REF': B37_REF,
        'invcf': invcf,
        'output_vcf': tmp_vcf_out,
        'sample_statement': sample_statement,
        'nt_statement': nt_statement,
        'interval_statements': interval_statements,
        'remove_unused_alternates_option': remove_unused_alternates_option
    }
    run_subprocess_cmd(gatk_command)

    # TODO remove the tmp_vcf_out when GATK fix the * non-variant issue
    remove_star_cmd = '''
    {bcftools} view -O z --exclude "ALT='*'" {invcf} > {outvcf}
    {bcftools} index -t {outvcf}
    '''.format(bcftools=BCFTOOLS, invcf=tmp_vcf_out, outvcf=outvcf)
    run_subprocess_cmd(remove_star_cmd)

    remove_vcf_file(tmp_vcf_out)
    return 'Done'