def star_alignment(star_executable, index_path, params):
    '''
    Align paired-end reads with STAR_EXECUTABLE against the STAR genome
    directory INDEX_PATH.

    The two inputs are FILE_PATH/FILE_PREFIX_1.fastq and
    FILE_PATH/FILE_PREFIX_2.fastq. STAR is invoked with
    --outSAMtype BAM SortedByCoordinate, --quantMode TranscriptomeSAM and
    --outSAMunmapped Within.

    params contains
    n - Number of cores to use
    out_prefix - Prefix to use for output files
    file_path - Directory holding the input fastq files
    file_prefix - Prefix to input fastq files
    logfile - Open file handle to a log file

    Raises pi_errors.MyRuntimeError if STAR exits with a non-zero status.
    '''
    def _stamp():
        # Evaluated at emit time so every message carries its own timestamp.
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _stamp() + ': Aligning reads using STAR...',
          file=params.logfile)

    fastq_stem = ''.join([params.file_path, '/', params.file_prefix])
    staralign_call = [
        star_executable,                                    # Base
        '--runThreadN', str(params.n),                      # Threads
        '--genomeDir', index_path,                          # index directory
        '--outFileNamePrefix', params.out_prefix,
        '--readFilesIn', fastq_stem + '_1.fastq', fastq_stem + '_2.fastq',
        '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
        '--outSAMtype', 'BAM', 'SortedByCoordinate',
        '--quantMode', 'TranscriptomeSAM',
        '--outSAMunmapped', 'Within',
    ]

    exit_status = call(staralign_call)
    if exit_status != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': Alignment failed',
                                       params.logfile)
    print('PROGRESS ' + _stamp() + ': Alignment completed. Finishing up...',
          file=params.logfile)
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    This module will create the rsem indexes at params.index_destination using
    RSEM_INDEX_EXECUTABLE. If FASTA_INPUT = True, it will use the bowtie
    version to make bowtie indexes as well.

    bowtie_info is a tuple of (bowtie_path, bowtie_version). It is only
    unpacked when FASTA_INPUT is true.

    params contains
    index_destination - Folder to store the indexes
    n - number of cores to use (not read by this function)
    genome_fasta - path to genomic fasta file. Can also specify DOWNLOAD.
    genome_version - hg19/hg38
    tbtf_executable - path to twoBitToFa (read when genome_fasta is DOWNLOAD)
    logfile - Open file handle to a log file

    params.genome_fasta is overwritten with the value returned by
    prepare.get_genome() or pi_errors.test_param_value().

    RETURN VALUES
    index_path - Path to directory where indexes were stored

    Raises pi_errors.MyRuntimeError if the indexing command exits non-zero.
    '''
    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _stamp() + ': ' + 'Creating rsem references...',
          file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    # If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)

    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(
            params.genome_fasta, 'Genomic Fasta', '--genome_fasta',
            params.logfile)

    # The gtf annotation is always fetched into the index directory
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)

    print('PROGRESS ' + _stamp() + ': ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable]  # base call
    rsem_prepref_call.extend(['--gtf', gencode_file])  # gtf file
    if fasta_input:
        # Fix: these names were previously used without ever being unpacked
        # from bowtie_info, which raised NameError on this branch.
        bowtie_path, bowtie_version = bowtie_info
        rsem_prepref_call.extend([
            ''.join(['--', bowtie_version]),
            ''.join(['--', bowtie_version, '-path']),
            bowtie_path,
        ])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    # Reference name: INDEX_PATH/GENOME_VERSION
    rsem_prepref_call.append(''.join([index_path, '/',
                                      params.genome_version]))
    print(rsem_prepref_call, file=params.logfile)

    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': Indexing Failed',
                                       params.logfile)
    print('PROGRESS ' + _stamp() + ': ' + 'Indexing completed.',
          file=params.logfile)
    return index_path
def main():
    """
    This wrapper script will run the tool cutadapt within the  cutadapt docker
    container for the precision immuno project. The wrapper requires
    1. cutadapt
    2. GNU sed (Tested on version 4.2.1)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    """
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: parse args -> validate executable and adapters -> chdir to the
    # working directory -> run cutadapt on the paired fastqs -> move outputs.

    def _stamp():
        # Fresh timestamp for each log line / error message.
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'cutadapt', 'adapter_fixed')

    # params ERROR handling
    params.cutadapt_executable = pi_errors.test_param_value(
        params.cutadapt_executable, 'cutadapt', '--cutadapt', params.logfile)
    legal_bases = set("ACTGN")
    forward_ok = set(params.fwd_3pr_adapter).issubset(legal_bases)
    if not forward_ok or \
            not set(params.rev_3pr_adapter).issubset(legal_bases):
        raise pi_errors.ParameterError(
            _stamp() +
            ': Adapter sequences can only contain A, C, T, G, and N.',
            params.logfile)

    # Move to working directory before doing I/O intensive work
    os.chdir(params.working_dir)

    # Remove adapter contamination
    print('PROGRESS ' + _stamp() + ': Trimming adapters using cutadapt.',
          file=params.logfile)
    input_stem = ''.join([params.file_path, '/', params.file_prefix])
    cutadapt_call = [
        params.cutadapt_executable,                         # base call
        '-a', params.fwd_3pr_adapter,                       # Fwd read 3' adapter
        '-A', params.rev_3pr_adapter,                       # Rev read 3' adapter
        '-m', '35',                                         # Minimum size of read
        '-o', ''.join([params.file_prefix, '_cutadapt_1.fastq']),
        '-p', ''.join([params.file_prefix, '_cutadapt_2.fastq']),
        input_stem + '_1.fastq',
        input_stem + '_2.fastq',
    ]
    print(' '.join(cutadapt_call), file=params.logfile)

    exit_status = call(cutadapt_call)
    if exit_status != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': cutadapt failed',
                                       params.logfile)

    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process completed', file=params.logfile)
    params.logfile.close()
def star_indexing(star_executable, read_length, params):
    '''
    Build a STAR genome index with STAR_EXECUTABLE.

    READ_LENGTH is rounded to a multiple of 50 (minimum 50) and that value
    names the index directory: INDEX_DESTINATION/STAR_EDGE_references. The
    genome fasta and gtf are fetched into that directory via
    prepare.get_genome() and prepare.get_gtf() before STAR is run in
    genomeGenerate mode.

    params contains
    index_destination - The location where the index should be stored
                        (rewritten in place to its absolute path)
    logfile - Open file handle to a log file
    genome_version - hg19/hg38
    n - number of cores to use
    tbtf_executable - path to twoBitToFa

    RETURN VALUES
    index_path - path to the directory where indexes were stored

    Raises pi_errors.MyRuntimeError if STAR exits with a non-zero status.
    '''
    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    def _progress(message):
        print('PROGRESS ' + _stamp() + ': ' + message, file=params.logfile)

    _progress('Indexing fasta...')
    destination = os.path.abspath(params.index_destination)
    params.index_destination = destination
    if not os.path.exists(destination):
        prepare.py_mkdir(destination)

    # minimum edge size = 50
    edge_size = max(50, int(round(read_length / 50, 0) * 50))
    index_path = ''.join(
        [destination, '/STAR_', str(edge_size), '_references'])
    # make reference directory based on edge size
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)

    genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                      params.tbtf_executable, params.logfile)
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)

    _progress('Running STAR index on fasta reference.')
    # NOTE(review): the directory is named after the rounded edge size but
    # --sjdbOverhang receives the raw read_length -- confirm this is intended.
    starindex_call = [
        star_executable,                        # Base call
        '--runThreadN', str(params.n),          # Threads
        '--runMode', 'genomeGenerate',          # Indexing module
        '--genomeDir', index_path,              # index directory
        '--genomeFastaFiles', genome_fasta,     # Genomic fa
        '--sjdbGTFfile', gencode_file,          # gencode annots
        '--sjdbOverhang', str(read_length),     # edge size
    ]
    print(starindex_call, file=params.logfile)

    if call(starindex_call) != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': Indexing Failed',
                                       params.logfile)
    _progress('Indexing completed.')
    return index_path
def rsem_calculate_expression(rsem_calexp_executable, bowtie_info, index_info,
                              params, fasta_input=None):
    '''
    This module will process the bam or fastq files pointed to by
    PARAMS.FILE_PREFIX using RSEM_CALEXP_EXECUTABLE.

    bowtie_info is a tuple of (bowtie_path, bowtie_version). It is only
    unpacked in fastq mode.
    index_info is a tuple of (index_path, index_prefix)
    fasta_input selects the input mode: True runs RSEM on
    FILE_PATH/FILE_PREFIX_1.fastq and _2.fastq, False runs it on
    FILE_PATH/FILE_PREFIX.bam. When left as None the mode is resolved from a
    module-level ``fasta_input`` if one is defined, otherwise bam mode is
    used when the bam file exists on disk and fastq mode when it does not.

    params contains:
    logfile - Open file handle to a log file
    n - number of cores to use
    file_path, file_prefix - path and prefix to input bam/fastq files

    Raises pi_errors.MyRuntimeError if RSEM exits with a non-zero status.
    '''
    import os  # local import: this block did not previously depend on os

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    print('PROGRESS ' + _stamp() + ': ' +
          'Calculating gene expression using rsem...', file=params.logfile)
    index_path, index_prefix = index_info
    input_stem = ''.join([params.file_path, '/', params.file_prefix])

    # Fix: ``fasta_input`` used to be read without being a parameter or a
    # local, which raised NameError unless a module global happened to exist.
    if fasta_input is None:
        fasta_input = globals().get('fasta_input')
    if fasta_input is None:
        fasta_input = not os.path.exists(input_stem + '.bam')

    rsem_calcexp_call = [rsem_calexp_executable]  # base call
    rsem_calcexp_call.extend(['--paired-end'])
    rsem_calcexp_call.extend(['-p', str(params.n)])
    if not fasta_input:
        rsem_calcexp_call.extend(['--bam', input_stem + '.bam'])
        rsem_calcexp_call.extend(['--no-bam-output'])
    else:
        bowtie_path, bowtie_version = bowtie_info
        rsem_calcexp_call.extend(['--output-genome-bam'])
        rsem_calcexp_call.extend([
            ''.join(['--', bowtie_version]),
            ''.join(['--', bowtie_version, '-path']),
            bowtie_path,
        ])
        rsem_calcexp_call.extend([input_stem + '_1.fastq',
                                  input_stem + '_2.fastq'])
    rsem_calcexp_call.extend(['/'.join([index_path, index_prefix])])
    rsem_calcexp_call.extend([params.file_prefix])
    print(rsem_calcexp_call, file=params.logfile)

    return_value = call(rsem_calcexp_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            'ERROR ' + _stamp() + ': RSEM failed', params.logfile)
    print('PROGRESS ' + _stamp() + ': Gene ' +
          'expression calculated. Finishing up...', file=params.logfile)
def main():
    '''
    This wrapper script will run the entire alignment pipeline for genomic DNA
    (WGS or WXS) from alignment of fastqs, to sorting, indexing, and Read Group
    incorporation. The wrapper can even download and produce bwa references if
    required. The wrapper requires
    1. bwa (For aligning reads)
    2. java (For picard)
    3. picard tools (For read groups)
    4. samtools (For sam/bam manipulation)
    5. twoBitToFa from the kent tools library (For extracting the reference
       genome in case indexing is required)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    '''
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: validate parameters -> build or locate the bwa index -> bwa mem
    # -> sam to bam -> drop the CL field from the last header line ->
    # sort/index -> picard AddOrReplaceReadGroups -> index -> move outputs.
    # Any failing external command raises pi_errors.MyRuntimeError.

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    def _progress(message):
        print('PROGRESS ' + _stamp() + ': ' + message, file=params.logfile)

    def _run(command, failure_message, **call_kwargs):
        # Run an external command and abort the pipeline on non-zero exit.
        if call(command, **call_kwargs) != 0:
            raise pi_errors.MyRuntimeError(_stamp() + failure_message,
                                           params.logfile)

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'bwa', 'bwa_alignment')

    # Params ERROR handling
    # The memory option for java should be of the form Xmx10G or Xmx10M
    if not params.java_Xmx.endswith(('G', 'M')):
        raise pi_errors.ParameterError(
            _stamp() + ': Please use a suitable value for --Xmx.',
            params.logfile)
    params.bwa_executable = pi_errors.test_param_value(
        params.bwa_executable, 'bwa', '--bwa', params.logfile)
    params.samtools_executable = pi_errors.test_param_value(
        params.samtools_executable, 'samtools', '--samtools', params.logfile)
    params.java_executable = pi_errors.test_param_value(
        params.java_executable, 'java', '--java', params.logfile)
    # If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)
    if not params.picard_jar.endswith('jar'):
        raise pi_errors.ParameterError(
            _stamp() + ': Please specify a valid jar file for picard!',
            params.logfile)
    params.picard_jar = pi_errors.test_param_value(
        params.picard_jar, 'picard', '--picard_jar', params.logfile)

    # NOTE(review): RGID is defaulted here, but the picard call below
    # hard-codes ID=1 and never reads it -- confirm which is intended.
    if params.RGID is None:
        params.RGID = params.file_prefix

    # Check for indexes. If the user has specified that indexes need to
    # be created then do so.
    if params.index_location is None:
        _progress('Indexing fasta...')
        # Absolute path: the script changes directory before aligning.
        params.index_destination = os.path.abspath(params.index_destination)
        if not os.path.exists(params.index_destination):
            prepare.py_mkdir(params.index_destination)
        index_path = params.index_destination
        # Fix: the validated attribute is tbtf_executable; the previous
        # params.twoBitToFa_executable is never set in this function.
        genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                          params.tbtf_executable,
                                          params.logfile)
        _progress('Running BWA index on fasta reference.')
        _run([params.bwa_executable, 'index', genome_fasta],
             ': bwa index failed.')
        _progress('Running samtools faidx.')
        _run([params.samtools_executable, 'faidx', genome_fasta],
             ': samtools faidx failed')
        index_prefix = genome_fasta
        _progress('Indexing completed.')
    else:
        # Absolute path: the script changes directory before aligning.
        params.index_location = os.path.abspath(params.index_location)
        if params.index_location.endswith('.fa'):
            # Raised explicitly (not assert) so the check survives python -O
            if not os.path.exists(params.index_location):
                raise pi_errors.InputFileError(
                    _stamp() + ': Index file not found', params.logfile)
            index_prefix = params.index_location
        else:
            fastas = [x for x in os.listdir(params.index_location)
                      if x.endswith(".fa")]
            if len(fastas) == 1:
                index_prefix = "".join(
                    [params.index_location, '/', fastas[0]])
            elif not fastas:
                raise pi_errors.InputFileError(
                    _stamp() +
                    ': No valid fasta found in provided index folder',
                    params.logfile)
            else:
                raise pi_errors.InputFileError(
                    _stamp() +
                    ':Multiple fastas found in provided index folder. Try ' +
                    'running with --index_location /path/to/file/filename.fa',
                    params.logfile)

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)
    prefix = params.file_prefix
    input_stem = ''.join([params.file_path, '/', prefix])

    # Align reads to sam file
    _progress('Aligning reads to reference.')
    bwa_call = [params.bwa_executable, 'mem']  # base call
    bwa_call.extend(['-t', str(params.n)])  # Number of threads
    bwa_call.append(index_prefix)  # bwa index
    bwa_call.append(input_stem + '_1.fastq')
    bwa_call.append(input_stem + '_2.fastq')
    print(' '.join(bwa_call), file=params.logfile)
    with open(prefix + '.sam', 'w') as samfile, \
            open(prefix + '_bwa_log.txt', 'w') as bwa_log:
        _run(bwa_call, ': bwa mem failed.', stdout=samfile, stderr=bwa_log)
    _progress('Alignment completed. Converting to bam')

    # Convert the sam to a bam file (exit status was previously ignored)
    with open(prefix + '.bam', 'w') as bamfile:
        _run([params.samtools_executable, 'view', '-bS', prefix + '.sam'],
             ': samtools view failed.', stdout=bamfile)
    call(['rm', prefix + '.sam'])
    _progress('bam file created. Preparing file for inserting RG into '
              'header.')

    # Fix PG line
    sam_header = check_output(
        [params.samtools_executable, 'view', '-H', prefix + '.bam'])
    if not isinstance(sam_header, str):
        # check_output returns bytes on Python 3
        sam_header = sam_header.decode('utf-8')
    sam_header = sam_header.strip().split('\n')
    # The last header line is taken to be the @PG line; split it by tab and
    # drop its CL field.
    pg_line = sam_header[-1].split('\t')
    sam_header[-1] = '\t'.join(
        [x for x in pg_line if not x.startswith('CL')])
    with open(prefix + '_sam.header', 'w') as hdr_file:
        print('\n'.join(sam_header), file=hdr_file)
    with open(prefix + '_fixPG.bam', 'w') as fixpg_bamfile:
        _run([params.samtools_executable, 'reheader',
              prefix + '_sam.header', prefix + '.bam'],
             ': samtools reheader failed', stdout=fixpg_bamfile)
    call(['rm', prefix + '.bam', prefix + '_sam.header'])

    # Sort and Index the _fixPG.bam file
    _run([params.samtools_executable, 'sort', prefix + '_fixPG.bam',
          prefix + '_fixPG_sorted'], ': samtools sort failed.')
    _run([params.samtools_executable, 'index',
          prefix + '_fixPG_sorted.bam'], ': samtools index failed.')
    call(['rm', prefix + '_fixPG.bam'])
    _progress('Inserting @RG tag into header.')

    # Reheader the indexed _fixPG_sorted.bam to prepare for mutect
    picard_call = [
        params.java_executable, ''.join(['-Xmx', params.java_Xmx]),
        '-jar',                                             # Base java call
        params.picard_jar,                                  # picard
        'AddOrReplaceReadGroups',                           # module
        'CREATE_INDEX=true',
        ''.join(['I=', prefix, '_fixPG_sorted.bam']),
        ''.join(['O=', prefix, '_fixPG_sorted_reheader.bam']),
        'SO=coordinate',
        'ID=1',
        ''.join(['LB=', prefix]),
        'PL=ILLUMINA',
        'PU=12345',
        ''.join(['SM=', params.sample_type]),
    ]
    with open(prefix + '_picard_log.txt', 'w') as picard_log:
        _run(picard_call, ': picard AddOrReplaceReadGroups failed.',
             stdout=picard_log)
    _progress('@RG inserted. Indexing bam')

    # Index _fixPG_sorted_reheader.bam file
    _run([params.samtools_executable, 'index',
          prefix + '_fixPG_sorted_reheader.bam'],
         ': samtools index failed.')

    # Remove intermediate files
    call(['rm', prefix + '_fixPG_sorted.bam',
          prefix + '_fixPG_sorted.bam.bai'])
    # Fix: this message previously lacked file=, so it went to stdout with
    # the repr of the logfile object appended.
    _progress('Alignment completed. Finishing up...')

    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process ' + 'completed',
          file=params.logfile)
    params.logfile.close()
def main():
    '''
    This wrapper script will run the tool PHLAT within the  phlat docker
    container for the precision immuno project. The wrapper requires:
    1. PHLAT.py
    2. bowtie2
    3. gdown.pl (For donwloading the PHLAT Index - available from
       https://raw.githubusercontent.com/Nanolx/patchimage/master/tools/gdown.pl)

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    '''
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: validate executables -> download+extract the PHLAT index (or
    # verify a user-supplied one) -> chdir -> run PHLAT.py -> move outputs.

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    def _progress(message):
        print('PROGRESS ' + _stamp() + ': ' + message, file=params.logfile)

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'phlat', 'MHC_typing')

    # params ERROR handling
    # --phlat may name either PHLAT.py itself or the directory containing it
    if not params.phlat_executable.endswith('PHLAT.py'):
        params.phlat_executable = '/'.join(
            [params.phlat_executable, 'PHLAT.py'])
    params.phlat_executable = pi_errors.test_param_value(
        params.phlat_executable, 'PHLAT', '--phlat', params.logfile)
    params.bowtie2_executable = pi_errors.test_param_value(
        params.bowtie2_executable, 'bowtie2', '--bowtie2', params.logfile)
    # Two directory levels above PHLAT.py
    phlat_dir = os.path.dirname(os.path.dirname(params.phlat_executable))
    params.gdownpl_executable = pi_errors.test_param_value(
        params.gdownpl_executable, 'gdown.pl', '--gdownpl', params.logfile)

    if params.index_location is not None:
        # A user-supplied index must contain the expected bowtie2 file
        params.index_location = os.path.abspath(params.index_location)
        marker = ''.join([params.index_location, '/ucsc.artHLA.1.bt2'])
        if not os.path.exists(marker):
            raise pi_errors.InputFileError(
                _stamp() + ': Index file not found.', params.logfile)
        index_path = params.index_location
    else:
        _progress('Downloading Indexes...')
        params.index_destination = os.path.abspath(params.index_destination)
        if not os.path.exists(params.index_destination):
            prepare.py_mkdir(params.index_destination)
        tarball = ''.join([params.index_destination, '/index4phlat.tar.gz'])
        getindex_call = [
            params.gdownpl_executable,
            'https://drive.google.com'
            '/uc?export=download&confirm=yAjx&id=0Bz-w5tutuZIYY3'
            'h5YlMzTjhnbGM',
            tarball,
        ]
        print(getindex_call, file=params.logfile)
        if call(getindex_call) != 0:
            raise pi_errors.MyRuntimeError(
                _stamp() +
                ': Could not download indexes. Try manually downloading.',
                params.logfile)
        extract_call = ['tar', '-C', params.index_destination, '-zxvf',
                        tarball]
        if call(extract_call) != 0:
            raise pi_errors.MyRuntimeError(
                _stamp() + ': Index4phlat could not be extracted.',
                params.logfile)
        # Extraction succeeded; the archive is no longer needed
        call(['rm', tarball])
        index_path = '/'.join([params.index_destination, 'index4phlat'])
        _progress('Indexes Downloaded.')

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)
    _progress('Begining MHC Haplotyping')
    fastq_stem = ''.join([params.file_path, '/', params.file_prefix])
    system_call = [
        '/usr/bin/env', 'python2.7', '-O', params.phlat_executable,
        '-1', fastq_stem + '_1.fastq',                      # Fq1
        '-2', fastq_stem + '_2.fastq',                      # Fq2
        '-index', index_path,                               # Index files
        '-b2url', params.bowtie2_executable,                # Bowtie2
        '-tag', params.out_prefix,                          # DNA/RNA
        '-e', phlat_dir,                                    # Phlat directory home
        '-o', params.outdir,                                # Output directory
        '-p', str(params.n),                                # Number of threads
    ]

    # Call the program
    if call(system_call) != 0:
        raise pi_errors.MyRuntimeError(
            _stamp() + ': MHC Haplotyping failed.', params.logfile)
    _progress('Alignment completed. Finishing up...')

    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process completed', file=params.logfile)
    params.logfile.close()
def main():
    """
    This wrapper script will run the tool mutect within the  mutect docker
    container for the precision immuno project. The wrapper requires
    1. mutect
    2. java (For running mutect)
    3. twoBitToFa from the kent tools library (For extracting the reference
       genome in case indexing is required)
    4. lftp for downloading the cosmic vcf

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    """
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: validate parameters -> resolve/download dbsnp, cosmic and genome
    # fasta -> chdir -> run MuTect -> write a copy of the output without the
    # REJECT calls -> move outputs.

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'mutect', 'mutect_calls')

    # params ERROR handling
    if not params.java_Xmx.endswith(('G', 'M')):
        raise pi_errors.ParameterError(
            _stamp() + ': Please use a suitable value for --Xmx.',
            params.logfile)
    params.java_executable = pi_errors.test_param_value(
        params.java_executable, 'java', '--java', params.logfile)
    params.mutect_jar = pi_errors.test_param_value(
        params.mutect_jar, 'Mutect jar', '--mutect_jar', params.logfile)
    # If Indexing is required, does twoBitToFa point to a valid file?
    if params.index_location is None:
        params.tbtf_executable = pi_errors.test_param_value(
            params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
            params.logfile)

    # Do the dbsnp and cosmic vcfs exist?
    if params.dbsnp_file == 'DOWNLOAD' or params.cosmic_file == 'DOWNLOAD':
        # First ensure the vcf storage location has been provided
        if params.vcf_location is None:
            raise pi_errors.ParameterError(
                _stamp() +
                ': --vcf_location cannot be empty if either --cosmic, ' +
                '--dbsnp, or --genome_fasta are empty.', params.logfile)
        params.vcf_location = os.path.abspath(params.vcf_location)

    # Download dbsnp file if required (reuse a previously downloaded copy)
    if params.dbsnp_file == 'DOWNLOAD':
        dbsnp_path = '/'.join([params.vcf_location, '00-All.vcf'])
        if os.path.exists(dbsnp_path):
            params.dbsnp_file = dbsnp_path
        else:
            params.dbsnp_file = prepare.download_vcf('dbsnp', params)
    # Download cosmic file if required (reuse a previously downloaded copy)
    if params.cosmic_file == 'DOWNLOAD':
        cosmic_path = '/'.join([params.vcf_location, 'Cosmic_sorted.vcf'])
        if os.path.exists(cosmic_path):
            params.cosmic_file = cosmic_path
        else:
            params.cosmic_file = prepare.download_vcf('cosmic', params)

    # Download genome fasta if required
    if params.genome_fasta == 'DOWNLOAD':
        if params.vcf_location is None:
            # If params.vcf_location is None, set it to the output directory
            params.vcf_location = params.outdir
        # Does the fasta exist in the vcf_location directory?
        fasta_path = ''.join([params.vcf_location, '/',
                              params.genome_version, '.fa'])
        if os.path.exists(fasta_path):
            params.genome_fasta = fasta_path
        else:
            params.genome_fasta = prepare.get_genome(
                params.genome_version, params.vcf_location,
                params.tbtf_executable, params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(
            params.genome_fasta, 'Genomic Fasta', '--genome_fasta',
            params.logfile)

    # Move to working directory before doing I/O intensive work
    os.chdir(params.working_dir)

    # Call the program
    raw_output = ''.join([params.out_prefix, '.out'])
    mutect_call = [params.java_executable,
                   ''.join(['-Xmx', params.java_Xmx]),
                   '-jar']  # Base java call
    mutect_call.append(params.mutect_jar)
    mutect_call.extend(['-T', 'MuTect'])
    mutect_call.extend(['-R', params.genome_fasta])
    mutect_call.extend(['--cosmic', params.cosmic_file])
    mutect_call.extend(['--dbsnp', params.dbsnp_file])
    mutect_call.extend(['--input_file:normal', params.norm_d_file])
    mutect_call.extend(['--input_file:tumor', params.tum_d_file])
    mutect_call.extend(['--out', raw_output])
    return_value = call(mutect_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(_stamp() + ': MuTect failed.',
                                       params.logfile)

    # Copy the output, dropping rows whose judgement column is REJECT.
    # Column 50 is the historical position; it is re-read from the header
    # row (the line starting with 'contig') when that row names the column.
    judgement_col = 50
    with open(raw_output, 'r') as mutect_file, \
            open(''.join([params.out_prefix, 'non_rejected.out']), 'w') as \
            nr_file:
        for line in mutect_file:
            line = line.strip()
            if not line:
                # Blank lines used to crash on the column lookup below
                continue
            if line.startswith('#'):
                print(line, file=nr_file)
                continue
            if line.startswith('contig'):
                print('#' + line, file=nr_file)
                header = line.split('\t')
                if 'judgement' in header:
                    judgement_col = header.index('judgement')
                continue
            fields = line.split('\t')
            if fields[judgement_col] == 'REJECT':
                continue
            # Fix: print(list, sep='\t') wrote the list repr, not the row
            print('\t'.join(fields), file=nr_file)

    print('PROGRESS ' + _stamp() + ': ' +
          'Mutect run completed. Finishing up...', file=params.logfile)
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process ' + 'completed',
          file=params.logfile)
    params.logfile.close()
def main():
    """
    This wrapper script will run the tool radia within the  radia docker
    container for the precision immuno project. The wrapper requires
    1. radia
    2. snpeff (if the --use_snpeff flag is used)
    3. twoBitToFa from the kent tools library (For extracting the reference
       genome in case indexing is required)
    4. lftp for downloading the cosmic vcf

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.

    If you want to use a genome build other than hg19 then download cosmic
    and dbsnp vcfs manually and pass them to this program. This program
    currently only works with hg19 due to how cosmic and NCBI's ownload pages
    work. Other options may be made available in the future.

    The vcfs for the various databases are assumed to be in the parent folders
    of the radia executable (../data/*). If the data isn't found is the parent
    directories, the program will search VCF_LOCATION before throwing a warning
    and continuing without the said database. Only dbsnp and cosmic are
    downloaded if not present.
    """
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: parse + process parameters -> chdir -> run radia once per
    # chromosome -> run the filter step once per chromosome and delete that
    # chromosome's unfiltered vcf/log -> move outputs.

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    def _progress(message):
        print('PROGRESS ' + _stamp() + ': ' + message, file=params.logfile)

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'radia', 'radia_calls')

    # params ERROR handling and processing
    database_map = process_parameters(params)

    # Move to working directory before doing I/O intensive alignment
    os.chdir(params.working_dir)

    # Call the program
    _progress('Starting radia run.')
    for chrom in params.chromosome:
        chrom_stem = ''.join([params.out_prefix, '_', chrom])
        radia_call = [params.radia_executable]  # Base radia call
        radia_call.extend([params.out_prefix, chrom])
        radia_call.extend(['-n', params.norm_d_file])
        radia_call.extend(['-t', params.tum_d_file])
        if params.tum_r_file is not None:
            radia_call.extend(['-r', params.tum_r_file])
            radia_call.append(
                ''.join(['--rnaTumorFasta=', params.rna_fasta]))
        radia_call.extend(['-f', params.genome_fasta])
        radia_call.extend(['-o', chrom_stem + '.vcf'])
        radia_call.extend(['-i', params.genome_version])
        radia_call.extend(['-m', params.genome_fasta])
        radia_call.extend(['-d', params.data_source])
        radia_call.extend(['-q', params.seq_platform])
        # Fix: the literal string 'params.disease' used to be passed here
        radia_call.extend(['--disease', params.disease])
        radia_call.extend(['-l', '\"INFO\"'])
        radia_call.extend(['-g', chrom_stem + '.log'])
        return_value = call(radia_call)
        if return_value != 0:
            raise pi_errors.MyRuntimeError(_stamp() + ': radia failed.',
                                           params.logfile)

    # Call radia filtering
    _progress('Radia completed. Running FilterRadia now.')
    for chrom in params.chromosome:
        chrom_stem = ''.join([params.out_prefix, '_', chrom])
        # NOTE(review): the filter step is launched with the same executable
        # as the radia step above -- confirm that is the intended program.
        filter_radia_call = [params.radia_executable]  # Base filter call
        filter_radia_call.extend([params.out_prefix, chrom])
        filter_radia_call.append(chrom_stem + '_filtered.vcf')
        filter_radia_call.append(params.working_dir)
        filter_radia_call.append(os.path.split(params.radia_executable)[0])
        filter_radia_call.extend(['-b', database_map['blacklist']])
        filter_radia_call.extend(['-d'])
        filter_radia_call.extend(['-r', database_map['retrogenes']])
        filter_radia_call.extend(['-p', database_map['pseudogenes']])
        filter_radia_call.extend(['-c'])
        filter_radia_call.extend(['-t', database_map['broad_targets']])
        if params.use_snpeff:
            filter_radia_call.extend(['-s', params.snpeff_jar])
            filter_radia_call.extend(['-e', params.genome_version])
            if not params.no_canonical:
                # Fix: append(['--canonical']) nested a list inside argv,
                # which subprocess rejects.
                filter_radia_call.append('--canonical')
        else:
            filter_radia_call.append('--noSnpEff')
        filter_radia_call.extend(
            ['--rnaGeneBlckFile', database_map['rna_blacklist']])
        filter_radia_call.extend(
            ['--rnaGeneFamilyBlckFile',
             database_map['rna_family_blacklist']])
        filter_radia_call.extend(['-f', params.genome_fasta])
        filter_radia_call.extend(['-l', '\"INFO\"'])
        filter_radia_call.extend(['-g', chrom_stem + '_filter.log'])
        # Fix: this used to re-run the last radia_call, so the filter
        # command built above was never executed.
        return_value = call(filter_radia_call)
        if return_value != 0:
            raise pi_errors.MyRuntimeError(
                _stamp() + ': FilterRadia failed.', params.logfile)
        # Drop this chromosome's unfiltered intermediates
        call(['rm', chrom_stem + '.log', chrom_stem + '.vcf'])

    _progress('Radia run completed. Finishing up...')
    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process ' + 'completed',
          file=params.logfile)
    params.logfile.close()
def main():
    """
    This wrapper script will run the IEDB tools within the  cutadapt docker
    container for the precision immuno project. The wrapper requires
    1. IEDB tools for MHCI prediction - http://tools.iedb.org/mhci
    2. netMHCIIpan - In case IEDB tools fails
    3. python

    This script requires an input file (--file_prefix) that contains
    (2 * PEPLEN - 1)-mer fasta records for analysis. FILE_PREFIX must be a
    .faa file.

    Unless specified, the program will look for default executables on $PATH.
    The program DOES NOT look for jar files and they are required to be
    passed during execution.
    """
    # The docstring above is also runtime text: it is passed to
    # prepare.parse_args() below, so maintainer notes are kept in comments.
    #
    # Flow: for every allele (and every entry of params.peplen) run the IEDB
    # predictor into OUT_PREFIX_STRIPPED-ALLELE.tsv; if that exits non-zero
    # fall back to netMHCIIpan; if that also fails raise MyRuntimeError.

    def _stamp():
        return dt.now().strftime('%I:%M %p %b %d, %Y')

    # Parse the arguments using prepare.parse_args()
    params = prepare.parse_args(main.__doc__, 'mhc', 'mhci_predictions')

    # Params ERROR handling
    # Fix: the result used to be bound to the misspelled name 'pepilename'
    # while every use read 'pepfilename' (NameError).
    # NOTE(review): an older comment described this value as a dict keyed by
    # peptide length; the calls below pass it as a single path -- confirm
    # what process_parameters() returns.
    pepfilename = process_parameters(params)

    # Move to working directory before doing I/O intensive work
    os.chdir(params.working_dir)

    # set up the different allele regexes
    strip_allele_regex = re.compile(r'[\*:/-]')  # For strip allele
    dpqa_allele_regex_1 = re.compile(r'[\*:]')   # For DPA and DQA if
    dpqa_allele_regex_2 = re.compile(r'/')       # netMHCIIpan is used

    for allele in params.alleles:
        # Strip allele converts HLA-DRB1*15:01 to HLA_DRB1_15_01 and
        # HLA-DQA1*01:02/DQB1*03:02 to HLA_DQA1_01_02_DQB1_03_02
        strip_allele = re.sub(strip_allele_regex, '_', allele)
        # Fix: name the output after the stripped allele. The raw allele can
        # contain '/', which cannot be part of a file name.
        mhc_outfile_name = ''.join(
            [params.out_prefix, '_', strip_allele, '.tsv'])

        # NOTE(review): peptide_length is not used by either command, so the
        # same prediction is repeated for each entry -- confirm intent.
        for peptide_length in params.peplen:
            # Setup the call
            mhc_ii_call = ['python', params.mhc_executable]  # base call
            mhc_ii_call.append(params.pred_meth)  # prediction method
            mhc_ii_call.append(allele)  # Allele
            mhc_ii_call.append(pepfilename)
            with open(mhc_outfile_name, 'w') as mhc_outfile:
                return_value = call(mhc_ii_call, stdout=mhc_outfile,
                                    stderr=params.logfile)
            if return_value == 0:
                continue

            print('WARNING: IEDBtools failed. Attempting netMHCIIpan',
                  file=params.logfile)
            # netMHCIIpan needs a different formatting for allele
            # HLA-DQA1*01:02/DQB1*03:02 should be HLA-DQA10102-DQB10302
            # HLA-DRB1*15:01 should be DRB1_1501. DP and DQ are similar
            # Fix: use a separate name so the loop's allele is not
            # overwritten for the next peptide length.
            if allele.startswith(('HLA-DQ', 'HLA-DP')):
                netmhc_allele = re.sub(dpqa_allele_regex_1, '', allele)
                netmhc_allele = re.sub(dpqa_allele_regex_2, '-',
                                       netmhc_allele)
            else:
                # Fix: strip_allele[4:] produced DRB1_15_01, not the
                # DRB1_1501 form described above.
                netmhc_allele = allele[4:].replace('*', '_').replace(':', '')
            netmhc_ii_call = [params.netmhciipan_executable]
            netmhc_ii_call.extend(['-a', netmhc_allele])  # Allele
            # Fix: argv entries must be strings; this used to be the int 1
            netmhc_ii_call.extend(['-xls', '1'])
            netmhc_ii_call.extend(['-xlsfile', mhc_outfile_name])
            netmhc_ii_call.extend(['-f', pepfilename])
            # Fix: the fallback used to re-run the failed IEDB command
            return_value = call(netmhc_ii_call, stderr=params.logfile)
            if return_value != 0:
                raise pi_errors.MyRuntimeError(
                    _stamp() + ': MHCII prediction failed.', params.logfile)

    # Move files from temp directory to outdir
    prepare.move_output(params)
    print('RESULT ' + _stamp() + ': Process ' + 'completed',
          file=params.logfile)
    params.logfile.close()