def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    This module will create the rsem indexes at params.index_destination using
    RSEM_INDEX_EXECUTABLE.  If FASTA_INPUT is True, the bowtie flavour described
    by BOWTIE_INFO is passed to the executable so that bowtie indexes are made
    as well; otherwise '--no-bowtie' is passed.

    bowtie_info is a tuple of (bowtie_path, bowtie_version).  It is only
    unpacked when FASTA_INPUT is True.

    params contains
    index_destination - Folder to store the indexes
    n - number of cores to use (not read by this function)
    genome_fasta - path to genomic fasta file.  Can also specify DOWNLOAD.
    genome_version - hg19/hg38
    tbtf_executable - handed to prepare.get_genome when genome_fasta is DOWNLOAD
    logfile - Open file handle to a log file

    Side effect: params.genome_fasta is overwritten with the value returned by
    prepare.get_genome or pi_errors.test_param_value.

    RETURN VALUES
    index_path - Path to directory where indexes were stored

    Raises pi_errors.MyRuntimeError if the executable exits with a non-zero
    return value.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...', file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    # If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(
            params.genome_version, index_path, params.tbtf_executable,
            params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(
            params.genome_fasta, 'Genomic Fasta', '--genome_fasta',
            params.logfile)
    # The gtf file is always obtained through prepare.get_gtf
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable]  # base call
    rsem_prepref_call.extend(['--gtf', gencode_file])  # gtf file
    if fasta_input:
        # bowtie_path and bowtie_version were previously referenced without
        # ever being defined (NameError); take them from bowtie_info.
        bowtie_path, bowtie_version = bowtie_info
        rsem_prepref_call.extend([
            ''.join(['--', bowtie_version]),
            ''.join(['--', bowtie_version, '-path']), bowtie_path])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    # Last argument: <index_path>/<genome_version>
    rsem_prepref_call.extend(
        [''.join([index_path, '/', params.genome_version])])
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.', file=params.logfile)
    return index_path
def rsem_index(rsem_index_executable, fasta_input, bowtie_info, params):
    '''
    This module will create the rsem indexes at params.index_destination using
    RSEM_INDEX_EXECUTABLE.  If FASTA_INPUT = True, the bowtie version and path
    taken from BOWTIE_INFO are added to the command so that bowtie indexes are
    made as well.  If it is False, '--no-bowtie' is added instead and
    BOWTIE_INFO is not looked at.

    bowtie_info is a tuple of (bowtie_path, bowtie_version)

    params contains
    index_destination - Folder to store the indexes
    n - number of cores to use (not read by this function)
    genome_fasta - path to genomic fasta file.  Can also specify DOWNLOAD.
    genome_version - hg19/hg38
    tbtf_executable - handed to prepare.get_genome when genome_fasta is DOWNLOAD
    logfile - Open file handle to a log file

    params.genome_fasta is replaced by the value returned from
    prepare.get_genome (DOWNLOAD) or pi_errors.test_param_value (otherwise).

    RETURN VALUES
    index_path - Path to directory where indexes were stored

    Raises pi_errors.MyRuntimeError when the indexing command returns a
    non-zero value.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Creating rsem references...', file=params.logfile)
    index_path = os.path.abspath(params.index_destination)
    # If the directory doesn't exist, create it
    if not os.path.exists(index_path):
        prepare.py_mkdir(index_path)
    if params.genome_fasta == 'DOWNLOAD':
        params.genome_fasta = prepare.get_genome(params.genome_version,
                                                 index_path,
                                                 params.tbtf_executable,
                                                 params.logfile)
    else:
        params.genome_fasta = pi_errors.test_param_value(params.genome_fasta,
                                                         'Genomic Fasta',
                                                         '--genome_fasta',
                                                         params.logfile)
    # Obtain the gtf file via prepare.get_gtf
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Running rsem-prepare-reference on fasta reference.',
          file=params.logfile)
    rsem_prepref_call = [rsem_index_executable]  # base call
    rsem_prepref_call.extend(['--gtf', gencode_file])  # gtf file
    if fasta_input:
        # Unpack here (and only here): the names bowtie_path/bowtie_version
        # used below were never bound before, which raised a NameError.
        bowtie_path, bowtie_version = bowtie_info
        rsem_prepref_call.extend([''.join(['--', bowtie_version]),
                                  ''.join(['--', bowtie_version, '-path']),
                                  bowtie_path])
    else:
        rsem_prepref_call.append('--no-bowtie')
    rsem_prepref_call.append(params.genome_fasta)
    # Final positional argument is <index_path>/<genome_version>
    rsem_prepref_call.extend([''.join([index_path, '/',
                                       params.genome_version])])
    print(rsem_prepref_call, file=params.logfile)
    return_value = call(rsem_prepref_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Indexing Failed', params.logfile)
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') + ': ' +
          'Indexing completed.', file=params.logfile)
    return index_path
def star_indexing(star_executable, read_length, params):
    '''
    Index a genome with STAR_EXECUTABLE.  READ_LENGTH is rounded to a multiple
    of 50 (never below 50) to name the index directory, and is passed as is to
    --sjdbOverhang.

    params contains
    index_destination - The location where the index should be stored
                        (rewritten in place to its absolute path)
    logfile - Open file handle to a log file
    genome_version - hg19/hg38
    n - number of cores to use
    tbtf_executable - path to twoBitToFa

    RETURN VALUES
    index_path - path to the directory where indexes were stored

    Raises pi_errors.MyRuntimeError if STAR returns a non-zero value.
    '''
    stamp = '%I:%M %p %b %d, %Y'
    log = params.logfile
    print('PROGRESS ' + dt.now().strftime(stamp) + ': ' + 'Indexing fasta...',
          file=log)
    destination = os.path.abspath(params.index_destination)
    params.index_destination = destination
    if not os.path.exists(destination):
        prepare.py_mkdir(destination)
    # Round to a multiple of 50; the minimum edge size is 50
    rounded = int(round(read_length / 50, 0) * 50)
    edge_size = max(50, rounded)
    index_path = destination + '/STAR_' + str(edge_size) + '_references'
    # NOTE(review): only the mkdir is treated as conditional here; the
    # original indentation was lost, so confirm the rest of the indexing was
    # not meant to be skipped when the directory already exists.
    if not os.path.exists(index_path):  # make reference based on edge size
        prepare.py_mkdir(index_path)
    genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                      params.tbtf_executable, log)
    gencode_file = prepare.get_gtf(params.genome_version, index_path, log)
    print('PROGRESS ' + dt.now().strftime(stamp) + ': ' +
          'Running STAR index on fasta reference.', file=log)
    # NOTE(review): the directory is keyed on edge_size but --sjdbOverhang
    # receives the raw read_length - confirm this is intended.
    starindex_call = [
        star_executable,                     # Base call
        '--runThreadN', str(params.n),       # Threads
        '--runMode', 'genomeGenerate',       # Indexing module
        '--genomeDir', index_path,           # index directory
        '--genomeFastaFiles', genome_fasta,  # Genomic fa
        '--sjdbGTFfile', gencode_file,       # gencode annots
        '--sjdbOverhang', str(read_length),  # overhang
    ]
    print(starindex_call, file=log)
    if call(starindex_call) != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime(stamp) + ': Indexing Failed', log)
    print('PROGRESS ' + dt.now().strftime(stamp) + ': ' +
          'Indexing completed.', file=log)
    return index_path
def star_indexing(star_executable, read_length, params):
    '''
    Build a STAR index with STAR_EXECUTABLE.  The index directory is named
    after READ_LENGTH rounded to a multiple of 50 (with a floor of 50), while
    READ_LENGTH itself is what is given to --sjdbOverhang.

    params contains
    index_destination - The location where the index should be stored; it is
                        replaced by its absolute path
    logfile - Open file handle to a log file
    genome_version - hg19/hg38
    n - number of cores to use
    tbtf_executable - path to twoBitToFa

    RETURN VALUES
    index_path - path to the directory where indexes were stored

    Raises pi_errors.MyRuntimeError when the STAR call returns non-zero.
    '''
    time_format = '%I:%M %p %b %d, %Y'
    print('PROGRESS ' + dt.now().strftime(time_format) + ': ' +
          'Indexing fasta...', file=params.logfile)
    params.index_destination = os.path.abspath(params.index_destination)
    if not os.path.exists(params.index_destination):
        prepare.py_mkdir(params.index_destination)
    # minimum edge size = 50
    edge_size = int(round(read_length / 50, 0) * 50)
    if edge_size < 50:
        edge_size = 50
    index_path = '{0}/STAR_{1}_references'.format(params.index_destination,
                                                  str(edge_size))
    # NOTE(review): source indentation was lost; only directory creation is
    # assumed to be guarded by this check - verify against the original file.
    if not os.path.exists(index_path):  # make reference based on edge size
        prepare.py_mkdir(index_path)
    genome_fasta = prepare.get_genome(params.genome_version, index_path,
                                      params.tbtf_executable, params.logfile)
    gencode_file = prepare.get_gtf(params.genome_version, index_path,
                                   params.logfile)
    print('PROGRESS ' + dt.now().strftime(time_format) + ': ' +
          'Running STAR index on fasta reference.', file=params.logfile)
    # (flag, value) pairs appended after the executable, in this order.
    # NOTE(review): --sjdbOverhang gets read_length, not the rounded
    # edge_size used for the directory name - confirm.
    options = (
        ('--runThreadN', str(params.n)),       # Threads
        ('--runMode', 'genomeGenerate'),       # Indexing module
        ('--genomeDir', index_path),           # index directory
        ('--genomeFastaFiles', genome_fasta),  # Genomic fa
        ('--sjdbGTFfile', gencode_file),       # gencode annots
        ('--sjdbOverhang', str(read_length)),  # overhang
    )
    starindex_call = [star_executable]  # Base call
    for flag, value in options:
        starindex_call.extend([flag, value])
    print(starindex_call, file=params.logfile)
    return_value = call(starindex_call)
    if return_value != 0:
        raise pi_errors.MyRuntimeError(
            dt.now().strftime(time_format) + ': Indexing Failed',
            params.logfile)
    print('PROGRESS ' + dt.now().strftime(time_format) + ': ' +
          'Indexing completed.', file=params.logfile)
    return index_path
def _split_data_directory(location):
    '''
    Split a user supplied snpEff location into (parent, data_directory).

    If the last path component of LOCATION (ignoring trailing slashes) is
    'data', LOCATION itself is the data directory and its parent is returned
    as the first item.  Otherwise LOCATION is the parent and the data directory
    is LOCATION + '/data'.
    '''
    trimmed = location.rstrip('/')
    if os.path.basename(trimmed) == 'data':
        # os.path.dirname removes exactly the final 'data' component.  The
        # earlier .rstrip('/data') removed any trailing run of the characters
        # '/', 'd', 'a', 't' and so also ate into the parent directory name
        # (e.g. '/x/beta/data' became '/x/be').
        return os.path.dirname(trimmed), location
    return location, '/'.join([location, 'data'])


def process_parameters(params):
    '''
    This module conducts the error handling for all parameters passed to the
    program.

    params attributes read: logfile, file_path, file_prefix, java_Xmx,
    java_executable, snpeff_jar, config_file, reference_name, index_location,
    index_destination, genome_version, genome_fasta, tbtf_executable.

    params attributes written (depending on the branch taken):
    java_executable, snpeff_jar, use_snpeff_db, data_directory,
    index_destination or index_location, reference_name, genome_fasta,
    tbtf_executable, gtf_file, config_file.

    When no index_location is given and a packaged snpEff database is not
    requested, a <genome_version>_custom folder is set up under the data
    directory with sequences.fa and genes.gtf.

    Raises pi_errors.InputFileError if the input vcf does not exist and
    pi_errors.ParameterError for an unsuitable --Xmx value or a missing
    reference name.  Always returns None.
    '''
    print('PROGRESS ' + dt.now().strftime('%I:%M %p %b %d, %Y') +
          ': Processing input parameters.', file=params.logfile)
    # Does the input vcf file exist?
    if not os.path.exists(''.join([params.file_path, '/', params.file_prefix,
                                   '.vcf'])):
        raise pi_errors.InputFileError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Please provide a valid input file using --file_prefix',
            params.logfile)
    # The memory option for java should be of the form Xmx10G or Xmx10M
    if not params.java_Xmx.endswith(('G', 'M')):
        raise pi_errors.ParameterError(
            dt.now().strftime('%I:%M %p %b %d, %Y') +
            ': Please use a suitable value for --Xmx.', params.logfile)
    params.java_executable = pi_errors.test_param_value(
        params.java_executable, 'java', '--java', params.logfile)
    # Does the provided snpeff jar exist?
    params.snpeff_jar = pi_errors.test_param_value(
        params.snpeff_jar, 'snpeff', '--snpeff_jar', params.logfile)
    params.use_snpeff_db = False
    # Does the user want a snpEff packaged database?
    if params.config_file == 'PACKAGED':
        params.use_snpeff_db = True
        # Has the snpeff reference to be used been provided?
        if params.reference_name is None:
            raise pi_errors.ParameterError(
                dt.now().strftime('%I:%M %p %b %d, %Y') +
                ': --snp_reference is required if --config=PACKAGED.',
                params.logfile)
    # If a custom database is desired, does it need to be created?
    if params.index_location is None:
        # If the user has provided the location to the parent directory of
        # data directory, make DATA_DIRECTORY point to data.  If they have
        # provided the link to data, make INDEX_DESTINATION point to the
        # parent and DATA_DIRECTORY point to data.
        params.index_destination, params.data_directory = \
            _split_data_directory(params.index_destination)
        # Create the data directory if needed
        if not os.path.exists(params.data_directory):
            prepare.py_mkdir(params.data_directory)
        # If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        # Initialise the reference name
        params.reference_name = ''.join([params.genome_version, '_custom'])
        # Folder that will hold GENOME_VERSION_custom
        genome_folder = '/'.join([params.data_directory,
                                  params.reference_name])
        prepare.py_mkdir(genome_folder)
        sequences_fa = '/'.join([genome_folder, 'sequences.fa'])
        # If the genome fasta isn't provided or is provided a wrong value,
        # download it
        if params.genome_fasta == 'DOWNLOAD' or \
                not os.path.exists(params.genome_fasta):
            # Does the provided tbtf binary point to a valid file?
            params.tbtf_executable = pi_errors.test_param_value(
                params.tbtf_executable, 'twoBitToFa', '--twoBitToFa',
                params.logfile)
            params.genome_fasta = prepare.get_genome(
                params.genome_version, genome_folder, params.tbtf_executable,
                params.logfile)
            # Rename genome fasta
            call(['mv', params.genome_fasta, sequences_fa])
        else:
            params.genome_fasta = os.path.abspath(params.genome_fasta)
            # Link sequences.fa to genome fasta
            call(['ln', '-s', '-T', params.genome_fasta, sequences_fa])
        # Obtain the gencode GTF file
        params.gtf_file = prepare.get_gtf(params.genome_version, genome_folder,
                                          params.logfile)
        # Rename gtf file
        call(['mv', params.gtf_file, '/'.join([genome_folder, 'genes.gtf'])])
    # If it has been provided, set up the config file
    else:
        # If the user has provided the location to the parent directory of
        # data directory, make DATA_DIRECTORY point to data.  If they have
        # provided the link to data, make INDEX_LOCATION point to the parent
        # and DATA_DIRECTORY point to data.
        params.index_location, params.data_directory = \
            _split_data_directory(params.index_location)
        # If we're using a packaged database, there is nothing more to do
        if params.use_snpeff_db:
            return None
        # If the config file hasn't been provided, is it in INDEX_LOCATION
        # AND does GENOME_VERSION_custom exist (i.e. was it created by
        # this script?)
        if params.config_file is None:
            params.config_file = pi_errors.test_param_value(
                '/'.join([params.index_location, 'snpEff.config']),
                'snpEff.config', '--config', params.logfile)
            # Dummy variable to ensure the GENOME_VERSION_custom exists.
            # The flag description previously lacked the space before
            # '--config'.
            _ = pi_errors.test_param_value(
                ''.join([params.data_directory, '/', params.genome_version,
                         '_custom']),
                '_'.join([params.genome_version, 'custom']),
                '--snpeff_reference and ' + '--config', params.logfile)
            params.reference_name = '_'.join([params.genome_version,
                                              'custom'])
        # If a config file has been provided, does it point to a legit file
        # and has the reference name also been provided?
        else:
            params.config_file = pi_errors.test_param_value(
                params.config_file, 'snpEff config file', '--config',
                params.logfile)
            if params.reference_name is None:
                raise pi_errors.ParameterError(
                    dt.now().strftime('%I:%M %p %b %d, %Y') +
                    ': --snpeff_reference is required if --config points to' +
                    ' a custom file.', params.logfile)
    return None