def _simulate_reads(self, dict_id_abundance, dict_id_file_path, factor, directory_output):
    """
    Parallel simulation of reads

    @param dict_id_abundance: Dictionary of genome id to abundance
    @type dict_id_abundance: dict[str|unicode, float]
    @param dict_id_file_path: Dictionary of genome id to file path
    @type dict_id_file_path: dict[str|unicode, str|unicode]
    @param factor: Factor abundances will be multiplied by
    @type factor: float | int | long
    @param directory_output: Directory for the sam and fastq files output
    @type directory_output: str | unicode
    """
    self._logger.info("Simulating reads using %s read simulator..." % self._label)
    assert isinstance(dict_id_file_path, dict), "Expected dictionary, genome id as key, file path as value"
    assert isinstance(dict_id_abundance, dict), "Expected dictionary, genome id as key, abundance as value"
    assert isinstance(factor, (int, long, float)), "Factor must be numerical"
    assert self.validate_dir(directory_output)

    # add commands to a list of tasks to run them in parallel instead of calling them sequentially
    tasks = []
    for genome_id in dict_id_abundance.keys():
        file_path_input = dict_id_file_path[genome_id]
        abundance = dict_id_abundance[genome_id]
        if abundance == 0:
            continue
        if self._label == "ReadSimulationWgsim" or self._label == "ReadSimulationNanosim":
            # the name "fold_coverage" is misleading for wgsim/nanosim, which take a number of reads as input
            fold_coverage = int(round(abundance * factor / self._fragment_size_mean))
        else:
            fold_coverage = abundance * factor
        file_path_output_prefix = os.path.join(directory_output, str(genome_id))
        self._logger.debug("{id}\t{fold_coverage}".format(id=genome_id, fold_coverage=fold_coverage))
        system_command = self._get_sys_cmd(
            file_path_input=file_path_input,
            fold_coverage=fold_coverage,
            file_path_output_prefix=file_path_output_prefix)
        self._logger.debug("SysCmd: '{}'".format(system_command))
        self._logger.info("Simulating reads from {}: '{}'".format(genome_id, file_path_input))
        tasks.append(TaskCmd(system_command))

    list_of_fails = runCmdParallel(tasks, maxProc=self._max_processes)
    if list_of_fails is not None:
        self._logger.error("{} commands returned errors!".format(len(list_of_fails)))
        reportFailedCmd(list_of_fails)
    self._logger.info("Simulating reads finished")
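
# Worked example of the read-count calculation above, as a minimal self-contained
# sketch (hypothetical numbers). wgsim/nanosim take a number of reads rather than
# a fold coverage, so the abundance-scaled base total is divided by the mean
# fragment size; the helper name and defaults below are illustrative only.
def _example_read_count(abundance=0.25, factor=2 * 10 ** 9, fragment_size_mean=270):
    # abundance * factor approximates the bases this genome contributes;
    # dividing by the mean fragment size yields the number of fragments to simulate
    return int(round(abundance * factor / fragment_size_mean))

# _example_read_count() -> 1851852 fragments for a genome at 25% abundance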
def convert_sam_to_bam_by_list(self, list_of_sam_files, output_dir="./"):
    """
    Convert all given SAM files to BAM format.

    @param list_of_sam_files: list of sam file paths
    @type list_of_sam_files: list[str|unicode]
    @param output_dir: output directory
    @type output_dir: str | unicode

    @return: None
    @rtype: None

    @raises: OSError | AssertionError
    """
    bam_is_folder = self.validate_dir(output_dir, silent=True)
    assert isinstance(list_of_sam_files, list), "Expected list of file paths"
    assert bam_is_folder, "Invalid file or directory: '{}'".format(output_dir)

    # add commands to a list of tasks to run them in parallel
    tasks = []
    for sam_file_path in list_of_sam_files:
        cmd = self._get_sam_to_bam_cmd(sam_file_path, output_dir)
        tasks.append(TaskCmd(cmd))

    fail_list = runCmdParallel(tasks, maxProc=self._max_processes)
    if fail_list is not None:
        for message in reportFailedCmd(fail_list):
            self._logger.error(message)
        msg = "Converting sam files to bam files failed."
        self._logger.error(msg)
        raise OSError(msg)
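
# The exact conversion command is produced by _get_sam_to_bam_cmd; a typical
# samtools invocation for SAM-to-BAM conversion is "samtools view -bS <in.sam>"
# with stdout redirected to the bam file. A hedged, self-contained sketch of
# building such commands (helper name and paths are hypothetical):
import os

def _example_sam_to_bam_cmd(sam_file_path, output_dir):
    # derive the output name from the sam file name, e.g. genome_a.sam -> genome_a.bam
    name = os.path.splitext(os.path.basename(sam_file_path))[0]
    return "samtools view -bS {sam} > {bam}".format(
        sam=sam_file_path, bam=os.path.join(output_dir, name + ".bam"))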
def multiprocessing_run(self):
    """
    Distributes the passed command-line jobs using multiprocessing.

    @rtype: None
    """
    self._logger.info("Running {} jobs with multiprocessing".format(len(self._cmd_lines)))
    list_cmd_task = [parallel.TaskCmd(cmd, self._tmp_dir) for cmd in self._cmd_lines]
    fail_list = parallel.runCmdParallel(list_cmd_task, self._max_processors)
    if fail_list is not None:
        parallel.reportFailedCmd(fail_list)
        self._CUM_RETVALS = -1 * len(fail_list)
    self._logger.info("Multiprocessing jobs completed")
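
# A minimal, self-contained sketch of the pattern behind TaskCmd/runCmdParallel,
# using only the standard library. This is an illustration of the technique, not
# the repository's parallel module; names prefixed with _example are hypothetical.
import multiprocessing
import subprocess

def _example_run_cmd(cmd):
    # execute one shell command and report its return code
    return subprocess.call(cmd, shell=True)

def _example_run_parallel(cmd_lines, max_processors=2):
    # run each shell command in a worker process and collect the return codes
    pool = multiprocessing.Pool(processes=max_processors)
    return_codes = pool.map(_example_run_cmd, cmd_lines)
    pool.close()
    pool.join()
    # mirror runCmdParallel's contract: return the failed commands, or None if all succeeded
    fails = [cmd for cmd, code in zip(cmd_lines, return_codes) if code != 0]
    return fails if fails else None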
def _simulate_strains(self, genome_id_to_amounts, genome_id_to_file_path_genome, genome_id_to_file_path_gff=None):
    """
    Use sgEvolver to generate strain-level diversity around an isolate assembly.

    @attention: genome_id_to_file_path_genome will be extended with the ids and file paths of the simulated strains

    @param genome_id_to_amounts: Mapping from genome id to the amount of strains
    @type genome_id_to_amounts: dict[str, int]
    @param genome_id_to_file_path_genome: Mapping from genome id to the file path of the genome
    @type genome_id_to_file_path_genome: dict[str, str]
    @param genome_id_to_file_path_gff: Mapping from genome id to the file path of the gene annotations of a genome
    @type genome_id_to_file_path_gff: dict[str, str]

    @return: Nothing
    @rtype: None
    """
    tasks = []
    file_path_empty_file = None
    if genome_id_to_file_path_gff is None:
        # a gff file is always passed to the simulate command; use an empty placeholder if no annotations are given
        file_path_empty_file = self.get_full_path(tempfile.mktemp(dir=self._tmp_dir))
        touch(file_path_empty_file)

    for genome_id in genome_id_to_file_path_genome.keys():
        if self._keep_original and genome_id_to_amounts[genome_id] == 1:
            # keep the original assembly untouched if only one strain is requested
            continue
        directory_strain = self._directory_strain.format(gid=genome_id)
        self._prepare_simulation_subfolder(directory_strain)
        file_path_genome = genome_id_to_file_path_genome[genome_id]
        if genome_id_to_file_path_gff is None:
            file_path_gff = file_path_empty_file
        else:
            file_path_gff = genome_id_to_file_path_gff[genome_id]
        self._logger.info("Simulating strain evolution of '{}'".format(genome_id))
        tasks.append(TaskCmd(self._get_simulate_cmd(
            directory_strains=directory_strain,
            filepath_genome=file_path_genome,
            filepath_gff=file_path_gff)))

    list_of_fails = runCmdParallel(tasks, maxProc=self._max_processors)

    if file_path_empty_file is not None and os.path.exists(file_path_empty_file):
        os.remove(file_path_empty_file)

    if list_of_fails is not None:
        for message in reportFailedCmd(list_of_fails):
            self._logger.error(message)
        msg = "Simulation of strains failed."
        self._logger.error(msg)
        raise OSError(msg)
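
# Usage sketch (hypothetical ids and paths; assumes `strain_sim` is an
# initialized instance of the strain simulation class):
#
#   strain_sim._simulate_strains(
#       genome_id_to_amounts={"genome_a": 3, "genome_b": 1},
#       genome_id_to_file_path_genome={
#           "genome_a": "/data/genomes/genome_a.fasta",
#           "genome_b": "/data/genomes/genome_b.fasta"})
#
# With _keep_original set, "genome_b" (amount 1) is skipped and its original
# assembly kept, while strain-level diversity is simulated for "genome_a"; the
# genome path dictionary is extended with the new strain ids and file paths.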
def merge_bam_files_by_dict(self, dict_of_bam_files, output_dir):
    """
    Merge lists of bam files into one file each.

    @attention: dictionary keys are used as output file names

    @param dict_of_bam_files: dictionary with lists of bam file paths as values
    @type dict_of_bam_files: dict[str|unicode, list[str|unicode]]
    @param output_dir: output directory
    @type output_dir: str | unicode

    @return: None
    @rtype: None

    @raises: OSError | AssertionError
    """
    output_dir = self.get_full_path(output_dir)
    bam_is_folder = self.validate_dir(output_dir, silent=True)
    assert isinstance(dict_of_bam_files, dict), "Expected dictionary of file paths"
    assert bam_is_folder, "Invalid file or directory: '{}'".format(output_dir)
    for key, list_of_bam_paths in dict_of_bam_files.iteritems():
        for file_path in list_of_bam_paths:
            assert self.validate_file(file_path), "Invalid file: '{}'".format(file_path)

    # add commands to a list of tasks to run them in parallel
    tasks = []
    for filename, list_of_bam_paths in dict_of_bam_files.iteritems():
        if len(list_of_bam_paths) == 1:
            # copy the bam file instead of merging, if the list contains only one file
            file_path = list_of_bam_paths[0]
            self._logger.warning("List contains only one file: '{}'".format(file_path))
            out_file_path = os.path.join(output_dir, filename + self._bam_file_extension)
            shutil.copy2(file_path, out_file_path)
            continue
        cmd = self._get_merge_bam_cmd(list_of_bam_paths, os.path.join(output_dir, filename))
        tasks.append(TaskCmd(cmd))

    fail_list = runCmdParallel(tasks, maxProc=self._max_processes)
    if fail_list is not None:
        for message in reportFailedCmd(fail_list):
            self._logger.error(message)
        msg = "Merging bam files failed."
        self._logger.error(msg)
        raise OSError(msg)
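
# Usage sketch (hypothetical names and paths): keys become output file names,
# values are the bam files to merge, e.g. one merged bam per sample:
#
#   bam_handler.merge_bam_files_by_dict(
#       {"sample_0": ["/tmp/bam/genome_a.bam", "/tmp/bam/genome_b.bam"],
#        "sample_1": ["/tmp/bam/genome_c.bam"]},  # single file: copied, not merged
#       output_dir="/data/out/bam")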
def gather_markergenes(self, hmmer, mg_type, file_path_output, file_path_map_uid_sid):
    """
    Find and extract marker genes from genomes

    @param hmmer: hmmer2 or hmmer3
    @type hmmer: int | long
    @param mg_type: '16S', '5S' or '23S' etc.
    @type mg_type: str | unicode
    @param file_path_output: Output file for the extracted marker gene sequences in fasta format
    @type file_path_output: str | unicode
    @param file_path_map_uid_sid: Output file for the mapping of uid to sid
    @type file_path_map_uid_sid: str | unicode

    @rtype: None
    """
    assert isinstance(hmmer, (int, long))
    assert isinstance(file_path_output, basestring)
    assert self.validate_number(hmmer, minimum=2, maximum=3)
    assert self.validate_dir(file_path_output, only_parent=True)
    assert mg_type in self._suffixes, "Marker gene '{}' is not supported.".format(mg_type)
    self._logger.info("Searching and extracting marker genes")
    start = time.time()

    query_genome_file_paths = self._get_genome_id_to_path_map(self._file_path_query_genome_file_paths)
    if self._file_path_reference_genome_file_paths is not None and self._file_path_reference_marker_genes is None:
        reference_genome_file_paths = self._get_genome_id_to_path_map(self._file_path_reference_genome_file_paths)
        query_genome_file_paths.update(reference_genome_file_paths)
    elif self._file_path_reference_genome_file_paths is not None and self._file_path_reference_marker_genes is not None:
        self._logger.warning("Ignoring reference genome file paths and using previous reference marker genes!")

    cmd_list = self._get_cmd_list(hmmer=hmmer, dict_of_fasta=query_genome_file_paths)
    list_of_tasks = []
    for cmd in cmd_list:
        list_of_tasks.append(parallel.TaskCmd(cmd))
    fail_list = parallel.runCmdParallel(list_of_tasks, self._max_processors)
    if fail_list is not None:
        for message in parallel.reportFailedCmd(fail_list):
            self._logger.error(message)
        msg = "Extracting marker genes failed."
        self._logger.error(msg)
        raise OSError(msg)

    tmp_out_file_path = tempfile.mktemp(suffix="_accepted", dir=self._temp_directory)
    tmp_out_file_bin_path = tempfile.mktemp(suffix="_rejected", dir=self._temp_directory)
    self._merge_marker_genes_files(
        query_genome_file_paths, tmp_out_file_path,
        file_path_out_bin=tmp_out_file_bin_path,
        file_path_map_uid_sid=file_path_map_uid_sid,
        mg_type=mg_type)

    if os.path.exists(tmp_out_file_path):
        shutil.copy2(tmp_out_file_path, file_path_output)
    else:
        self._logger.warning("No valid marker gene found!")

    if os.path.exists(tmp_out_file_bin_path):
        shutil.copy2(tmp_out_file_bin_path, file_path_output + ".rejected.fna")

    if self._file_path_reference_marker_genes is not None:
        # append the marker genes of the reference genomes
        shutil.copy(file_path_output, file_path_output + ".no_ref")
        with open(file_path_output, 'a') as write_handler, open(self._file_path_reference_marker_genes) as read_handler:
            write_handler.writelines(read_handler)

    end = time.time()
    self._logger.info("Extracting marker genes finished ({}s)".format(round(end - start, 1)))

    if not self._debug:
        for directory in self._working_dirs.values():
            shutil.rmtree(directory)
    else:
        for directory in self._working_dirs.values():
            self._logger.warning("Remove manually: '{}'".format(directory))
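
# Usage sketch (hypothetical paths; assumes `mg_extraction` is an initialized
# instance of the marker gene extraction class):
#
#   mg_extraction.gather_markergenes(
#       hmmer=3,                                  # use hmmer3
#       mg_type="16S",
#       file_path_output="/data/out/mg_16S.fna",
#       file_path_map_uid_sid="/data/out/map_uid_sid.tsv")
#
# Rejected sequences end up next to the output as "mg_16S.fna.rejected.fna".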