def split_fasta_file(input_file_path, dest_dir, prefix='part', number_of_sequences_per_file=20000):
    """Split a FASTA file into consecutive part files.

    Entries are read through ``SequenceSource`` and written, in order, to
    files named ``<prefix>-<8-digit part number>`` inside ``dest_dir``.  A new
    part is started every ``number_of_sequences_per_file`` entries.  Progress
    is reported on stderr.

    Returns the list of part file paths in creation order (empty when the
    source yields no entries).
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    fasta = SequenceSource(input_file_path)
    parts = []
    next_part = 1
    part_obj = None

    try:
        while fasta.next():
            # NOTE(review): assumes ``pos`` is 1 for the first entry, so that
            # the first iteration always opens a part file -- confirm against
            # SequenceSource.
            if (fasta.pos - 1) % number_of_sequences_per_file == 0:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if part_obj is not None:
                    part_obj.close()

                file_path = os.path.join(dest_dir, prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n' % fasta.id)
            part_obj.write('%s\n' % fasta.seq)
    finally:
        # Close the part being written even when reading or writing fails.
        if part_obj is not None:
            part_obj.close()

    sys.stderr.write('\n')
    return parts
def copy_file(source_file, dest_file):
    """Copy ``source_file`` to ``dest_file`` with ``shutil.copyfile``.

    Returns whatever ``shutil.copyfile`` returns.
    Raises UtilsError if the copy fails with an IOError.
    """
    # The arguments follow the order of the labels in the message
    # (destination first, then source).
    debug('%s; dest: "%s", src: "%s"' % (my_name(), dest_file, source_file))

    try:
        return shutil.copyfile(source_file, dest_file)
    except IOError as e:
        raise UtilsError("copy failed due to the following reason: '%s' (src: %s, dst: %s)"
                         % (e, source_file, dest_file))
def store_ids_from_b6_output(source_b6_output, dest_file):
    """Open ``source_b6_output`` as a ``B6Source``.

    In the code visible here ``dest_file`` is only used in the debug message.

    Raises UtilsError if opening the source fails with an IOError.
    """
    debug('%s; dest: %s' % (my_name(), dest_file))

    try:
        b6 = B6Source(source_b6_output)
    except IOError as err:
        reason = "open failed due to the following reason: '%s' (src: %s)" \
                 % (err, source_b6_output)
        raise UtilsError(reason)
def run_command(cmdline):
    """Run ``cmdline`` through the shell and wait for it to finish.

    A negative return code from ``subprocess.call`` (the child was terminated
    by a signal) and an OSError while launching are both reported as
    UtilsError.  A non-negative exit status is not treated as an error.

    NOTE: the command is executed with ``shell=True``; callers must only pass
    trusted command strings.
    """
    debug('%s; cmd: %s' % (my_name(), cmdline))

    try:
        # Keep the return code: it is needed for the error message below.
        retcode = subprocess.call(cmdline, shell=True)
    except OSError as e:
        raise UtilsError("command was failed for the following reason: '%s' ('%s')" % (e, cmdline))

    if retcode < 0:
        raise UtilsError("command was terminated by signal: %d" % (-retcode))
def split_fasta_file(input_file_path, dest_dir, prefix='part', number_of_sequences_per_file=20000):
    """Split a FASTA file into consecutive part files.

    Entries are read through ``SequenceSource`` and written, in order, to
    files named ``<prefix>-<8-digit part number>`` inside ``dest_dir``.  A new
    part is started every ``number_of_sequences_per_file`` entries.  Progress
    is reported on stderr.

    Returns the list of part file paths in creation order (empty when the
    source yields no entries).
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    fasta = SequenceSource(input_file_path)
    parts = []
    next_part = 1
    part_obj = None

    try:
        while fasta.next():
            # NOTE(review): assumes ``pos`` is 1 for the first entry, so that
            # the first iteration always opens a part file -- confirm against
            # SequenceSource.
            if (fasta.pos - 1) % number_of_sequences_per_file == 0:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if part_obj is not None:
                    part_obj.close()

                file_path = os.path.join(dest_dir, prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n' % fasta.id)
            part_obj.write('%s\n' % fasta.seq)
    finally:
        # Close the part being written even when reading or writing fails.
        if part_obj is not None:
            part_obj.close()

    sys.stderr.write('\n')
    return parts
def init_modules(self):
    """Load every ``mod_<name>.py`` file found in the modules directory.

    The directory is taken from ``self.constants.dirs['modules']``.  Each
    matching file is loaded with ``imp.load_source`` and stored in
    ``self.modules`` under ``<name>`` (the file name without the ``mod_``
    prefix and the ``.py`` suffix).
    """
    modules_dir = self.constants.dirs['modules']

    for entry in os.listdir(modules_dir):
        # Skip anything that does not follow the mod_*.py naming scheme.
        if not entry.startswith('mod_') or not entry.endswith('.py'):
            continue

        mod_name = entry[len('mod_'):-len('.py')]
        module_path = os.path.join(modules_dir, entry)
        self.modules[mod_name] = imp.load_source(mod_name, module_path)
        debug('module "%s" found' % mod_name)
def concatenate_files(dest_file, file_list):
    """Write the contents of every file in ``file_list``, in order, to ``dest_file``.

    ``dest_file`` is created or truncated.  Returns None.
    """
    debug('%s; dest: "%s"' % (my_name(), dest_file))

    # Context managers make sure the destination and every chunk are closed,
    # including when a chunk cannot be opened or read.
    with open(dest_file, 'w') as dest_file_obj:
        for chunk_path in file_list:
            with open(chunk_path) as chunk_obj:
                for line in chunk_obj:
                    dest_file_obj.write(line)
def refine_b6(source_file, dest_file, params):
    """Open ``source_file`` as a ``B6Source``.

    In the code visible here ``dest_file`` is only used in the debug message
    and ``params`` is not used.

    Raises UtilsError if opening the source fails with an IOError.
    """
    # FIXME: check if source_file is a valid m8 output.
    debug('%s; dest: %s' % (my_name(), dest_file))

    try:
        b6 = B6Source(source_file)
    except IOError as err:
        reason = "open failed due to the following reason: '%s' (src: %s)" \
                 % (err, source_file)
        raise UtilsError(reason)
def init_modules(self):
    """Load every ``mod_<name>.py`` file found in the modules directory.

    The directory is taken from ``self.constants.dirs['modules']``.  Each
    matching file is loaded with ``imp.load_source`` and stored in
    ``self.modules`` under ``<name>`` (the file name without the ``mod_``
    prefix and the ``.py`` suffix).
    """
    modules_dir = self.constants.dirs['modules']

    for entry in os.listdir(modules_dir):
        # Skip anything that does not follow the mod_*.py naming scheme.
        if not entry.startswith('mod_') or not entry.endswith('.py'):
            continue

        mod_name = entry[len('mod_'):-len('.py')]
        module_path = os.path.join(modules_dir, entry)
        self.modules[mod_name] = imp.load_source(mod_name, module_path)
        debug('module "%s" found' % mod_name)
def split_file(ids_file, source_file, filtered_dest_file, survived_dest_file, type='fasta'):
    """Split the reads of ``source_file`` into two files based on ``ids_file``.

    Every read whose ID is listed in ``ids_file`` (one ID per line,
    surrounding whitespace ignored) is written to ``filtered_dest_file``; all
    other reads are written to ``survived_dest_file``.  An ID is removed from
    the lookup set once it has matched, so only the first read carrying a
    given ID is filtered.  Progress is reported on stderr.

    Only ``type == 'fasta'`` is implemented.

    Returns True on success.
    Raises FilterError if ``ids_file`` cannot be read and UtilsError for an
    unsupported ``type``.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file, survived_dest_file))

    try:
        with open(ids_file) as ids_file_obj:
            ids_to_filter = set(line.strip() for line in ids_file_obj)
    except IOError:
        # Report the path that could not be read; the ID set does not exist
        # yet when this handler runs.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file))

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type))

    fasta = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    # Nested context managers close both outputs even if reading or writing
    # fails part-way through.
    with open(filtered_dest_file, 'w') as filtered_output:
        with open(survived_dest_file, 'w') as survived_output:
            while fasta.next():
                if fasta.pos % 10000 == 0 or fasta.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' % (pp(fasta.pos)))
                    sys.stderr.flush()

                entry = '>%s\n%s\n' % (fasta.id, fasta.seq)
                if fasta.id in ids_to_filter:
                    ids_to_filter.remove(fasta.id)
                    filtered_output.write(entry)
                    filtered_count += 1
                else:
                    survived_output.write(entry)
                    survived_count += 1

            sys.stderr.write('\n')

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True
def run_command(cmdline):
    """Run ``cmdline`` through the shell and wait for it to finish.

    A negative return code from ``subprocess.call`` (the child was terminated
    by a signal) and an OSError while launching are both reported as
    UtilsError.  A non-negative exit status is not treated as an error.

    NOTE: the command is executed with ``shell=True``; callers must only pass
    trusted command strings.
    """
    debug('%s; cmd: %s' % (my_name(), cmdline))

    try:
        # Keep the return code: it is needed for the error message below.
        retcode = subprocess.call(cmdline, shell=True)
    except OSError as e:
        raise UtilsError("command was failed for the following reason: '%s' ('%s')" % (e, cmdline))

    if retcode < 0:
        raise UtilsError("command was terminated by signal: %d" % (-retcode))
def search(m):
    """Run the search command on every part, then merge the per-part outputs.

    For each path in ``m.files['parts']`` the ``SEARCH_COMMAND`` template is
    filled in (input, ``<part>.b6`` output, target database, ``<part>.log``
    log file and the space-joined ``m.cmdparams``) and executed with
    ``utils.run_command``.  Afterwards all ``<part>.b6`` files are
    concatenated into ``m.files['search_output']``.
    """
    parts = m.files['parts']
    total = len(parts)

    # enumerate() supplies the running position directly; looking it up with
    # parts.index() rescans the list on every iteration and reports the wrong
    # position if a path occurs twice.
    for position, part in enumerate(parts, 1):
        params = {'input': part,
                  'output': part + '.b6',
                  'target': m.target_db,
                  'log': part + '.log',
                  'cmdparams': ' '.join(m.cmdparams)}

        debug('searching part %d/%d (log: %s)' % (position, total, params['log']))

        cmdline = SEARCH_COMMAND % params
        utils.run_command(cmdline)

    dest_file = m.files['search_output']
    utils.concatenate_files(dest_file, [part + '.b6' for part in m.files['parts']])
def split_file(ids_file, source_file, filtered_dest_file, survived_dest_file, type='fasta'):
    """Split the reads of ``source_file`` into two files based on ``ids_file``.

    Every read whose ID is listed in ``ids_file`` (one ID per line,
    surrounding whitespace ignored) is written to ``filtered_dest_file``; all
    other reads are written to ``survived_dest_file``.  An ID is removed from
    the lookup set once it has matched, so only the first read carrying a
    given ID is filtered.  Progress is reported on stderr.

    Only ``type == 'fasta'`` is implemented.

    Returns True on success.
    Raises FilterError if ``ids_file`` cannot be read and UtilsError for an
    unsupported ``type``.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file, survived_dest_file))

    try:
        with open(ids_file) as ids_file_obj:
            ids_to_filter = set(line.strip() for line in ids_file_obj)
    except IOError:
        # Report the path that could not be read; the ID set does not exist
        # yet when this handler runs.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file))

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type))

    fasta = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    # Nested context managers close both outputs even if reading or writing
    # fails part-way through.
    with open(filtered_dest_file, 'w') as filtered_output:
        with open(survived_dest_file, 'w') as survived_output:
            while fasta.next():
                if fasta.pos % 10000 == 0 or fasta.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' % (pp(fasta.pos)))
                    sys.stderr.flush()

                entry = '>%s\n%s\n' % (fasta.id, fasta.seq)
                if fasta.id in ids_to_filter:
                    ids_to_filter.remove(fasta.id)
                    filtered_output.write(entry)
                    filtered_count += 1
                else:
                    survived_output.write(entry)
                    survived_count += 1

            sys.stderr.write('\n')

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True
def search(m):
    """Run the search command on every part, then merge the per-part outputs.

    For each path in ``m.files["parts"]`` the ``SEARCH_COMMAND`` template is
    filled in (input, ``<part>.b6`` output, target database, ``<part>.log``
    log file and the space-joined ``m.cmdparams``) and executed with
    ``utils.run_command``.  Afterwards all ``<part>.b6`` files are
    concatenated into ``m.files["search_output"]``.
    """
    parts = m.files["parts"]
    total = len(parts)

    # enumerate() supplies the running position directly; looking it up with
    # parts.index() rescans the list on every iteration and reports the wrong
    # position if a path occurs twice.
    for position, part in enumerate(parts, 1):
        params = {
            "input": part,
            "output": part + ".b6",
            "target": m.target_db,
            "log": part + ".log",
            "cmdparams": " ".join(m.cmdparams),
        }

        debug("searching part %d/%d (log: %s)" % (position, total, params["log"]))

        cmdline = SEARCH_COMMAND % params
        utils.run_command(cmdline)

    dest_file = m.files["search_output"]
    utils.concatenate_files(dest_file, [part + ".b6" for part in m.files["parts"]])
def __init__(self, args, constants):
    """Initialize the configuration from parsed arguments and constants.

    Stores ``args`` and ``constants`` on the instance, derives the working
    directory, dataset name and dataset root directory from ``args``, then
    calls ``init_modules``, ``init_essential_files_and_directories``,
    ``init_filters_config`` and ``init_chain_of_filters`` in that order.

    args: object read for ``base_work_dir``, ``dataset_name``, ``input`` and
          ``filters_config``.
    constants: stored as ``self.constants``.
    """
    # NOTE(review): the original indentation was lost; the whole body is
    # assumed to sit under ``if args:`` (nothing happens when ``args`` is
    # falsy) -- confirm against the real file.
    if args:
        self.args = args
        self.constants = constants

        # spaces are replaced with underscores in both values
        self.base_work_dir = self.args.base_work_dir.replace(' ', '_')
        self.dataset_name = self.args.dataset_name.replace(' ', '_')
        self.input = self.args.input
        self.dataset_root_dir = os.path.join(self.base_work_dir, self.dataset_name)

        # filled in by the init_* calls below
        self.filters = []
        self.modules = {}

        debug('Initializing configuration')
        self.init_modules()
        self.init_essential_files_and_directories()
        self.init_filters_config(args.filters_config)
        self.init_chain_of_filters()
        debug('Config class is initialized with %d modules and %d filters'\
                            % (len(self.modules), len(self.filters)))
def delete_files_in_dir(dir):
    """Remove every entry found directly inside ``dir`` using ``os.unlink``."""
    debug('%s; removing content of "%s"' % (my_name(), dir))

    entry_paths = [os.path.join(dir, entry) for entry in os.listdir(dir)]
    for entry_path in entry_paths:
        os.unlink(entry_path)
def init_filters_config(self, config_file_path):
    """Build one ``Filter`` per section of the filters config file.

    For every section of the file at ``config_file_path`` a ``Filter`` is
    created, validated, configured and appended to ``self.filters``:

    * the section name must be an existing, readable path;
    * the ``module`` option must name a key of ``self.modules``;
    * an optional ``execute`` option (comma-separated) must list known
      functions of the module, without repeats, in ``FUNCTIONS_ORDER`` order;
    * ``cmdparam.*`` options are collected into ``filter.cmdparams``;
    * ``rfnparam.*`` options are converted and stored in ``filter.rfnparams``;
    * output directory and file paths are derived from
      ``self.dataset_root_dir`` and the filter name.

    Raises ConfigError when any of the checks above fails.
    """
    # NOTE(review): the original indentation was lost; the nesting below
    # (in particular which statements sit inside the loops) is reconstructed
    # from the logic -- confirm against the real file.
    filters_config = ConfigParserWrapper(config_file_path)
    filters_config.read(config_file_path)

    for section in filters_config.sections():
        filter = Filter(section)
        # spaces in the configured name are replaced with underscores
        filter.name = filters_config.get(section, 'filter_name').replace(' ', '_')

        # check if the target database, which happens to be the section name,
        # exists
        if not (os.path.exists(section) and os.access(section, os.R_OK)):
            raise ConfigError, 'Bad target (file not found / no read permission): "%s"' % section

        # assign module
        module_from_config = filters_config.get(section, 'module')
        if not self.modules.has_key(module_from_config):
            raise ConfigError, 'Unknown module for filter "%s": "%s".\nAvailable modules:\n%s' \
                                    % (filter.name, module_from_config, ', '.join(self.modules.keys()))
        else:
            filter.module = self.modules[module_from_config]

        # check the availability of the functions and the execution order, if the default
        # behavior has been changed manually in the config file
        if filters_config.has_option(section, 'execute'):
            execute_list_from_config = [
                e.strip()
                for e in filters_config.get(section, 'execute').split(',')
            ]

            # every requested function has to be known to the module
            for item in execute_list_from_config:
                if item not in filter.module.FUNCTIONS_ORDER:
                    raise ConfigError, 'Unknown function for module "%s" in "%s": "%s".\nAvailable functions: %s' \
                                    % (module_from_config, filter.name, item, ', '.join(filter.module.FUNCTIONS_ORDER))

            # a repeated entry makes the de-duplicated list shorter
            if len(execute_list_from_config) != len(
                    list(set(execute_list_from_config))):
                raise ConfigError, 'Functions cannot be executed more than once: %s' \
                                    % (', '.join(execute_list_from_config))

            # make sure the order is right.
            # t holds each function's position in FUNCTIONS_ORDER; positions
            # have to be strictly increasing
            t = [
                filter.module.FUNCTIONS_ORDER.index(i)
                for i in execute_list_from_config
            ]
            if False in [t[i] > t[i - 1] for i in range(1, len(t))]:
                raise ConfigError, 'Order of functions to be executed is not correct: %s\nFunctions should follow this order: %s' \
                                    % (', '.join(execute_list_from_config), ', '.join(filter.module.FUNCTIONS_ORDER))

            filter.execution_order = execute_list_from_config
            debug(
                'filter module functions execution order has been set: "%s"'
                % (filter.execution_order))

        # store command line parameters from the config file
        for option in [
                o for o in filters_config.options(section)
                if o.startswith('cmdparam.')
        ]:
            # everything after the first dot is the parameter name
            param = '.'.join(option.split('.')[1:])
            opt = filters_config.get(section, option)
            filter.cmdparams.append('%s %s' % (param, opt))
        debug('command line params for filter "%s": %s ' %
              (filter.name, filter.cmdparams))

        # store post-search refinement filters from the config file
        for option in [
                o for o in filters_config.options(section)
                if o.startswith('rfnparam.')
        ]:
            param = '.'.join(option.split('.')[1:])
            opt = filters_config.get(section, option)
            if param in filter.get_refinement_params():
                # ALLOWED_RFNPARAMS maps the parameter name to a callable
                # that is applied to the raw option value
                filter.rfnparams[param] = filter.module.ALLOWED_RFNPARAMS[
                    param](opt)
            else:
                raise ConfigError, 'Unknown refinement parameter for filter "%s": "%s"' \
                                    % (filter.name, param)
        debug('refinement line params for filter "%s": %s ' %
              (filter.name, filter.rfnparams))

        # take care of file paths and directories
        # J reads filter.dirs['output'] when it is called, so it can be
        # defined before that entry is assigned on the next line
        J = lambda x: os.path.join(filter.dirs['output'], x)
        filter.dirs['output'] = os.path.join(self.dataset_root_dir,
                                             filter.name)
        filter.dirs['parts'] = J('parts')
        filter.files['search_output'] = J('01_raw_hits.txt')
        filter.files['refined_search_output'] = J('02_refined_hits.txt')
        filter.files['hit_ids'] = J('03_hits.ids')
        filter.files['filtered_reads'] = J('04_filtered.fa')
        filter.files['survived_reads'] = J('05_survived.fa')

        self.filters.append(filter)
def init_filters_config(self, config_file_path):
    """Build one ``Filter`` per section of the filters config file.

    For every section of the file at ``config_file_path`` a ``Filter`` is
    created, validated, configured and appended to ``self.filters``:

    * the section name must be an existing, readable path;
    * the ``module`` option must name a key of ``self.modules``;
    * an optional ``execute`` option (comma-separated) must list known
      functions of the module, without repeats, in ``FUNCTIONS_ORDER`` order;
    * ``cmdparam.*`` options are collected into ``filter.cmdparams``;
    * ``rfnparam.*`` options are converted and stored in ``filter.rfnparams``;
    * output directory and file paths are derived from
      ``self.dataset_root_dir`` and the filter name.

    Raises ConfigError when any of the checks above fails.
    """
    # NOTE(review): the original indentation was lost; the nesting below
    # (in particular which statements sit inside the loops) is reconstructed
    # from the logic -- confirm against the real file.
    filters_config = ConfigParserWrapper(config_file_path)
    filters_config.read(config_file_path)

    for section in filters_config.sections():
        filter = Filter(section)
        # spaces in the configured name are replaced with underscores
        filter.name = filters_config.get(section, 'filter_name').replace(' ', '_')

        # check if the target database, which happens to be the section name,
        # exists
        if not (os.path.exists(section) and os.access(section, os.R_OK)):
            raise ConfigError, 'Bad target (file not found / no read permission): "%s"' % section

        # assign module
        module_from_config = filters_config.get(section, 'module')
        if not self.modules.has_key(module_from_config):
            raise ConfigError, 'Unknown module for filter "%s": "%s".\nAvailable modules:\n%s' \
                                    % (filter.name, module_from_config, ', '.join(self.modules.keys()))
        else:
            filter.module = self.modules[module_from_config]

        # check the availability of the functions and the execution order, if the default
        # behavior has been changed manually in the config file
        if filters_config.has_option(section, 'execute'):
            execute_list_from_config = [e.strip() for e in filters_config.get(section, 'execute').split(',')]

            # every requested function has to be known to the module
            for item in execute_list_from_config:
                if item not in filter.module.FUNCTIONS_ORDER:
                    raise ConfigError, 'Unknown function for module "%s" in "%s": "%s".\nAvailable functions: %s' \
                                    % (module_from_config, filter.name, item, ', '.join(filter.module.FUNCTIONS_ORDER))

            # a repeated entry makes the de-duplicated list shorter
            if len(execute_list_from_config) != len(list(set(execute_list_from_config))):
                raise ConfigError, 'Functions cannot be executed more than once: %s' \
                                    % (', '.join(execute_list_from_config))

            # make sure the order is right.
            # t holds each function's position in FUNCTIONS_ORDER; positions
            # have to be strictly increasing
            t = [filter.module.FUNCTIONS_ORDER.index(i) for i in execute_list_from_config]
            if False in [t[i] > t[i - 1] for i in range(1, len(t))]:
                raise ConfigError, 'Order of functions to be executed is not correct: %s\nFunctions should follow this order: %s' \
                                    % (', '.join(execute_list_from_config), ', '.join(filter.module.FUNCTIONS_ORDER))

            filter.execution_order = execute_list_from_config
            debug('filter module functions execution order has been set: "%s"' % (filter.execution_order))

        # store command line parameters from the config file
        for option in [o for o in filters_config.options(section) if o.startswith('cmdparam.')]:
            # everything after the first dot is the parameter name
            param = '.'.join(option.split('.')[1:])
            opt = filters_config.get(section, option)
            filter.cmdparams.append('%s %s' % (param, opt))
        debug('command line params for filter "%s": %s ' % (filter.name, filter.cmdparams))

        # store post-search refinement filters from the config file
        for option in [o for o in filters_config.options(section) if o.startswith('rfnparam.')]:
            param = '.'.join(option.split('.')[1:])
            opt = filters_config.get(section, option)
            if param in filter.get_refinement_params():
                # ALLOWED_RFNPARAMS maps the parameter name to a callable
                # that is applied to the raw option value
                filter.rfnparams[param] = filter.module.ALLOWED_RFNPARAMS[param](opt)
            else:
                raise ConfigError, 'Unknown refinement parameter for filter "%s": "%s"' \
                                    % (filter.name, param)
        debug('refinement line params for filter "%s": %s ' % (filter.name, filter.rfnparams))

        # take care of file paths and directories
        # J reads filter.dirs['output'] when it is called, so it can be
        # defined before that entry is assigned on the next line
        J = lambda x: os.path.join(filter.dirs['output'], x)
        filter.dirs['output'] = os.path.join(self.dataset_root_dir, filter.name)
        filter.dirs['parts'] = J('parts')
        filter.files['search_output'] = J('01_raw_hits.txt')
        filter.files['refined_search_output'] = J('02_refined_hits.txt')
        filter.files['hit_ids'] = J('03_hits.ids')
        filter.files['filtered_reads'] = J('04_filtered.fa')
        filter.files['survived_reads'] = J('05_survived.fa')

        self.filters.append(filter)