def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write them in files.

    The name of a chunk file is:

    * the id of the sequence (``<seq_id>.fst``) when the chunk holds a single
      sequence (i.e. when ``chunk_size`` is 1),
    * otherwise the input filename with suffix ``_chunk_<i>`` where *i* is the
      chunk number.

    If a file with the computed name already exists, the name is de-duplicated
    by stripping any trailing ``_chunk_<n>`` and appending a fresh
    ``_chunk_<i>`` counter until an unused name is found.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunks desired (chunk > 0).
                      When falsy, one sequence is written per file.
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exist.
    :return: The name of all chunks created.
    :rtype: list of str
    """

    def grouper(sequences_db, chunk_size):
        """
        Group *sequences_db* into tuples of *chunk_size* sequences.

        :param sequences_db: The sequences to group.
        :type sequences_db: A :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequences per chunk file.
        :return: a chunk of sequences (the last tuple is padded with ``None``).
        :rtype: An iterator of tuples.
        """
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            # spread the sequences as evenly as possible over `chunk` files
            chunk_size = math.ceil(sequences_db_len / chunk)
        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    # distinguish a rejected replicon from zip_longest's
                    # None padding at the tail of the last chunk
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning("Skipping replicon {}/{} in chunk {}".format(rep_no_in_db,
                                                                                  sequences_db_len,
                                                                                  chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    # single-sequence chunk: name the file after the sequence id
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    # raw string: `\d` is not a valid str escape (SyntaxWarning on py>=3.12)
                    match = re.search(r"_chunk_\d+$", root)
                    if match:
                        # drop the previous counter before appending the new one
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)
                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
    return all_chunk_name
def test_FastaIterator(self):
    """
    Check FastaIterator behavior on a multi-fasta file:
    sequence ids/names, database length, explicit replicon_name,
    topology injection (default and from a topology file), and the
    skipping (with warning) of invalid or too-short sequences.
    """
    file_name = 'multi_fasta'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    topologies = Topology('lin')
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_id = sorted([seq.id for seq in seq_db])
        expected_seq_id = sorted(['ACBA.007.P01_13', 'LIAN.001.C02_10', 'PSSU.001.C01_13'])
        self.assertListEqual(expected_seq_id, received_seq_id)
        self.assertEqual(len(seq_db), 3)

    # without an explicit replicon_name, seq.name mirrors seq.id
    expected_seq_name = expected_seq_id
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_name = sorted([seq.name for seq in seq_db])
        self.assertListEqual(expected_seq_name, received_seq_name)

    # an explicit replicon_name overrides every sequence name
    replicon_name = 'foo'
    with utils.FastaIterator(replicon_path, replicon_name=replicon_name) as seq_db:
        seq_db.topologies = topologies
        received_seq_id = set([seq.name for seq in seq_db])
        expected_seq_name = set([replicon_name])
        self.assertSetEqual(expected_seq_name, received_seq_id)

    # default topology when no Topology object is injected
    with utils.FastaIterator(replicon_path) as seq_db:
        received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin', 'lin', 'lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

    # per-replicon topologies read from a topology file
    topologies_data = {'ACBA.007.P01_13': 'lin',
                       'LIAN.001.C02_10': 'circ',
                       'PSSU.001.C01_13': 'lin',
                       }
    with tempfile.NamedTemporaryFile(mode='w') as topology_file:
        for rep, topo in topologies_data.items():
            topology_file.write("{} {}\n".format(rep, topo))
        topology_file.flush()
        topologies = Topology('lin', topology_file=topology_file.name)
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_top = {seq.id: seq.topology for seq in seq_db}
            self.assertDictEqual(topologies_data, received_seq_top)

    # a short replicon is forced to linear even if declared circular
    file_name = 'acba_short'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    topologies = Topology('circ')
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

    # ambiguous (IUPAC) characters are accepted
    file_name = 'replicon_ambiguous_char'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    with utils.FastaIterator(replicon_path) as seq_db:
        received_seq_id = sorted([seq.id for seq in seq_db if seq])
        expected_seq_id = sorted(['seq_1', 'seq_2', 'seq_3', 'seq_4'])
        self.assertListEqual(expected_seq_id, received_seq_id)

    # invalid characters: the sequence is skipped and a warning is logged
    file_name = 'replicon_bad_char'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    # raw string: pattern for assertRegex (avoid invalid str escapes)
    expected_warning = r"""sequence seq_(3|4) contains invalid characters, the sequence is skipped.
sequence seq_(3|4) contains invalid characters, the sequence is skipped."""
    with utils.FastaIterator(replicon_path) as seq_db:
        # 2 sequences are rejected so 2 messages are produced (for seq 3 and seq 4)
        with self.catch_log() as log:
            received_seq_id = sorted([seq.id for seq in seq_db if seq])
            got_warning = log.get_value().strip()
        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_2'])
        self.assertListEqual(expected_seq_id, received_seq_id)

    # too-short sequences (< 50 bp): skipped with a warning
    file_name = 'replicon_too_short'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    # raw string: `\(` / `\)` are regex escapes, not str escapes
    expected_warning = r"""sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\).
sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\)."""
    with utils.FastaIterator(replicon_path) as seq_db:
        # 2 sequences are rejected so 2 messages are produced (for seq 2 & 4)
        with self.catch_log() as log:
            received_seq_id = sorted([seq.id for seq in seq_db if seq])
            got_warning = log.get_value().strip()
        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_3'])
        self.assertListEqual(expected_seq_id, received_seq_id)
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    :param str args: the arguments passed on the command line
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among
                    'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises IsADirectoryError: when outdir or result_dir exists and is not a directory
    :raises PermissionError: when result_dir exists and is not writable
    :raises RuntimeError: when cmsearch, hmmsearch or prodigal cannot be found
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    else:
        if not os.path.isdir(config.outdir):
            msg = "outdir '{}' already exists and is not a directory".format(config.outdir)
            # we can not log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    else:
        if not os.path.isdir(config.result_dir):
            # bug fix: report result_dir (the offending path), not outdir
            msg = "result dir '{}' already exists and is not a directory".format(config.result_dir)
            # we can not log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)
        elif not os.access(config.result_dir, os.W_OK):
            # bug fix: report result_dir (the offending path), not outdir
            msg = "result dir '{}' already exists and is not writable".format(config.result_dir)
            # we can not log it because loggers are not initialized yet.
            raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)
    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specified from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    if config.cmsearch is None:
        msg = """cannot find 'cmsearch' in PATH.
Please install infernal package or setup 'cmsearch' binary path with --cmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.hmmsearch is None:
        msg = """cannot find 'hmmsearch' in PATH.
Please install hmmer package or setup 'hmmsearch' binary path with --hmmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.prodigal is None:
        msg = """cannot find 'prodigal' in PATH.
Please install prodigal package or setup 'prodigal' binary path with --prodigal option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    ################
    # print Header #
    ################
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(colorlog.logging.logging.INFO)
    # header must not bubble up to the root integron_finder logger
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(config.input_seq_path,
                             dist_threshold=config.distance_threshold) as sequences_db:
        ################
        # set topology #
        ################
        # a single-sequence replicon defaults to circular, a multi-fasta to linear
        default_topology = 'circ' if len(sequences_db) == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology, topology_file=config.topology_file)
        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        sequences_db_len = len(sequences_db)
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info("############ Processing replicon {} ({}/{}) ############\n".format(
                    replicon.id, rep_no, sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning("############ Skipping replicon {}/{} ############".format(
                    rep_no, sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(config.result_dir,
                                         utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file, sep="\t", index=False, na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # bug fix: test the summary frame (not the integrons frame)
        # before writing the summary file
        if not agg_summary.empty:
            agg_summary.to_csv(merged_summary_file, sep="\t", index=False, na_rep="NA",
                               columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])

        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)