def split_fasta(input_fasta, output_dir, num_of_recs_per_file=None, num_of_files=None, output_prefix=None):
    """
    Split a FASTA file into several smaller FASTA files.

    By default every output file receives num_of_recs_per_file records.
    If num_of_files is set (and non-zero), num_of_recs_per_file is ignored
    and the chunk size is ceil(number_of_records / num_of_files), so at most
    num_of_files files are produced.

    Output files are written as "<output_dir>/<prefix>_<index>.fasta" with
    indices starting at 1. An input without records produces no files.

    :param input_fasta: path to the FASTA file to split.
    :param output_dir: directory for the output files; it is passed to
        FileRoutines.save_mkdir first (presumably creating it if missing).
    :param num_of_recs_per_file: number of records per output file.
    :param num_of_files: desired number of output files.
    :param output_prefix: prefix of output file names; when None, element [1]
        of FileRoutines.split_filename(input_fasta) is used.
    :raises ValueError: if no positive chunk size can be derived from the
        arguments.
    """
    # Validate first: a missing size used to surface as an obscure TypeError,
    # and a zero size made the splitting loop spin forever.
    if num_of_files:
        if num_of_files < 0:
            raise ValueError("num_of_files must be positive, got %s" % str(num_of_files))
    elif num_of_recs_per_file is None or num_of_recs_per_file <= 0:
        raise ValueError("either a positive num_of_recs_per_file or a positive num_of_files is required")

    FileRoutines.save_mkdir(output_dir)
    if output_prefix is None:
        out_prefix = FileRoutines.split_filename(input_fasta)[1]
    else:
        out_prefix = output_prefix

    sequence_dict = None
    try:
        sequence_dict = SeqIO.index_db("temp.idx", input_fasta, "fasta")
        record_ids_list = list(sequence_dict.keys())
        number_of_records = len(record_ids_list)

        if num_of_files:
            # Ceiling division; int(n / k) + 1 overshoots by one whenever n is
            # divisible by k and therefore produced fewer files than requested.
            # max() keeps the range() step valid for an empty input.
            num_of_recs = max(1, int(-(-number_of_records // num_of_files)))
        else:
            num_of_recs = num_of_recs_per_file

        chunk_starts = range(0, number_of_records, num_of_recs)
        for split_index, start in enumerate(chunk_starts, 1):
            chunk_ids = record_ids_list[start:start + num_of_recs]
            SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, chunk_ids),
                        "%s/%s_%i.fasta" % (output_dir, out_prefix, split_index),
                        format="fasta")
    finally:
        # Always drop the temporary index: SeqIO.index_db reuses an existing
        # index file, so a leftover from a failed run would poison later calls.
        if sequence_dict is not None:
            sequence_dict.close()
        if os.path.exists("temp.idx"):
            os.remove("temp.idx")
def split_fasta_by_seq_len(self, input_fasta, output_dir, max_len_per_file=None, output_prefix=None):
    """
    Split a FASTA file into files bounded by total sequence length.

    Records are taken in index order and accumulated into a group while the
    summed sequence length stays below the limit; a group is written as soon
    as it reaches the limit exactly or the next record would exceed it.
    A record whose own length is >= the limit is written to a file of its own
    (the group accumulated so far is kept and written later).

    Output files are written as "<output_dir>/<prefix>_<index>.fasta" with
    indices starting at 1.

    :param input_fasta: path to the FASTA file to split.
    :param output_dir: directory for the output files; it is passed to
        FileRoutines.save_mkdir first (presumably creating it if missing).
    :param max_len_per_file: maximal total sequence length per output file.
        When not set (None or 0), int(total_length / self.threads) is used,
        so self.threads must then be non-zero.
    :param output_prefix: prefix of output file names; when None, element [1]
        of FileRoutines.split_filename(input_fasta) is used.
    """
    FileRoutines.save_mkdir(output_dir)
    if output_prefix is None:
        out_prefix = FileRoutines.split_filename(input_fasta)[1]
    else:
        out_prefix = output_prefix

    def write_split(records, index):
        # Single place that knows the naming scheme of the output files.
        SeqIO.write(records, "%s/%s_%i.fasta" % (output_dir, out_prefix, index), format="fasta")

    sequence_dict = None
    try:
        sequence_dict = SeqIO.index_db("temp.idx", input_fasta, "fasta")

        # Measure every record once; each lookup in the index re-parses the
        # record from disk, so the lengths are reused for both the total and
        # the splitting loop below.
        record_lengths = [(record_id, len(sequence_dict[record_id].seq))
                          for record_id in sequence_dict]

        if max_len_per_file:
            max_len = max_len_per_file
        else:
            total = sum(record_length for _, record_length in record_lengths)
            max_len = int(total / self.threads)

        split_index = 1
        id_list = []
        total_length = 0
        for record_id, record_length in record_lengths:
            if record_length >= max_len:
                # Oversized record: gets its own file, pending group untouched.
                write_split(sequence_dict[record_id], split_index)
            elif total_length + record_length > max_len:
                # Record does not fit: flush the group and start a new one.
                write_split(SequenceRoutines.record_by_id_generator(sequence_dict, id_list), split_index)
                id_list = [record_id]
                total_length = record_length
            elif total_length + record_length == max_len:
                # Record fills the group exactly: flush it including the record.
                id_list.append(record_id)
                write_split(SequenceRoutines.record_by_id_generator(sequence_dict, id_list), split_index)
                id_list = []
                total_length = 0
            else:
                # Still room left: nothing written, so the index is not advanced.
                id_list.append(record_id)
                total_length += record_length
                continue
            split_index += 1

        if id_list:
            write_split(SequenceRoutines.record_by_id_generator(sequence_dict, id_list), split_index)
    finally:
        # Always drop the temporary index: SeqIO.index_db reuses an existing
        # index file, so a leftover from a failed run would poison later calls.
        if sequence_dict is not None:
            sequence_dict.close()
        if os.path.exists("temp.idx"):
            os.remove("temp.idx")