Exemplo n.º 1
0
    def split_fasta(input_fasta,
                    output_dir,
                    num_of_recs_per_file=None,
                    num_of_files=None,
                    output_prefix=None):
        """
        Split a FASTA file into several smaller FASTA files.

        Output files are written as ``<output_dir>/<prefix>_<N>.fasta`` with
        N counting from 1. Record ids are taken in the order of the index
        keys and cut into consecutive chunks.

        By default the input is split into files holding
        num_of_recs_per_file records each (the last file holds the
        remainder). If num_of_files is set, num_of_recs_per_file is ignored
        and the chunk size is the number of records divided by num_of_files,
        rounded up, so at most num_of_files files are produced.

        :param input_fasta: path to the FASTA file to split.
        :param output_dir: directory for the output files; handed to
            FileRoutines.save_mkdir before anything is written.
        :param num_of_recs_per_file: records per output file.
        :param num_of_files: upper bound on the number of output files.
        :param output_prefix: prefix of output file names; if None, the
            second element returned by FileRoutines.split_filename for
            input_fasta is used.
        :raises ValueError: if num_of_files is negative, or if num_of_files
            is not set and num_of_recs_per_file is missing or below 1.
        """
        # Validate before touching the file system: a missing chunk size used
        # to surface as a TypeError, and a non-positive one made the splitting
        # loop run forever.
        if num_of_files:
            if num_of_files < 0:
                raise ValueError("num_of_files must be a positive number")
        elif num_of_recs_per_file is None or num_of_recs_per_file < 1:
            raise ValueError(
                "either num_of_files or a positive num_of_recs_per_file "
                "must be set")

        FileRoutines.save_mkdir(output_dir)
        out_prefix = FileRoutines.split_filename(
            input_fasta)[1] if output_prefix is None else output_prefix

        index_file = "temp.idx"
        try:
            sequence_dict = SeqIO.index_db(index_file, input_fasta, "fasta")
            record_ids_list = list(sequence_dict.keys())
            number_of_records = len(record_ids_list)

            if num_of_files:
                # Ceiling division: the previous "int(n / k) + 1" made chunks
                # one record too large whenever n was a multiple of k, which
                # produced fewer files than requested. max() keeps the range
                # step valid when the input holds no records.
                num_of_recs = max(
                    1, int(-(-number_of_records // num_of_files)))
            else:
                num_of_recs = num_of_recs_per_file

            chunk_starts = range(0, number_of_records, num_of_recs)
            for split_index, start in enumerate(chunk_starts, start=1):
                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_dict,
                    record_ids_list[start:start + num_of_recs]),
                            "%s/%s_%i.fasta" %
                            (output_dir, out_prefix, split_index),
                            format="fasta")
        finally:
            # Always drop the temporary index so a failed run does not leave
            # a stale "temp.idx" behind for the next call.
            if os.path.exists(index_file):
                os.remove(index_file)
Exemplo n.º 2
0
    def split_fasta_by_seq_len(self,
                               input_fasta,
                               output_dir,
                               max_len_per_file=None,
                               output_prefix=None):
        """
        Split a FASTA file into files bounded by total sequence length.

        Output files are written as ``<output_dir>/<prefix>_<N>.fasta`` with
        N counting from 1. Records are visited in index order:

        * a record whose length is at least the limit is written to a file
          of its own (the group being accumulated is kept and continues);
        * shorter records are accumulated into a group; the group is written
          out as soon as it reaches the limit exactly, or just before the
          record that would push it over the limit (that record then starts
          the next group);
        * whatever is left in the group after the last record is written to
          a final file.

        :param input_fasta: path to the FASTA file to split.
        :param output_dir: directory for the output files; handed to
            FileRoutines.save_mkdir before anything is written.
        :param max_len_per_file: length limit per output file. If not set
            (None or 0), the limit is the summed length of all sequences
            divided by self.threads, truncated to an integer.
        :param output_prefix: prefix of output file names; if None, the
            second element returned by FileRoutines.split_filename for
            input_fasta is used.
        """
        FileRoutines.save_mkdir(output_dir)

        out_prefix = FileRoutines.split_filename(
            input_fasta)[1] if output_prefix is None else output_prefix

        def output_path(index):
            # Single place that defines the output file naming scheme.
            return "%s/%s_%i.fasta" % (output_dir, out_prefix, index)

        index_file = "temp.idx"
        try:
            sequence_dict = SeqIO.index_db(index_file, input_fasta, "fasta")

            if max_len_per_file:
                max_len = max_len_per_file
            else:
                # The total length is only needed to derive a default limit,
                # so the extra pass over all records is skipped when the
                # caller supplies one.
                length = sum(len(sequence_dict[record_id].seq)
                             for record_id in sequence_dict)
                max_len = int(length / self.threads)

            split_index = 1
            id_list = []
            total_length = 0

            for record_id in sequence_dict:
                # Fetch once; every lookup in the index re-reads the record.
                record = sequence_dict[record_id]
                record_length = len(record.seq)

                if record_length >= max_len:
                    # Oversized record: goes to its own file, pending group
                    # is left untouched.
                    SeqIO.write(record,
                                output_path(split_index),
                                format="fasta")

                elif total_length + record_length > max_len:
                    # Record does not fit: flush the group, then start a new
                    # group with this record.
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_dict, id_list),
                                output_path(split_index),
                                format="fasta")
                    total_length = record_length
                    id_list = [record_id]

                elif total_length + record_length == max_len:
                    # Record fills the group exactly: add it and flush.
                    id_list.append(record_id)
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_dict, id_list),
                                output_path(split_index),
                                format="fasta")
                    total_length = 0
                    id_list = []

                else:
                    # Still room in the group: no file written, so the file
                    # counter must not advance.
                    id_list.append(record_id)
                    total_length += record_length
                    continue

                split_index += 1

            if id_list:
                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_dict, id_list),
                            output_path(split_index),
                            format="fasta")
        finally:
            # Always drop the temporary index so a failed run does not leave
            # a stale "temp.idx" behind for the next call.
            if os.path.exists(index_file):
                os.remove(index_file)