예제 #1
0
def load_reads(seq_file, label=None, max_len=100):
    """Load the records of a sequence file, optionally labelled.

    The file format string comes from ``get_seq_format``: a format ending in
    "gz" is opened through ``gzip.open`` in text mode (plain ``open``
    otherwise), and a format starting with "fa" is parsed as "fasta"
    ("fastq" otherwise).

    Args:
        seq_file: Path of the sequence file to read.
        label: When ``None`` the records yielded by ``seq_parser`` are
            returned unchanged. Otherwise every record contributes two
            ``(label, sequence)`` tuples built from the pair returned by
            ``get_read_rc_with_maxlen`` (presumably the read and its reverse
            complement -- confirm against that helper).
        max_len: Maximum length forwarded to ``get_read_rc_with_maxlen``.
            Only used when ``label`` is given.

    Returns:
        A list of raw records (``label is None``) or of ``(label, sequence)``
        tuples, two per record.
    """
    seq_format = get_seq_format(seq_file)
    _open = partial(gzip.open, mode='rt') if seq_format.endswith(
        "gz") else open
    seq_type = "fasta" if seq_format.startswith("fa") else "fastq"

    with _open(seq_file) as fh:
        records = seq_parser(fh, seq_type)
        if label is None:
            # Materialise before the file handle is closed.
            return list(records)

        read_list = []
        for record in records:
            # record[1] is the sequence field of the parsed record.
            # Forward the caller's max_len instead of a hard-coded 100.
            read, rc_read = get_read_rc_with_maxlen(record[1], max_len=max_len)
            read_list.extend([(label, read), (label, rc_read)])
    return read_list
예제 #2
0
def load_seqs(seq_file, label, step_size):
    """Collect every sequence of a file together with its reverse complement.

    The opener and parser type are chosen from the format string returned by
    ``get_seq_format``: formats ending in "gz" are read through ``gzip.open``
    in text mode, and formats starting with "fa" are parsed as "fasta"
    (anything else as "fastq").

    Args:
        seq_file: Path of the sequence file to read.
        label: Value stored as the first element of every output tuple.
        step_size: Value stored as the third element of every output tuple.

    Returns:
        A list with two ``(label, sequence, step_size)`` tuples per record:
        first the sequence as parsed, then its reverse complement as a string.
    """
    fmt = get_seq_format(seq_file)
    if fmt.endswith("gz"):
        opener = partial(gzip.open, mode='rt')
    else:
        opener = open
    if fmt.startswith("fa"):
        parser_kind = "fasta"
    else:
        parser_kind = "fastq"

    entries = []
    with opener(seq_file) as handle:
        for rec in seq_parser(handle, parser_kind):
            forward = rec[1]
            reverse = str(Seq(forward).reverse_complement())
            entries.append((label, forward, step_size))
            entries.append((label, reverse, step_size))
    return entries
예제 #3
0
def get_seq_chunks(seq_file, chunk_size=1048576):
    """Yield the records of a sequence file in lists of bounded size.

    This is a generator: the file is opened on the first ``next()`` and stays
    open while the generator is suspended between chunks.

    Args:
        seq_file: Path of the sequence file. The format string from
            ``get_seq_format`` selects gzip text mode (suffix "gz") and the
            parser type ("fasta" for a "fa" prefix, "fastq" otherwise).
        chunk_size: Maximum number of records per yielded list.

    Yields:
        Non-empty lists of records from ``seq_parser``, each holding at most
        ``chunk_size`` items; iteration stops at the first empty chunk.
    """
    fmt = get_seq_format(seq_file)
    if fmt.endswith("gz"):
        opener = partial(gzip.open, mode='rt')
    else:
        opener = open
    parser_kind = "fasta" if fmt.startswith("fa") else "fastq"

    with opener(seq_file) as handle:
        records = seq_parser(handle, parser_kind)

        def next_chunk():
            # Pull up to chunk_size records from the shared iterator.
            return list(islice(records, chunk_size))

        # iter(callable, sentinel) stops as soon as a chunk comes back empty.
        for chunk in iter(next_chunk, []):
            yield chunk
예제 #4
0
def all_seqs_x(seq_file, min_seq_length):
    """Convert every sequence of a file into its feature representation.

    Args:
        seq_file: Path of the sequence file. The format string from
            ``get_seq_format`` selects gzip text mode (suffix "gz") and the
            parser type ("fasta" for a "fa" prefix, "fastq" otherwise).
        min_seq_length: Forwarded unchanged as the second argument of
            ``seq_to_feature``.

    Returns:
        A list holding the result of ``seq_to_feature`` for each record, in
        file order.
    """
    seq_format = get_seq_format(seq_file)
    _open = partial(gzip.open,
                    mode='rt') if seq_format.endswith("gz") else open
    seq_type = "fasta" if seq_format.startswith("fa") else "fastq"
    with _open(seq_file) as fh:
        # record[1] is the sequence field of the parsed record. Appending to a
        # local list cannot raise NameError, so the former try/except around
        # the append was dead code and has been dropped.
        return [seq_to_feature(record[1], min_seq_length)
                for record in seq_parser(fh, seq_type)]
예제 #5
0
    def output_seq(self):
        """Split the input reads into non-rRNA and (optionally) rRNA output files.

        Reads ``self.input``, ``self.output``, ``self.rrna``,
        ``self.pred_labels`` and ``self.args.ensure``; logs through
        ``self.logger``.

        Paired mode (``len(self.pred_labels) == 2``): the two input files are
        iterated in lockstep. A pair whose two labels are both 0 goes to the
        non-rRNA outputs; both 1 goes to the rRNA outputs. Any other pair is
        routed by ``self.args.ensure``: "rrna" sends it to the non-rRNA
        outputs, "norrna" sends it to the rRNA outputs, and any other value
        drops the pair.

        Single mode: a read labelled 0 goes to the non-rRNA output, every
        other read goes to the rRNA output.

        rRNA records are only written when ``self.rrna`` is not ``None``.
        All files are closed when the method exits, including on error.
        """
        # Function-scope import keeps this method self-contained.
        from contextlib import ExitStack

        def as_text(record):
            # A parsed record is a sequence of lines; restore the file layout.
            return '\n'.join(record) + '\n'

        norrna_count = 0
        seq_format = get_seq_format(self.input[0])
        seq_type = "fasta" if seq_format.startswith("fa") else "fastq"
        _open = partial(gzip.open, mode='rt') if seq_format.endswith("gz") else open

        self.logger.info('Writing output non-rRNA sequences into file: {}...'.format(", ".join(self.output)))

        # ExitStack closes every handle registered below (in reverse order of
        # opening) even when parsing or writing raises part-way through.
        with ExitStack() as stack:
            if len(self.pred_labels) == 2:
                rrna1_fh = rrna2_fh = None
                if self.rrna is not None:
                    self.logger.info('Writing output rRNA sequences into file: {}...'.format(", ".join(self.rrna)))
                    rrna1_fh = stack.enter_context(open(self.rrna[0], "w"))
                    rrna2_fh = stack.enter_context(open(self.rrna[1], "w"))
                out1_fh = stack.enter_context(open(self.output[0], "w"))
                out2_fh = stack.enter_context(open(self.output[1], "w"))
                r1_fh = stack.enter_context(_open(self.input[0]))
                r2_fh = stack.enter_context(_open(self.input[1]))

                pairs = zip(seq_parser(r1_fh, seq_type), seq_parser(r2_fh, seq_type))
                for idx, (r1, r2) in enumerate(tqdm(pairs)):
                    label1 = self.pred_labels[0][idx]
                    label2 = self.pred_labels[1][idx]

                    # Decide the destination once, then write in one place.
                    if label1 == label2 == 0:
                        is_rrna = False
                    elif label1 == label2 == 1:
                        is_rrna = True
                    elif self.args.ensure == "rrna":
                        # Keep the rRNA output confident: uncertain pairs
                        # are written with the non-rRNA reads.
                        is_rrna = False
                    elif self.args.ensure == "norrna":
                        # Keep the non-rRNA output confident: uncertain
                        # pairs are written with the rRNA reads.
                        is_rrna = True
                    else:
                        # Uncertain pairs are discarded entirely.
                        continue

                    if not is_rrna:
                        out1_fh.write(as_text(r1))
                        out2_fh.write(as_text(r2))
                        norrna_count += 1
                    elif rrna1_fh is not None:
                        rrna1_fh.write(as_text(r1))
                        rrna2_fh.write(as_text(r2))
            else:
                rrna_fh = None
                if self.rrna is not None:
                    self.logger.info('Writing output rRNA sequences into file: {}...'.format(", ".join(self.rrna)))
                    rrna_fh = stack.enter_context(open(self.rrna[0], "w"))
                out_fh = stack.enter_context(open(self.output[0], "w"))
                read_fh = stack.enter_context(_open(self.input[0]))

                for idx, record in enumerate(tqdm(seq_parser(read_fh, seq_type))):
                    if self.pred_labels[0][idx] == 0:
                        out_fh.write(as_text(record))
                        norrna_count += 1
                    elif rrna_fh is not None:
                        rrna_fh.write(as_text(record))

        self.logger.info('Finished writing {} non-rRNA sequences!'.format(norrna_count))