def load_reads(seq_file, label=None, max_len=100):
    """Load the records of a FASTA/FASTQ file, optionally labelled.

    The file format is taken from ``get_seq_format(seq_file)``: a format
    string ending in ``"gz"`` is opened with ``gzip`` in text mode, and a
    format string starting with ``"fa"`` is parsed as FASTA (anything else
    as FASTQ).

    Args:
        seq_file: Path of the sequence file to read.
        label: When ``None`` the parsed records are returned unchanged.
            Otherwise every record contributes two ``(label, read)``
            tuples, built from the pair returned by
            ``get_read_rc_with_maxlen``.
        max_len: Forwarded to ``get_read_rc_with_maxlen`` as ``max_len``.
            Only used when ``label`` is given.

    Returns:
        A list of raw parser records (``label is None``) or a list of
        ``(label, read)`` tuples, two per input record.
    """
    seq_format = get_seq_format(seq_file)
    _open = partial(gzip.open, mode='rt') if seq_format.endswith("gz") else open
    seq_type = "fasta" if seq_format.startswith("fa") else "fastq"

    read_list = []
    with _open(seq_file) as fh:
        records = seq_parser(fh, seq_type)
        if label is None:
            read_list.extend(records)
        else:
            for record in records:
                # record[1] is the field the original code passed on as the
                # sequence. Honour the caller's max_len instead of a
                # hard-coded 100.
                read, rc_read = get_read_rc_with_maxlen(record[1], max_len=max_len)
                read_list.append((label, read))
                read_list.append((label, rc_read))
    return read_list
def load_seqs(seq_file, label, step_size):
    """Read every sequence of a file together with its reverse complement.

    The file format is taken from ``get_seq_format(seq_file)``: a format
    string ending in ``"gz"`` is opened with ``gzip`` in text mode, and a
    format string starting with ``"fa"`` is parsed as FASTA (anything else
    as FASTQ).

    Args:
        seq_file: Path of the sequence file to read.
        label: Value stored as the first element of every returned tuple.
        step_size: Value stored as the third element of every returned tuple.

    Returns:
        A list with two tuples per record, in file order:
        ``(label, sequence, step_size)`` followed by
        ``(label, reverse_complement, step_size)``.
    """
    fmt = get_seq_format(seq_file)
    opener = partial(gzip.open, mode='rt') if fmt.endswith("gz") else open
    seq_type = "fasta" if fmt.startswith("fa") else "fastq"

    entries = []
    with opener(seq_file) as handle:
        for record in seq_parser(handle, seq_type):
            forward = record[1]
            reverse = str(Seq(forward).reverse_complement())
            entries.append((label, forward, step_size))
            entries.append((label, reverse, step_size))
    return entries
def get_seq_chunks(seq_file, chunk_size=1048576):
    """Yield the records of a sequence file in consecutive lists.

    The file format is taken from ``get_seq_format(seq_file)``: a format
    string ending in ``"gz"`` is opened with ``gzip`` in text mode, and a
    format string starting with ``"fa"`` is parsed as FASTA (anything else
    as FASTQ).

    Args:
        seq_file: Path of the sequence file to read.
        chunk_size: Maximum number of records per yielded list.

    Yields:
        Non-empty lists of parser records, each holding at most
        ``chunk_size`` items. Iteration stops at the first empty chunk.
    """
    fmt = get_seq_format(seq_file)
    opener = partial(gzip.open, mode='rt') if fmt.endswith("gz") else open
    seq_type = "fasta" if fmt.startswith("fa") else "fastq"

    # The file stays open while the generator is suspended between chunks,
    # because every yield happens inside the ``with`` block.
    with opener(seq_file) as handle:
        records = seq_parser(handle, seq_type)
        chunk = list(islice(records, chunk_size))
        while chunk:
            yield chunk
            chunk = list(islice(records, chunk_size))
def all_seqs_x(seq_file, min_seq_length):
    """Convert every sequence of a file into its feature representation.

    The file format is taken from ``get_seq_format(seq_file)``: a format
    string ending in ``"gz"`` is opened with ``gzip`` in text mode, and a
    format string starting with ``"fa"`` is parsed as FASTA (anything else
    as FASTQ).

    Args:
        seq_file: Path of the sequence file to read.
        min_seq_length: Forwarded unchanged as the second argument of
            ``seq_to_feature``.

    Returns:
        A list holding the result of ``seq_to_feature`` for each record,
        in file order.
    """
    seq_format = get_seq_format(seq_file)
    _open = partial(gzip.open, mode='rt') if seq_format.endswith("gz") else open
    seq_type = "fasta" if seq_format.startswith("fa") else "fastq"

    # The former ``try/except NameError`` around ``list.append`` could never
    # trigger (both names are always bound), so it has been dropped.
    with _open(seq_file) as fh:
        return [
            seq_to_feature(record[1], min_seq_length)
            for record in seq_parser(fh, seq_type)
        ]
def output_seq(self):
    """Write the input reads to the non-rRNA / rRNA output files.

    Reads are routed by the labels in ``self.pred_labels``; label ``0`` goes
    to ``self.output`` and is counted as non-rRNA. When ``self.rrna`` is
    ``None`` the reads that would go to the rRNA files are dropped.

    Paired-end mode (``len(self.pred_labels) == 2``) handles each pair by
    the two labels at the same index:

    * both ``0``: written to ``self.output[0]`` / ``self.output[1]``;
    * both ``1``: written to ``self.rrna[0]`` / ``self.rrna[1]``;
    * anything else: decided by ``self.args.ensure``. ``"rrna"`` sends the
      pair to the non-rRNA outputs, ``"norrna"`` sends it to the rRNA
      outputs, and any other value drops the pair.

    Single-end mode writes label ``0`` reads to ``self.output[0]`` and all
    others to ``self.rrna[0]``.

    Each record is written as its fields joined by newlines plus a trailing
    newline. The input format (FASTA/FASTQ, gzip or plain) is derived from
    ``get_seq_format(self.input[0])``.

    Returns:
        None. The number of non-rRNA records written is logged.
    """
    # Local import so the block stays self-contained.
    from contextlib import ExitStack

    def _dump(handle, record):
        handle.write('\n'.join(record) + '\n')

    norrna_count = 0
    seq_format = get_seq_format(self.input[0])
    seq_type = "fasta" if seq_format.startswith("fa") else "fastq"
    _open = partial(gzip.open, mode='rt') if seq_format.endswith("gz") else open
    keep_rrna = self.rrna is not None

    self.logger.info('Writing output non-rRNA sequences into file: {}...'.format(", ".join(self.output)))

    # ExitStack closes every handle (including the optional rRNA ones) even
    # if an exception is raised while reading or writing.
    with ExitStack() as stack:
        if len(self.pred_labels) == 2:
            rrna1_fh = rrna2_fh = None
            if keep_rrna:
                self.logger.info('Writing output rRNA sequences into file: {}...'.format(", ".join(self.rrna)))
                rrna1_fh = stack.enter_context(open(self.rrna[0], "w"))
                rrna2_fh = stack.enter_context(open(self.rrna[1], "w"))
            out1_fh = stack.enter_context(open(self.output[0], "w"))
            out2_fh = stack.enter_context(open(self.output[1], "w"))
            r1_fh = stack.enter_context(_open(self.input[0]))
            r2_fh = stack.enter_context(_open(self.input[1]))

            pairs = zip(seq_parser(r1_fh, seq_type), seq_parser(r2_fh, seq_type))
            for idx, (r1, r2) in enumerate(tqdm(pairs)):
                label1 = self.pred_labels[0][idx]
                label2 = self.pred_labels[1][idx]
                if label1 == label2 == 0:
                    to_norrna, to_rrna = True, False
                elif label1 == label2 == 1:
                    to_norrna, to_rrna = False, True
                else:
                    # Any other label combination: the ``ensure`` setting
                    # decides which side receives the pair (or neither).
                    to_norrna = self.args.ensure == "rrna"
                    to_rrna = self.args.ensure == "norrna"

                if to_norrna:
                    _dump(out1_fh, r1)
                    _dump(out2_fh, r2)
                    norrna_count += 1
                elif to_rrna and keep_rrna:
                    _dump(rrna1_fh, r1)
                    _dump(rrna2_fh, r2)
        else:
            rrna_fh = None
            if keep_rrna:
                self.logger.info('Writing output rRNA sequences into file: {}...'.format(", ".join(self.rrna)))
                rrna_fh = stack.enter_context(open(self.rrna[0], "w"))
            out_fh = stack.enter_context(open(self.output[0], "w"))
            read_fh = stack.enter_context(_open(self.input[0]))

            for idx, record in enumerate(tqdm(seq_parser(read_fh, seq_type))):
                if self.pred_labels[0][idx] == 0:
                    _dump(out_fh, record)
                    norrna_count += 1
                elif keep_rrna:
                    _dump(rrna_fh, record)

    self.logger.info('Finished writing {} non-rRNA sequences!'.format(norrna_count))