def _load_encode_rna(self, class_files):
    """Load RNA records from ``class_files`` into ``self.data`` and ``self.labels``.

    Both lists are reset first. Every file is opened through
    ``io.get_handle`` and iterated with ``io.parse_fasta(handle, "_")``;
    each record body is then split on ``"_"`` into its parts
    (presumably one part per line of the record -- confirm against
    ``io.parse_fasta``).

    * Part 0 is the sequence. It is upper-cased and every character that is
      not in ``self.alpha_coder.alph0`` is replaced by a randomly chosen
      character of ``alph0``.
    * If ``self.is_rna_pwm`` is set, parts ``1 .. len(alph1)`` are parsed as
      whitespace-separated floats and become the columns of a
      ``(len(sequence), len(alph1))`` float32 matrix, which is combined with
      the sequence by ``self._join_seq_pwm``.
    * Otherwise part 1, cut at its first space and upper-cased, is the
      structure string. Characters outside ``self.alpha_coder.alph1`` are
      replaced by a random character of ``alph1``; sequence and structure
      are passed through ``self.alpha_coder.encode`` and then
      ``self.one_hot_encoder.encode``.

    Labels: with ``self.multilabel`` the record header is parsed as
    comma-separated integers, otherwise the label is ``[class_id]``, the
    position of the file in ``class_files``.

    Args:
        class_files: Iterable of file names, one per class.

    Raises:
        IndexError: If a record has fewer parts than required.
        ValueError: If a PWM value or a multilabel header entry is not
            numeric.
    """
    self.data, self.labels = [], []
    seq_alphabet = self.alpha_coder.alph0
    struct_alphabet = self.alpha_coder.alph1

    # Compile once: a negated character class matching anything that is not
    # part of the respective alphabet.
    seq_pattern = re.compile(r"[^{}]".format(re.escape(seq_alphabet)))
    struct_pattern = re.compile(r"[^{}]".format(re.escape(struct_alphabet)))

    def random_seq_symbol(_match):
        return choice(seq_alphabet)

    def random_struct_symbol(_match):
        return choice(struct_alphabet)

    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        try:
            for header, block in io.parse_fasta(handle, "_"):
                lines = block.split("_")
                sequence = seq_pattern.sub(random_seq_symbol, lines[0].upper())
                if self.is_rna_pwm:
                    pwm = np.zeros(
                        (len(sequence), len(struct_alphabet)),
                        dtype=np.float32,
                    )
                    # Part ``column + 1`` of the record fills matrix column
                    # ``column``; explicit indexing keeps the IndexError on
                    # truncated records.
                    for column in range(pwm.shape[1]):
                        pwm[:, column] = [
                            float(value) for value in lines[column + 1].split()
                        ]
                    self.data.append(self._join_seq_pwm(sequence, pwm))
                else:
                    # Only the text before the first space is the structure.
                    annotation = lines[1].split(" ")[0].upper()
                    structure = struct_pattern.sub(random_struct_symbol, annotation)
                    joined = self.alpha_coder.encode((sequence, structure))
                    self.data.append(self.one_hot_encoder.encode(joined))
                if self.multilabel:
                    self.labels.append([int(label) for label in header.split(",")])
                else:
                    self.labels.append([class_id])
        finally:
            # Close the handle even when parsing or encoding fails.
            handle.close()
def _load_encode_dna(self, class_files):
    """Load DNA records from ``class_files`` into ``self.data`` and ``self.labels``.

    Both lists are reset first. Every file is opened through
    ``io.get_handle`` and iterated with ``io.parse_fasta``. Each sequence is
    upper-cased, every ``N``, ``Y``, ``M``, ``R``, ``W`` or ``K`` is replaced
    by a randomly chosen element of ``self.one_hot_encoder.alphabet``, and
    the result is encoded with ``self.one_hot_encoder.encode``.

    Labels: with ``self.multilabel`` the record header is parsed as
    comma-separated integers, otherwise the label is ``[class_id]``, the
    position of the file in ``class_files``.

    Args:
        class_files: Iterable of file names, one per class.

    Raises:
        ValueError: If a multilabel header entry is not an integer.
    """
    self.data, self.labels = [], []
    ambiguous = re.compile(r"[NYMRWK]")

    def random_symbol(_match):
        return choice(self.one_hot_encoder.alphabet)

    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        try:
            for header, sequence in io.parse_fasta(handle):
                sequence = ambiguous.sub(random_symbol, sequence.upper())
                self.data.append(self.one_hot_encoder.encode(sequence))
                if self.multilabel:
                    self.labels.append([int(label) for label in header.split(",")])
                else:
                    self.labels.append([class_id])
        finally:
            # Close the handle even when parsing or encoding fails.
            handle.close()
def _load_encode_rna(self, class_files):
    """Load RNA sequence/structure records into ``self.data`` and ``self.labels``.

    Both lists are reset first. Every file is opened through
    ``io.get_handle`` and iterated with ``io.parse_fasta(handle, "_")``;
    each record body is then split on ``"_"`` into its parts
    (presumably one part per line of the record -- confirm against
    ``io.parse_fasta``).

    * Part 0 is the sequence. It is upper-cased and every ``N``, ``Y``,
      ``M``, ``R``, ``W`` or ``K`` is replaced by a randomly chosen character
      of ``self.alpha_coder.alph0``.
    * The structure is read from part 2 when ``self.alpha_coder.alph1`` equals
      ``"HIMS"`` and from part 1 otherwise. It is cut at its first space,
      upper-cased, and every ``F`` or ``T`` is replaced by a randomly chosen
      character of ``alph1``.
    * Sequence and structure are passed through ``self.alpha_coder.encode``
      and then ``self.one_hot_encoder.encode``.

    Labels: with ``self.multilabel`` the record header is parsed as
    comma-separated integers, otherwise the label is ``[class_id]``, the
    position of the file in ``class_files``.

    Args:
        class_files: Iterable of file names, one per class.

    Raises:
        IndexError: If a record has fewer parts than required.
        ValueError: If a multilabel header entry is not an integer.
    """
    self.data, self.labels = [], []
    # Which record part holds the structure depends on the structure alphabet.
    structure_index = 2 if self.alpha_coder.alph1 == "HIMS" else 1
    ambiguous_seq = re.compile(r"[NYMRWK]")
    ambiguous_struct = re.compile(r"[FT]")

    def random_seq_symbol(_match):
        return choice(self.alpha_coder.alph0)

    def random_struct_symbol(_match):
        return choice(self.alpha_coder.alph1)

    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        try:
            for header, block in io.parse_fasta(handle, "_"):
                lines = block.split("_")
                # Upper-case first so lower-case ambiguity codes are replaced
                # too, the same way the structure string is handled below.
                sequence = ambiguous_seq.sub(random_seq_symbol, lines[0].upper())
                # Only the text before the first space is the structure.
                annotation = lines[structure_index].split(" ")[0].upper()
                structure = ambiguous_struct.sub(random_struct_symbol, annotation)
                joined = self.alpha_coder.encode((sequence, structure))
                self.data.append(self.one_hot_encoder.encode(joined))
                if self.multilabel:
                    self.labels.append([int(label) for label in header.split(",")])
                else:
                    self.labels.append([class_id])
        finally:
            # Close the handle even when parsing or encoding fails.
            handle.close()