def __missing__(self, k):
    """Load, convert and store the sequences registered under ``k``.

    Python calls ``__missing__`` on a failed key lookup when the method
    belongs to a ``dict`` subclass (the enclosing class is not visible in
    this chunk -- confirm).  The entry ``sequence_files[k]`` is handed to
    ``convert_seqs``, each item it yields is passed through
    ``seq_to_numpy``, and the resulting list is stored under ``k``.

    Returns the value read back from ``self[k]`` after the assignment.
    Whatever ``sequence_files[k]`` raises for an unknown ``k`` propagates.
    """
    source = sequence_files[k]
    converted = []
    for raw_seq in convert_seqs(source):
        converted.append(seq_to_numpy(raw_seq))
    self[k] = converted
    # Read back through the container rather than returning the local list.
    return self[k]
def __missing__(self, key):
    """Build and store the (positive, negative) sequence lists for ``key``.

    ``key`` is unpacked as ``(dataset, cross_fold_idx)``.  The positive set
    is read from ``sequence_files['<dataset>-<cross_fold_idx>-validate']``
    via ``convert_seqs`` and each item is passed through ``seq_to_numpy``.
    One negative sequence is then drawn from
    ``self.negative_seq_generators[dataset]`` per positive sequence,
    truncated to the positive's length and converted the same way.

    Returns the tuple ``(positive_seqs, negative_seqs)`` as read back from
    ``self[key]`` after it has been stored.

    Raises:
        RuntimeError: if the negative source yields fewer sequences than
            there are positives, or if any truncated negative is shorter
            than its positive counterpart.
    """
    dataset, cross_fold_idx = key

    validate_name = '%s-%d-validate' % (dataset, cross_fold_idx)
    positive_seqs = []
    for raw_seq in convert_seqs(sequence_files[validate_name]):
        positive_seqs.append(seq_to_numpy(raw_seq))

    # load the negative set and match the lengths of the positive sequences
    # The positives go first in zip() so that iteration stops as soon as they
    # run out, without pulling an extra item from the negative source.
    negative_source = self.negative_seq_generators[dataset]
    negative_seqs = []
    for pos, neg in zip(positive_seqs, negative_source):
        trimmed = neg[:len(pos)]
        negative_seqs.append(seq_to_numpy(trimmed))

    # zip() stops at the shorter input, so a short negative source shows up
    # here as a count mismatch.
    num_positive = len(positive_seqs)
    num_negative = len(negative_seqs)
    if num_negative != num_positive:
        raise RuntimeError(
            'Not enough sequences in negative set to match positive sequences. %d positive, %d negative sequences'
            % (num_positive, num_negative)
        )

    # Slicing cannot lengthen a sequence, so a negative that was too short
    # before truncation is still too short now.
    for i, (pos, neg) in enumerate(zip(positive_seqs, negative_seqs)):
        if len(neg) >= len(pos):
            continue
        raise RuntimeError(
            'Not enough bases in negative sequence %d to match length of positive sequence: positive sequence has %d bases and negative sequence has %d bases'
            % (i, pos.shape[0], neg.shape[0])
        )

    self[key] = (positive_seqs, negative_seqs)
    return self[key]
K_min, K_max, max_mismatches, ) if os.system(cmd): raise RuntimeError("Could not run risotto with args: %s" % cmd) return _parse_output(output_file) finally: if os.access(alphabet_file, os.R_OK): os.remove(alphabet_file) if os.access(fasta_file, os.R_OK): os.remove(fasta_file) if os.access(output_file, os.R_OK): os.remove(output_file) if "__main__" == __name__: logging.basicConfig(level=logging.DEBUG) from sequences import convert_seqs, rev_comp, numpy_to_seq, seq_to_numpy test_seq_file = "synthetic-2/synthetic-sequences-K10-g0.50-N200-L200-seed4-0.fa" sequences = convert_seqs(test_seq_file) # sequences.extend([numpy_to_seq(rev_comp(seq_to_numpy(s))) for s in sequences]) # sequences[0] = sequences[0].replace('a','n') results = risotto(sequences, max_mismatches=1) results.sort(key=lambda x: x[2]) # sort by number of sequences that motif is found in print results[-10:]