예제 #1
0
 def __missing__(self, k):
     """Fill in, cache and return the entry for the absent key ``k``.

     Every sequence yielded by ``convert_seqs(sequence_files[k])`` is passed
     through ``seq_to_numpy``; the resulting list is stored under ``k`` so the
     conversion is not repeated on later lookups.

     NOTE(review): presumably the enclosing class is a ``dict`` subclass that
     relies on this hook for lazy loading -- the class header is not visible
     here, confirm.
     """
     converted = []
     for raw_seq in convert_seqs(sequence_files[k]):
         converted.append(seq_to_numpy(raw_seq))
     self[k] = converted
     return self[k]
예제 #2
0
    def __missing__(self, key):
        """Build, cache and return the validation sequences for ``key``.

        ``key`` is unpacked as a ``(dataset, cross_fold_idx)`` pair.

        Positive sequences are read from
        ``sequence_files['<dataset>-<cross_fold_idx>-validate']``. One
        negative sequence is then taken from
        ``self.negative_seq_generators[dataset]`` for each positive one and
        truncated to that positive sequence's length. Both sets are converted
        with ``seq_to_numpy``.

        Returns:
            The tuple ``(positive_seqs, negative_seqs)``, which is also
            stored under ``key``.

        Raises:
            RuntimeError: if the negative source yields fewer sequences than
                there are positive sequences, or if any negative sequence is
                shorter than the positive sequence it is paired with.
        """
        dataset, cross_fold_idx = key

        # Positive examples come from this fold's validation file.
        validate_name = '%s-%d-validate' % (dataset, cross_fold_idx)
        positive_seqs = []
        for raw_seq in convert_seqs(sequence_files[validate_name]):
            positive_seqs.append(seq_to_numpy(raw_seq))

        # Pair each positive with the next negative and trim the negative to
        # the positive's length. zip() stops at the shorter input, so a
        # shortfall in negatives shows up as a shorter list below.
        negative_seqs = []
        for pos, neg in zip(positive_seqs, self.negative_seq_generators[dataset]):
            negative_seqs.append(seq_to_numpy(neg[:len(pos)]))

        num_positive = len(positive_seqs)
        num_negative = len(negative_seqs)
        if num_negative != num_positive:
            raise RuntimeError('Not enough sequences in negative set to match positive sequences. %d positive, %d negative sequences' % (num_positive, num_negative))

        # Slicing silently returns fewer bases when the negative is too short,
        # so verify each pair explicitly.
        for i, (pos, neg) in enumerate(zip(positive_seqs, negative_seqs)):
            if len(neg) >= len(pos):
                continue
            raise RuntimeError(
              'Not enough bases in negative sequence %d to match length of positive sequence: positive sequence has %d bases and negative sequence has %d bases' % (
                i,
                pos.shape[0],
                neg.shape[0]
              )
            )

        self[key] = (positive_seqs, negative_seqs)
        return self[key]
예제 #3
0
파일: risotto.py 프로젝트: JohnReid/HMM
            K_min,
            K_max,
            max_mismatches,
        )
        if os.system(cmd):
            raise RuntimeError("Could not run risotto with args: %s" % cmd)

        return _parse_output(output_file)

    finally:
        if os.access(alphabet_file, os.R_OK):
            os.remove(alphabet_file)
        if os.access(fasta_file, os.R_OK):
            os.remove(fasta_file)
        if os.access(output_file, os.R_OK):
            os.remove(output_file)


if __name__ == "__main__":
    # Manual smoke test: run risotto on one synthetic FASTA file and show the
    # ten motifs with the largest value in field 2 of each result.
    logging.basicConfig(level=logging.DEBUG)

    from sequences import convert_seqs, rev_comp, numpy_to_seq, seq_to_numpy

    test_seq_file = "synthetic-2/synthetic-sequences-K10-g0.50-N200-L200-seed4-0.fa"
    sequences = convert_seqs(test_seq_file)
    # Optional experiments, disabled by default:
    #   add the reverse complement of every sequence
    # sequences.extend([numpy_to_seq(rev_comp(seq_to_numpy(s))) for s in sequences])
    #   replace the 'a' bases of the first sequence with 'n'
    # sequences[0] = sequences[0].replace('a','n')
    results = risotto(sequences, max_mismatches=1)
    results.sort(key=lambda x: x[2])  # sort by number of sequences that motif is found in
    # Single-argument call form prints identically on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(results[-10:])