def __init__(self, data, motifs, row_indices=None, background=None):
    """Build a PSSM from counts, frequencies or ready-made scores.

    Parameters
    ----------
    data
        2D array-like. It is interpreted, in this order, as

        - counts, if the minimum is >= 0 and the maximum is > 1; each row
          is divided by its row sum to give frequencies
        - frequencies, if all values are >= 0 and the (non-NaN) row sums
          are close to 1; these are converted to log-odds scores as
          ``safe_log(data) - safe_log(background)``
        - scores, if it contains both negative and positive values; these
          are stored unchanged
    motifs
        the motifs labelling the columns of ``data``
    row_indices
        passed through unchanged to the parent class constructor
    background
        background frequencies, one per motif. Only used when ``data`` is
        counts or frequencies. Defaults to equal frequencies.

    Raises
    ------
    ValueError
        if ``data`` matches none of the three interpretations above
    AssertionError
        if ``background`` and ``motifs`` differ in length
    """
    values = numpy.array(data)
    totals = values.sum(axis=1)

    # counts data: rescale each row so it sums to 1
    if values.min() >= 0 and values.max() > 1:
        values = values / numpy.vstack(totals)
        totals = values.sum(axis=1)

    # freqs data: non-negative and every (non-NaN) row total is ~1
    is_freqs = (values >= 0).all() and numpy.allclose(
        totals[numpy.logical_not(numpy.isnan(totals))], 1
    )

    if is_freqs:
        # standard PSSM object creation, log-odds relative to background
        if background is None:
            num_motifs = len(motifs)
            background = numpy.ones(num_motifs, dtype=float) / num_motifs
        self._background = numpy.array(background)
        assert len(background) == len(
            motifs
        ), "Mismatch between number of motifs and the background"
        validate_freqs_array(self._background)
        values = safe_log(values) - safe_log(self._background)
    elif not (values.min() < 0 < values.max()):
        # neither counts, freqs nor something that looks like scores
        raise ValueError("PSSM has been supplied invalid data")

    # at this point ``values`` holds pssm scores, computed or as supplied
    super(PSSM, self).__init__(values, motifs, row_indices=row_indices, dtype=float)
    self._indices = numpy.arange(self.shape[0])  # used for scoring
def test_write_tabular_pssm(self):
    """correctly writes tabular data for PSSM"""
    # data from test_profile
    data = numpy.array(
        [
            [0.1, 0.3, 0.5, 0.1],
            [0.25, 0.25, 0.25, 0.25],
            [0.05, 0.8, 0.05, 0.1],
            [0.7, 0.1, 0.1, 0.1],
            [0.6, 0.15, 0.05, 0.2],
        ]
    )
    pssm = PSSM(data, "ACTG")
    # expected log-odds scores relative to an equifrequent background
    expected = safe_log(data) - safe_log(numpy.array([0.25, 0.25, 0.25, 0.25]))
    loader = io_app.load_tabular(sep="\t")
    with TemporaryDirectory(dir=".") as dirname:
        writer = io_app.write_tabular(data_path=dirname, format="tsv")
        outpath = join(dirname, "delme.tsv")
        writer.write(pssm, identifier=outpath)
        new = loader(outpath)

        # NOTE(review): assumes the written table has one row per
        # (position, motif) pair in row-major order with the score in
        # column 2 -- this is the layout the previous ``i // 4`` index
        # arithmetic implied; confirm against the writer.
        # Every entry is checked. The earlier loop only covered the first
        # len(expected) rows and derived the column as ``i - i // 4``,
        # which is wrong once i >= 4.
        for i, want in enumerate(expected.ravel()):
            self.assertTrue(numpy.isclose(new.array[i][2], want, atol=0.0001))
def relative_entropy_terms(self, background=None):
    """
    Computes the per-motif relative entropy terms for every row.

    Each term is ``b * (safe_log(b) - safe_log(x))`` where ``b`` is the
    background value of a motif and ``x`` the corresponding element of
    ``self.array``.

    Parameters
    ----------
    background : dict
        {motif_1: prob_1, motif_2: prob_2, ...} is the specified background
        distribution. Motifs absent from the dict are assigned 0.

    Returns
    -------
    DictArray

    Notes
    -----
    If background is type None, it defaults to equifrequent.
    The background is checked with ``validate_freqs_array`` before use.
    """
    if background is not None:
        # order the supplied values to match self.motifs
        bg_freqs = array([background.get(motif, 0) for motif in self.motifs])
    else:
        motif_count = len(self.motifs)
        bg_freqs = array([1 / motif_count] * motif_count)

    validate_freqs_array(bg_freqs)
    log_ratio = safe_log(bg_freqs) - safe_log(self.array)
    terms = bg_freqs * log_ratio
    return self.template.wrap(terms)
def test_safe_log(self):
    """safe_log: should handle pos/neg/zero/empty arrays"""
    # (comparison function, input, expected result), run in order
    well_defined = (
        # normal valid array
        (
            assert_equal,
            array([[4, 0, 8], [2, 16, 4]]),
            array([[2, 0, 3], [1, 4, 2]]),
        ),
        # input integers, output floats
        (assert_allclose, array([1, 2, 3]), array([0, 1, 1.5849625])),
        # just zeros
        (assert_equal, array([[0, 0], [0, 0]]), array([[0, 0], [0, 0]])),
    )
    for compare, given, wanted in well_defined:
        compare(safe_log(given), wanted)

    # negative number
    with self.assertRaises(FloatingPointError):
        safe_log(array([0, 3, -4]))

    # empty array, then double empty array
    for make_empty in (lambda: array([]), lambda: array([[]])):
        assert_equal(safe_log(make_empty()), make_empty())