예제 #1
0
 def phoc_list(self):
     """
     Encode every entry of ``self.word_list`` with ``phoc_util.phoc``, using
     ``self.alphabet`` and ``self.phoc_levels``.

     :return: list of PHOC in this dataset, in the order of ``self.word_list``
     """
     encoded = []
     for entry in self.word_list:
         vector = phoc_util.phoc(word=entry,
                                 alphabet=self.alphabet,
                                 levels=self.phoc_levels)
         encoded.append(vector)
     return encoded
 def words(self, words):
     """Store the lexicon and refresh the list of respective PHOC as well.

     The words are copied into a list (``self._words``); ``self.phoc`` becomes
     a numpy array holding one float PHOC per stored word.
     """
     self._words = list(words)
     float_phocs = []
     for entry in self._words:
         vector = phoc_util.phoc(word=entry,
                                 alphabet=self.__alphabet,
                                 levels=self.__phoc_level)
         float_phocs.append(vector.astype(float))
     self.phoc = np.array(float_phocs)
예제 #3
0
    def words(self, new_words):
        """
        Set the lexicon and recalculate the respective PHOC.

        As this estimator is solely based on the distances to the lexicon's PHOC, we have to ensure their sanity
        and warn if impurities occur:

        * duplicate words are removed (via ``set``, so the input order is not preserved),
        * words whose PHOC sums to zero are dropped with a warning,
        * words sharing an identical PHOC are reported with a warning and collected in ``self.ambiguous``,
        * if ``self.unambiguous`` is truthy, all but the first word of each group of identical PHOC are dropped.

        .. note::

            self.ambiguous contains problematic words. It is appended to, not reset, so it has to exist
            before this setter runs (presumably created by the constructor -- verify).

        :param new_words: New lexicon (iterable of words)
        """
        # updating the lexicon (duplicates removed, stored as numpy array to allow boolean-mask indexing below)
        self.__words = np.array(list(set(new_words)))
        # updating the PHOC: one vector per remaining word
        self.phoc = [phoc_util.phoc(word=w, alphabet=self.__alphabet, levels=self.__phoc_level)
                      for w in self.__words]
        # eliminating zero PHOC vectors (those would inherently be the nearest neighbour for the cosine distance and
        # we shall only consider words that we can generate a representation for)
        self.phoc = np.array(self.phoc)
        # row sums; sum(axis=1) requires a 2-D array, i.e. at least one word
        sums = self.phoc.sum(axis=1)
        if any(sums == 0):
            warnings.warn('{} zero phocs\n{}'.format((sums == 0).sum(), self.__words[sums==0]))
        # words and PHOC are filtered with the same mask, so they stay aligned
        self.phoc = self.phoc[sums > 0]
        self.__words = self.__words[sums > 0]
        # checking for ambiguous PHOC and warn
        # same_taken[j] == 1 once word j has been found identical to an earlier word
        same_taken = np.zeros(self.phoc.shape[0], dtype=int)
        # (word, array of later words with the same PHOC) tuples
        same_pairs = []
        # number of groups of identical PHOC found
        same = 0
        for i in range(len(self.phoc)-1):
            # positions 0..i are never flagged by iteration i (word i itself included)
            zs = np.zeros(i + 1, dtype=int)
            # 1 for every later row that equals row i in all components
            tail = np.array(list(map(all, self.phoc[i] == self.phoc[i + 1:]))).astype(int)
            tmp_same = np.concatenate([zs, tail])
            # ignore rows already claimed by an earlier word, so each group is reported only once
            tmp_same -= same_taken
            tmp_same[tmp_same < 0] = 0
            # pairs of words with identical PHOC
            if tmp_same.sum() > 0:
                same += 1
                same_pairs.append((self.__words[i], self.__words[tmp_same.astype(bool)]))
            # remember the newly claimed rows; keep the flags binary
            same_taken += tmp_same
            same_taken[same_taken > 1] = 1
        # warning about ambiguous PHOC
        if same > 0:
            warnings.warn('{} same phocs out of {}\n{}'.format(same, len(self.phoc), same_pairs))
        # gathering ambiguous words; if this set is too large, you might want to use deeper PHOC (more levels)
        for w, pair in same_pairs:
            self.ambiguous.append(w)
            for v in pair:
                self.ambiguous.append(v)
        # remove duplicates from the collected words
        self.ambiguous = list(set(self.ambiguous))
        # discarding ambiguous PHOC if desired: only the rows flagged in same_taken (the later duplicates) are
        # removed, the first word of each group is kept
        if self.unambiguous:
            self.__words = self.__words[~same_taken.astype(bool)]
            self.phoc = self.phoc[~same_taken.astype(bool)]
    def words(self, new_words):
        """
        Replace the lexicon; the PHOC table has to be updated along with it.

        ``self.train_data`` receives one PHOC per entry of ``new_words`` and
        ``self._words`` receives ``new_words`` itself (not a copy).

        :param new_words:  new lexicon
        """
        # the PHOC table is rebuilt first, the lexicon is stored afterwards
        phoc_table = []
        for entry in new_words:
            phoc_table.append(
                phoc_util.phoc(word=entry,
                               alphabet=self.__alphabet,
                               levels=self.__phoc_level))
        self.train_data = phoc_table
        self._words = new_words
예제 #5
0
    def phoc(self, idx):
        """
        Build the PHOC of the transcript stored for ``idx``.

        The encoding depends on ``self.alphabet`` and ``self.phoc_levels``
        (see also :func:`src.util.phoc_util.phoc`).

        :param idx: index of item
        :return: respective PHOC
        """
        text = self.transcript(idx)
        return phoc_util.phoc(text,
                              alphabet=self.alphabet,
                              levels=self.phoc_levels)
    def __init__(self,
                 words,
                 phoc_level=phoc_util.DEFAULT_PHOC_LEVELS,
                 alphabet=None):
        """
        Set up the estimator from a lexicon.

        The PHOC encodings of the handed words are passed to the parent
        constructor as its second argument; the words themselves are assigned
        to ``self.words`` afterwards.

        :param words: words in dictionary
        :param phoc_level: levels of PHOC-encoding
        :param alphabet: alphabet used for PHOC (see :class:`phoc_util.Alphabet`);
                         ``None`` (default) selects ASCII lower-case letters,
                         digits and punctuation
        """
        # The default is built per call: a list literal in the signature would
        # be created once and shared (and mutable) across all instances.
        if alphabet is None:
            alphabet = [
                Alphabet.ASCII_LOWER, Alphabet.ASCII_DIGITS,
                Alphabet.ASCII_PUNCTUATION
            ]
        self.__phoc_level = phoc_level
        self.__alphabet = alphabet
        super().__init__(words, [
            phoc_util.phoc(
                word=w, alphabet=self.__alphabet, levels=self.__phoc_level)
            for w in words
        ])
        self.words = words
 def test_phoc(self):
     """Compare ``phoc_util.phoc`` (levels=2) with a manually built vector.

     The expected vector concatenates the character-occurrence vector of the
     whole word with those of its first and second half.
     """
     alphabet = [
         phoc_util.Alphabet.ASCII_LOWER,
         phoc_util.Alphabet.ASCII_UPPER,
         phoc_util.Alphabet.ASCII_DIGITS,
         phoc_util.Alphabet.ASCII_PUNCTUATION,
     ]
     chars = phoc_util.alphabet_chars(alphabet)
     word = 'aAzZ19.,'
     half = len(word) // 2

     def occurrence_vector(text):
         # one slot per alphabet character, set to 1 if it occurs in text
         vector = np.zeros(len(chars), dtype=np.uint8)
         for symbol in text:
             vector[chars.index(symbol)] = 1
         return vector

     # building PHOC manually: whole word, first half, second half
     whole = occurrence_vector(word)
     first_half = occurrence_vector(word[:half])
     second_half = occurrence_vector(word[half:])
     expected = np.concatenate((whole, first_half, second_half))
     # test
     actual = phoc_util.phoc(word=word, alphabet=alphabet, levels=2)
     self.assertEqual(expected.dtype, actual.dtype)
     np.testing.assert_array_equal(expected, actual)
 def setUp(self):
     """Create a three-word lexicon, its PHOC and an RCCA estimator built from the lexicon."""
     lexicon = ['cat', 'dog', 'fox']
     self.words = lexicon
     self.phocs = list(map(phoc, lexicon))
     self.rcca = cca.RCCAEstimator(lexicon)
예제 #9
0
 def test_estimate(self):
     """Estimating the PHOC of 'cat' must give back the word 'cat'."""
     # estimate and check for results
     phoc_queries = [phoc('cat')]
     expected_words = ['cat']
     estimated = self.prm.estimate_set(phoc_queries)
     self.assertEqual(estimated, expected_words)