def phoc_list(self):
    """Encode every word of this dataset.

    :return: list of PHOC vectors, one per entry of ``self.word_list``
    """
    encodings = []
    for entry in self.word_list:
        encodings.append(
            phoc_util.phoc(word=entry, alphabet=self.alphabet, levels=self.phoc_levels))
    return encodings
def words(self, words):
    """Replace the stored words; the cached PHOC matrix is refreshed as well.

    :param words: iterable of new words
    """
    self._words = list(words)
    # re-encode every word and keep the result as a float matrix
    encoded = [
        phoc_util.phoc(word=entry,
                       alphabet=self.__alphabet,
                       levels=self.__phoc_level).astype(float)
        for entry in self._words
    ]
    self.phoc = encoded
    self.phoc = np.array(self.phoc)
def words(self, new_words): """ Setting the lexicon requires calculating the respective PHOC. As this estimator is solely based on the distances to the lexicons PHOC, we have to ensure their sanity and warn, if impurities should occur. .. note:: self.ambiguous contains problematic words. :param new_words: New lexicon """ # updating the lexicon self.__words = np.array(list(set(new_words))) # updating the PHOC self.phoc = [phoc_util.phoc(word=w, alphabet=self.__alphabet, levels=self.__phoc_level) for w in self.__words] # eliminating zero PHOC vectors (those would inherently be the nearest neighbour for the cosine distance and we # we shall only consider words that we can generate a representation for) self.phoc = np.array(self.phoc) sums = self.phoc.sum(axis=1) if any(sums == 0): warnings.warn('{} zero phocs\n{}'.format((sums == 0).sum(), self.__words[sums==0])) self.phoc = self.phoc[sums > 0] self.__words = self.__words[sums > 0] # checking for ambiguous PHOC and warn same_taken = np.zeros(self.phoc.shape[0], dtype=int) same_pairs = [] same = 0 for i in range(len(self.phoc)-1): zs = np.zeros(i + 1, dtype=int) tail = np.array(list(map(all, self.phoc[i] == self.phoc[i + 1:]))).astype(int) tmp_same = np.concatenate([zs, tail]) tmp_same -= same_taken tmp_same[tmp_same < 0] = 0 # pairs of words with identical PHOC if tmp_same.sum() > 0: same += 1 same_pairs.append((self.__words[i], self.__words[tmp_same.astype(bool)])) same_taken += tmp_same same_taken[same_taken > 1] = 1 # gathering ambiguous PHOC if same > 0: warnings.warn('{} same phocs out of {}\n{}'.format(same, len(self.phoc), same_pairs)) # gathering ambiguous words, if this set is to large, you might want to use deeper PHOC (more levels) for w, pair in same_pairs: self.ambiguous.append(w) for v in pair: self.ambiguous.append(v) self.ambiguous = list(set(self.ambiguous)) # discarding unambiguous PHOC if desired if self.unambiguous: self.__words = self.__words[~same_taken.astype(bool)] self.phoc = 
self.phoc[~same_taken.astype(bool)]
def words(self, new_words):
    """Swap in a new lexicon and rebuild the PHOC table to match it.

    :param new_words: new lexicon
    """
    encodings = []
    for entry in new_words:
        encodings.append(phoc_util.phoc(
            word=entry, alphabet=self.__alphabet, levels=self.__phoc_level))
    self.train_data = encodings
    self._words = new_words
def phoc(self, idx):
    """Generate the PHOC for one sample.

    The encoding is controlled by ``self.alphabet`` and ``self.phoc_levels``
    (see also :func:`src.util.phoc_util.phoc`).

    :param idx: index of the item
    :return: PHOC of the item's transcript
    """
    return phoc_util.phoc(self.transcript(idx),
                          alphabet=self.alphabet,
                          levels=self.phoc_levels)
def __init__(self, words, phoc_level=phoc_util.DEFAULT_PHOC_LEVELS, alphabet=None):
    """
    train_data is initialized with the PHOC encodings of the handed words;
    the words themselves are stored separately.

    :param words: words in dictionary
    :param phoc_level: levels of PHOC-encoding
    :param alphabet: alphabet used for PHOC (see :class:`phoc_util.Alphabet`);
        defaults to lower-case letters, digits and punctuation
    """
    # NOTE(review): the previous signature used a mutable list as the default
    # argument, which is shared across all calls; a None sentinel with the same
    # effective default is backward-compatible and avoids accidental sharing.
    if alphabet is None:
        alphabet = [Alphabet.ASCII_LOWER, Alphabet.ASCII_DIGITS, Alphabet.ASCII_PUNCTUATION]
    self.__phoc_level = phoc_level
    self.__alphabet = alphabet
    super().__init__(words,
                     [phoc_util.phoc(word=w,
                                     alphabet=self.__alphabet,
                                     levels=self.__phoc_level)
                      for w in words])
    self.words = words
def test_phoc(self):
    """phoc_util.phoc must match a hand-built 2-level PHOC (dtype and values)."""
    alphabet = [phoc_util.Alphabet.ASCII_LOWER,
                phoc_util.Alphabet.ASCII_UPPER,
                phoc_util.Alphabet.ASCII_DIGITS,
                phoc_util.Alphabet.ASCII_PUNCTUATION]
    chars = phoc_util.alphabet_chars(alphabet)
    levels = 2
    word = 'aAzZ19.,'

    def occurrence(segment):
        # one-hot occurrence vector over the alphabet for a word segment
        vec = np.zeros(len(chars), dtype=np.uint8)
        for ch in segment:
            vec[chars.index(ch)] = 1
        return vec

    # building PHOC manually: whole word, then first and second half
    half = int(len(word) / 2)
    expected = np.concatenate((occurrence(word),
                               occurrence(word[:half]),
                               occurrence(word[half:])))
    # test
    actual = phoc_util.phoc(word=word, alphabet=alphabet, levels=levels)
    self.assertEqual(expected.dtype, actual.dtype)
    np.testing.assert_array_equal(expected, actual)
def setUp(self):
    """Create a small lexicon, its PHOC encodings and an RCCA estimator fixture."""
    self.words = ['cat', 'dog', 'fox']
    self.phocs = []
    for word in self.words:
        self.phocs.append(phoc(word))
    self.rcca = cca.RCCAEstimator(self.words)
def test_estimate(self):
    """estimate_set on a known word's PHOC must return exactly that word."""
    # estimate and check for results
    expected_words = ['cat']
    queries = [phoc('cat')]
    self.assertEqual(self.prm.estimate_set(queries), expected_words)