Example #1
    def decode(self, ciphertext):
        try:
            if self.cipher_mode == AES.MODE_EAX:
                cipher = AES.new(key=self.key,
                                 mode=self.cipher_mode,
                                 nonce=self.nonce)
                cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
                data = cipher.decrypt(cipherbytes)
            elif self.cipher_mode == AES.MODE_CBC:
                cipher = AES.new(key=self.key,
                                 mode=self.cipher_mode,
                                 iv=self.nonce)
                cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
                data = cipher.decrypt(cipherbytes)
                Log.debugdebug(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Decrypted data length = ' + str(len(data)) +
                    ', modulo 16 = ' + str(len(data) % 16))
                # Strip the padding: the last byte encodes the number of padding bytes
                data = data[:-data[-1]]
            else:
                raise Exception(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Unsupported mode "' + str(self.cipher_mode) + '".')

            return str(data, encoding=STR_ENCODING)
        except Exception as ex:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Error decoding data "' + str(ciphertext) + '" using AES. Exception: ' + str(ex)
            Log.error(errmsg)
            raise Exception(errmsg)
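A minimal standalone sketch of the CBC padding convention assumed by the MODE_CBC branch above (PKCS#7-style; names here are illustrative, not from the library): each plaintext is padded with pad_len bytes of value pad_len, so decode() can simply strip data[-1] bytes.

BLOCK_SIZE = 16

def pad(data: bytes) -> bytes:
    # pad_len is always 1..16, never 0, so the last byte always encodes the pad length
    pad_len = BLOCK_SIZE - (len(data) % BLOCK_SIZE)
    return data + bytes([pad_len]) * pad_len

def unpad(data: bytes) -> bytes:
    # The same operation as data[:-data[-1]] in decode() above
    return data[:-data[-1]]

assert unpad(pad(b'hello world')) == b'hello world'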
Example #2
 def check_if_model_updated(self):
     updated_time = os.path.getmtime(self.fpath_updated_file)
     Log.debugdebug(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Model identifier "' +
         str(self.identifier_string) + '" last updated time ' +
         str(self.model_updated_time) + ', updated "' + str(updated_time) +
         '".')
     if (updated_time > self.model_updated_time):
         Log.important(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Model update time for identifier "' +
             str(self.identifier_string) + '" - "' +
             str(datetime.fromtimestamp(updated_time)) +
             '" is newer than "' +
             str(datetime.fromtimestamp(self.model_updated_time)) +
             '". Reloading model...')
         try:
             self.mutex_training.acquire()
             # Reset model flags to not ready
             self.model_loaded = False
             self.model_updated_time = updated_time
         finally:
             self.mutex_training.release()
         return True
     else:
         return False
Example #3
    def rank_sorted_list_by_unique_items(sorted_list):
        cntr = Counter(sorted_list)
        max_item_count = max(cntr.values())
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Max unique item count = ' + str(max_item_count))

        #
        # An alternative to computing the max item count above is to loop until the
        # ranking list stops changing. However, that can be expensive if the given list
        # is huge, so we prefer to know in advance the maximum number of passes to run.
        #
        len_list = len(sorted_list)
        # Start with 1 rank
        item_rank = np.array([1] * len_list)
        for i in range(max_item_count):
            shift = i + 1
            # Shift down
            sorted_list_shift = np.append(np.array(shift * [None]),
                                          sorted_list[0:(len_list - shift)],
                                          axis=0)
            Log.debugdebug(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ' Shift #' +
                str(shift) + ': ' + str(sorted_list_shift))
            # If the item equals the item <shift> positions before it and its current rank equals <shift>, increment its rank
            condition = (sorted_list == sorted_list_shift) & (item_rank
                                                              == shift)
            item_rank[condition] = item_rank[condition] + 1
        return item_rank.tolist()
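A short usage sketch of the shift-and-compare ranking above, rewritten standalone so it can be run directly (condensed but following the same logic; names are illustrative):

import numpy as np
from collections import Counter

def rank_sorted(sorted_list):
    # Rank equal items 1, 2, 3, ... within each run of a sorted list
    max_count = max(Counter(sorted_list).values())
    n = len(sorted_list)
    rank = np.array([1] * n)
    for shift in range(1, max_count + 1):
        shifted = np.append(np.array([None] * shift), sorted_list[:n - shift], axis=0)
        cond = (np.array(sorted_list) == shifted) & (rank == shift)
        rank[cond] += 1
    return rank.tolist()

print(rank_sorted(['a', 'a', 'b', 'c', 'c', 'c']))   # [1, 2, 1, 1, 2, 3]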
Example #4
    def xor_string(self, s1, s2):
        Log.debug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': XOR between "' + str(s1) + '" and "' + str(s2) + '".')

        len_s1 = len(s1)
        len_s2 = len(s2)
        len_max = max(len(s1), len(s2))

        # Append to the shorter one, in a repeat manner
        for i in range(len(s1), len_max, 1):
            s1 += s1[(i - len_s1)]
        for i in range(len(s2), len_max, 1):
            s2 += s2[(i - len_s2)]

        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': After appending, XOR between "' + str(s1) + '" and "' +
            str(s2) + '".')

        Log.debugdebug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': s1 "' + str(s1) +
            '", s2 "' + str(s2) + '"')

        b1 = bytes(s1, encoding=Obfuscate.STRING_ENCODING)
        b2 = bytes(s2, encoding=Obfuscate.STRING_ENCODING)

        bytes_xor = self.xor_bytes(b1=b1, b2=b2)

        return bytes_xor
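A standalone sketch of the same idea (repeat the shorter string cyclically to the longer length, then XOR byte by byte); the function name and encoding here are illustrative, not from the library:

def xor_string_sketch(s1, s2, encoding='utf-8'):
    len_max = max(len(s1), len(s2))
    # Repeat each string cyclically until it reaches len_max
    s1 = (s1 * (len_max // len(s1) + 1))[:len_max]
    s2 = (s2 * (len_max // len(s2) + 1))[:len_max]
    return bytes(a ^ b for a, b in zip(bytes(s1, encoding), bytes(s2, encoding)))

print(xor_string_sketch('abc', 'k').hex())   # '0a0908'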
Example #5
    def __loss(
            self,
            # hidden values
            h,
            # observed values
            o,
            h_trns_prob_matrix,
            o_emis_prob_matrix,
            # for debugging
            info_i = -1,
            info_j = -1,
    ):
        assert len(h) == len(o)
        assert len(h) > 0

        ml = 0
        for i in range(1, len(h), 1):
            ml_part = - np.log(h_trns_prob_matrix[h[i-1], h[i]]) - np.log(o_emis_prob_matrix[h[i], o[i]])
            if (h[i], h[i-1]) == (info_j,info_i):
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + '; p(' + str(info_i) + ',' + str(info_j) + ') = ' + str(h_trns_prob_matrix[h[i-1], h[i]])
                )
            ml += ml_part

        return ml
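A tiny numeric illustration of the same negative log likelihood: the loss sums -log P(h[i] | h[i-1]) - log P(o[i] | h[i]) over the sequence (the matrices below are illustrative):

import numpy as np

h_trns = np.array([[0.9, 0.1],
                   [0.2, 0.8]])    # transition probabilities P(h[i] | h[i-1])
o_emis = np.array([[0.7, 0.3],
                   [0.4, 0.6]])    # emission probabilities P(o[i] | h[i])
h = [0, 0, 1]                      # hidden states
o = [0, 1, 1]                      # observations
ml = sum(-np.log(h_trns[h[i-1], h[i]]) - np.log(o_emis[h[i], o[i]]) for i in range(1, len(h)))
print(ml)   # -log(0.9) - log(0.3) - log(0.1) - log(0.6) ≈ 4.12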
Example #6
    def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_JAP,
    ):
        try:
            import nagisa
        except Exception as ex:
            raise Exception(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unable to load nagisa: ' + str(ex))
        if type(word_list) in [list, tuple]:
            text = ' '.join(word_list)
        else:
            text = word_list
        words_postags_obj = nagisa.tagging(text)
        txt_sym_tok = words_postags_obj.words
        txt_sym_postags = words_postags_obj.postags
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Japanese segmentation ' + str(txt_sym_tok) +
            ', word & POS tags: ' + str(txt_sym_postags))

        words_postags = list(zip(txt_sym_tok, txt_sym_postags))
        sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': POS TAGs: ' + str(words_postags))
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Filtered sentence: ' + str(sent_filtered))
        return sent_filtered
Example #7
 def set_feature_weights(self, fw):
     self.fv_weights = np.array(fw)
     Log.debugdebug(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
         + ' Feature weights set to ' + str(self.fv_weights) + '.'
     )
     return
Example #8
 def verify_totp_style(
         self,
         # We test for <tolerance_secs> back
         tolerance_secs=30):
     now = datetime.now()
     try:
         for i in range(tolerance_secs):
             t_test = now - timedelta(seconds=i)
             Log.debugdebug(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) + ': Trying ' +
                 str(t_test.strftime('%Y-%m-%d %H:%M:%S')))
             test_challenge_calc = AccessTokenSharedsecretChallenge.create_totp_style_challenge_response(
                 shared_secret=self.shared_secret,
                 datetime_val=t_test,
                 algo_hash=self.algo_hash)
             res = self.__compare_test_challenge(
                 test_challenge_calc=test_challenge_calc)
             if res == True:
                 return res
         return False
     except Exception as ex:
         Log.error(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Exception for shared secret "' + str(self.shared_secret) +
             '", totp style test challenge "' + str(self.test_challenge) +
             '": ' + str(ex))
         return False
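A standalone sketch of the "try every second within the tolerance window" idea above; the challenge format (SHA-256 over secret + timestamp) is a hypothetical stand-in, not the library's actual scheme:

import hashlib
from datetime import datetime, timedelta

def make_challenge(secret, t):
    return hashlib.sha256((secret + t.strftime('%Y-%m-%d %H:%M:%S')).encode()).hexdigest()

def verify_totp_style_sketch(secret, test_challenge, tolerance_secs=30):
    now = datetime.now()
    # Accept the challenge if it matches any of the last <tolerance_secs> seconds
    return any(make_challenge(secret, now - timedelta(seconds=i)) == test_challenge
               for i in range(tolerance_secs))

print(verify_totp_style_sketch('shared-secret', make_challenge('shared-secret', datetime.now())))   # True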
Example #9
    def test_textcluster_english(self):
        res = ut.ResultObj(count_ok=0, count_fail=0)
        lang = lf.LangFeatures.LANG_EN

        #
        # We take a few news articles and try to automatically classify sentences belonging to the same news article.
        # This example demonstrates the need for root word extraction, which will increase accuracy significantly.
        #
        text = [
            # Article 1
            'Freezing temperatures have gripped the nation, making Wednesday the coldest day yet this winter.',
            'Morning lows plunged to minus 16-point-three degrees Celsius in Seoul , the lowest to be posted during this year’s cold season.',
            'As of 7 a.m. Wednesday , morning lows stood at minus 15-point-four degrees in Daejeon , nearly minus 22 degrees in the Daegwallyeong mountain pass in Pyeongchang and minus 14 degrees in Gangneung.',
            'Due to the wind chill factor, temperatures stood at nearly minus 23 degrees in Seoul , minus 25 in Incheon and roughly minus 36 degrees in Daegwallyeong .',
            'An official of the Korea Meteorological Administration said the nation will continue to see subzero temperatures for the time being with the central regions and some southern inland areas projected to see morning lows plunge below minus 15 degrees',
            'Currently , a cold wave warning is in place for Seoul , Incheon , Daejeon and Sejong as well as the provinces of Gangwon , Chungcheong , North Jeolla and North Gyeongsang.',
            # Article 2
            'There are two primary motivations for keeping Bitcoin\'s inventor keeping his or her or their identity secret.',
            'One is privacy. As Bitcoin has gained in popularity – becoming something of a worldwide phenomenon – Satoshi Nakamoto would likely garner a lot of attention from the media and from governments.',
            'The other reason is safety. Looking at 2009 alone , 32,489 blocks were mined; at the then-reward rate of 50 BTC per block, the total payout in 2009 was 1,624,500 BTC, which at today’s prices is over $900 million.',
            'One may conclude that only Satoshi and perhaps a few other people were mining through 2009, and that they possess a majority of that $900 million worth of BTC.',
            'Someone in possession of that much BTC could become a target of criminals, especially since bitcoins are less like stocks and more like cash, where the private keys needed to authorize spending could be printed out and literally kept under a mattress.',
            'While it\'s likely the inventor of Bitcoin would take precautions to make any extortion-induced transfers traceable, remaining anonymous is a good way for Satoshi to limit exposure.',
            # Article 3
            'Some of these models of concurrency are primarily intended to support reasoning and specification, while others can be used through the entire development cycle, including design, implementation, proof, testing and simulation of concurrent systems',
            'The proliferation of different models of concurrency has motivated some researchers to develop ways to unify these different theoretical models.',
            'The Concurrency Representation Theorem in the actor model provides a fairly general way to represent concurrent systems that are closed in the sense that they do not receive communications from outside.'
        ]

        # stopwords are not needed!!! cool!!!!
        text_tag = []
        for sent in text:
            sent_new = self.stopwordtags.filter_sentence_by_pos_tag_english(
                word_list=sent)
            text_tag.append(sent_new)

        text_sentences_arr = self.txt_preprocessor.preprocess_list_all_langs(
            sentences_list=text_tag)
        Log.debugdebug('PRE-PROCESSED ' + str(lang) + ' SENTENCES:\n\r' +
                       str(text_sentences_arr))

        # This example is too small in sample size to weigh by IDF (which will instead lower the accuracy)
        # do_clustering(text=text, stopwords=stopwords, ncenters=3, freq_measure='tf', weigh_idf=False, verbose=0)
        res_cluster = self.do_clustering(
            text=text_sentences_arr,
            ncenters=3,
            expected_clusters=((0, 1, 2, 3, 4, 5), (6, 7, 8, 9, 10, 11),
                               (12, 13, 14)),
            test_threshold_inside=0.3,
            test_threshold_outside=0.7,
            stopwords_list=None,
            freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
            test_description=str(lang) + ' normalized, no IDF',
        )
        res.update(other_res_obj=res_cluster)
        return res
Example #10
    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #11
 def __compare_test_challenge(self, test_challenge_calc):
     if test_challenge_calc != self.test_challenge:
         Log.debugdebug(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Test Challenge Fail. Challenge string "' +
             str(self.challenge) + '". Test Challenge Calculated "' +
             str(test_challenge_calc) + '", test challenge given "' +
             str(self.test_challenge))
         return False
     return True
Example #12
 def reconstruct_check(self, sent_vec, keywords_list):
     Log.debugdebug(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Reconstructing ' +
         str(sent_vec) + ' from keywords ' + str(keywords_list))
     s_reconstruct_arr = []
     for j in range(len(sent_vec)):
         freq = sent_vec[j]
         while freq > 0:
             s_reconstruct_arr.append(keywords_list[j])
             freq = freq - 1
     return s_reconstruct_arr
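A worked example of the reconstruction above: given a frequency vector and the keyword list it was built from, the sentence is recovered as a bag of words (values below are illustrative):

sent_vec = [2, 0, 1]
keywords_list = ['apple', 'banana', 'cherry']
s_reconstruct_arr = []
for j, freq in enumerate(sent_vec):
    s_reconstruct_arr.extend([keywords_list[j]] * freq)
print(s_reconstruct_arr)   # ['apple', 'apple', 'cherry']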
Example #13
    def calculate(self):
        self.x_shape = self.x.shape

        # How many elements altogether
        self.x_elements_count = np.prod(self.x_shape)
        self.x_dim = len(self.x_shape)

        if self.x_elements_count == 0:
            return np.array([np.nan] * self.x_dim)

        # No negative numbers
        assert np.min(self.x) >= 0
        assert self.x_dim > 0
        assert self.x_elements_count > 0

        # Keep the dimension coordinates here
        self.x_coordinates = np.zeros(shape=[self.x_dim] +
                                      [self.x_elements_count])

        # For example if x has shape (4,3,2), this number will start with 4*3*2 = 24
        repeat_times = self.x_elements_count
        for dim in range(self.x_dim):
            # For example first dimension will have a scalar repeat 3*2=6 times (0,0,0,0,0,0,1,1,1,1,1,1,..)
            # as each row will have 6 elements in total,
            # 2nd dimension will repeat 2 times (0,0,1,1,2,2,..) as each row will have 2 elements in total
            repeat_times = repeat_times / self.x_shape[dim]
            # Each number 0, 1, 2, ... is repeated by the number of times
            dim_coor = np.array(list(range(
                self.x_elements_count))) // repeat_times
            # Modulo the dimension length
            dim_coor = dim_coor % self.x_shape[dim]
            self.x_coordinates[dim, ] = dim_coor

        # Reshape so that the dimensions after the first one is equal to the shape of x
        self.x_coordinates = np.reshape(self.x_coordinates,
                                        newshape=[self.x_dim] +
                                        list(self.x_shape))

        Log.debugdebug('Coordinates of x by dimension:\n\r' +
                       str(self.x_coordinates))

        cm = np.zeros(shape=[self.x_dim])
        for dim in range(self.x_dim):
            if np.sum(self.x) > 0:
                cm[dim] = np.sum(self.x_coordinates[dim] * self.x) / np.sum(
                    self.x)
            else:
                ones_arr = np.ones(shape=self.x_shape)
                cm[dim] = np.sum(
                    self.x_coordinates[dim] * ones_arr) / np.sum(ones_arr)
        return cm
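A compact standalone sketch of the same centre-of-mass computation, using np.indices to generate the per-dimension coordinate grids as an alternative to the manual repeat/modulo construction above (illustrative only):

import numpy as np

def center_of_mass_sketch(x):
    coords = np.indices(x.shape)     # shape (ndim, *x.shape), one coordinate grid per dimension
    total = np.sum(x)
    if total <= 0:
        # Fall back to the geometric centre when all weights are zero
        x = np.ones_like(x, dtype=float)
        total = x.size
    return np.array([np.sum(coords[d] * x) / total for d in range(x.ndim)])

print(center_of_mass_sketch(np.array([[0, 0], [0, 4]])))   # [1. 1.]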
Example #14
    def hash_compression(
            self,
            s,
            # By default we return the original hash
            desired_byte_length=32):
        if desired_byte_length % 4 != 0:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Desired byte length must be 0 modulo-4, given = ' +
                str(desired_byte_length))

        m = hashlib.sha256()
        m.update(bytes(s, encoding=Obfuscate.STRING_ENCODING))
        # This will return a bytes list of length 32
        h = m.digest()
        if len(h) % 4 != 0:
            raise Exception(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Hash bytes length must be 0 modulo-4, got = ' + str(h))

        # Example: to compress the 32-byte hash down to 8 bytes, we break it
        # into 4 blocks of 8 bytes each and XOR them together

        # Number of blocks, e.g. 4 when compressing 32 bytes down to 8
        n_blocks = int(len(h) / desired_byte_length)
        # Block length in bytes, e.g. 8
        block_len = int(len(h) / n_blocks)
        Log.debugdebug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Number of blocks = ' + str(n_blocks) + ', block length = ' +
            str(block_len))

        # First block
        bytes_xor = h[0:block_len]
        for i in range(1, n_blocks, 1):
            idx_start = i * block_len
            idx_end = (i + 1) * block_len
            cur_block = h[idx_start:idx_end]
            if len(bytes_xor) != len(cur_block):
                raise Exception(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Different block lengths "' + str(bytes_xor) +
                    '", and "' + str(cur_block) + '"')
            bytes_xor = self.xor_bytes(b1=bytes_xor, b2=cur_block)

        return bytes_xor
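A standalone sketch of the same fold-and-XOR compression for the non-default case (e.g. compressing the 32-byte SHA-256 digest to 8 bytes); names are illustrative, not the library API:

import hashlib

def hash_compress_sketch(s, desired_byte_length=8):
    h = hashlib.sha256(s.encode('utf-8')).digest()          # 32 bytes
    n_blocks = len(h) // desired_byte_length
    out = h[0:desired_byte_length]
    # XOR the remaining blocks into the first one
    for i in range(1, n_blocks):
        block = h[i * desired_byte_length:(i + 1) * desired_byte_length]
        out = bytes(a ^ b for a, b in zip(out, block))
    return out

print(hash_compress_sketch('hello').hex())   # 16 hex characters = 8 bytes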
Example #15
    def transform_input_for_model(
        self,
        # This should be a list of words as a sentence
        x_input,
        word_freq_model=FeatureVector.COL_FREQUENCY,
    ):
        #
        # This could be numbers, words, etc.
        #
        features_model = list(self.get_model_features())
        # Log.debug(
        #    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        #    + ': Predicting v = ' + str(v_feature_segmented)
        #    + ' using model features:\n\r' + str(features_model)
        # )

        #
        # Convert sentence to a mathematical object (feature vector)
        #
        model_fv = FeatureVector()
        model_fv.set_freq_feature_vector_template(list_symbols=features_model)

        # Get feature vector of text
        try:
            df_fv = model_fv.get_freq_feature_vector(text_list=x_input)
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Exception occurred calculating FV for "' + str(x_input) \
                     + '": Exception "' + str(ex) \
                     + '\n\rUsing FV Template:\n\r' + str(model_fv.get_fv_template()) \
                     + ', FV Weights:\n\r' + str(model_fv.get_fv_weights())
            Log.critical(errmsg)
            raise Exception(errmsg)

        # This creates a single row matrix that needs to be transposed before matrix multiplications
        # ndmin=2 will force numpy to create a 2D matrix instead of a 1D vector
        # For now we make it 1D first
        assert word_freq_model in df_fv.columns, '"' + str(
            word_freq_model) + '" must be in ' + str(df_fv.columns)
        fv_text_1d = np.array(df_fv[word_freq_model].values, ndmin=1)
        if fv_text_1d.ndim != 1:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Expected a 1D vector, got ' + str(fv_text_1d.ndim) + 'D!')
        Log.debugdebug(fv_text_1d)

        x_transformed = npUtil.NumpyUtil.convert_dimension(arr=fv_text_1d,
                                                           to_dim=2)
        return x_transformed
Example #16
    def get_pct_intersection_with_common_words(
            self,
            word_list,
            # In the case of Vietnamese, we might have to form words from the syllables
            max_word_n_tuple = 1
    ):
        if max_word_n_tuple == 1:
            lang_intersection = set(word_list).intersection(self.get_common_words())
            pct_intersection = len(lang_intersection) / len(set(word_list))
        else:
            # Means we are looking not just at the current token, but form a word from
            # continuous tokens up to max_word_n_tuple (usually not more than 2)
            len_word_list = len(word_list)
            count_int = 0
            cur_index = 0
            actual_word_count = 0
            # Loop by each token in the word list (or rather token list)
            while cur_index < len_word_list:
                max_n_tuple_lookforward = min(max_word_n_tuple, len_word_list-cur_index)
                for j in range(max_n_tuple_lookforward,0,-1):
                    # Look from j tokens ahead
                    end_index = cur_index+j
                    # For the j-tuple word
                    w = ' '.join(word_list[cur_index:end_index])
                    Log.debugdebug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Test word "' + str(w) + '", cur_index=' + str(cur_index) + ', j=' + str(j))
                    if w in self.get_common_words():
                        count_int += 1
                        # Move forward to the end of the token from the word found
                        cur_index += j-1
                        Log.debugdebug(
                            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                            + ': Found word "' + str(w) + '"')
                        break
                cur_index += 1
                actual_word_count += 1

            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Count Intersection = ' + str(count_int) + ', actual word count = ' + str(actual_word_count)
            )
            pct_intersection = count_int / actual_word_count

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.lang) + '" intersection = ' + str(pct_intersection)
        )
        return pct_intersection
Example #17
 def encode(
         self,
         # bytes format
         data):
     try:
         if self.cipher_mode == AES.MODE_EAX:
             cipher = AES.new(key=self.key,
                              mode=self.cipher_mode,
                              nonce=self.nonce)
             cipherbytes, tag = cipher.encrypt_and_digest(data)
             return AES_Encrypt.EncryptRetClass(
                 cipher_mode=self.cipher_mode_str,
                 ciphertext_b64=b64encode(cipherbytes).decode(
                     self.text_encoding),
                 plaintext_b64=None,
                 tag_b64=b64encode(tag).decode(self.text_encoding),
                 nonce_b64=b64encode(self.nonce).decode(self.text_encoding))
         elif self.cipher_mode == AES.MODE_CBC:
             # Pad length is 1-16, never 0; otherwise the last byte would not encode the pad length
             length = AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC - (
                 len(data) % AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC)
             # Pad the data with the pad-length byte repeated, so that on decryption
             # data[-1] gives the number of padding bytes to strip
             data += bytes(chr(length), encoding=STR_ENCODING) * length
             Log.debugdebug(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Padded length = ' + str(length))
             cipher = AES.new(key=self.key,
                              mode=self.cipher_mode,
                              iv=self.nonce)
             cipherbytes = cipher.encrypt(data)
             return AES_Encrypt.EncryptRetClass(
                 cipher_mode=self.cipher_mode_str,
                 ciphertext_b64=b64encode(cipherbytes).decode(
                     self.text_encoding),
                 plaintext_b64=None,
                 tag_b64=None,
                 nonce_b64=b64encode(self.nonce).decode(self.text_encoding))
         else:
             raise Exception(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Unsupported mode "' + str(self.cipher_mode) + '".')
     except Exception as ex:
         errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error encoding data "' + str(data) + '" using AES. Exception: ' + str(ex)
         Log.error(errmsg)
         raise Exception(errmsg)
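For comparison, a minimal EAX round trip at the raw-bytes level with pycryptodome, showing what the MODE_EAX branch above produces before base64 encoding and how the tag is meant to be verified (assumes pycryptodome is installed):

from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes

key = get_random_bytes(16)
cipher = AES.new(key, AES.MODE_EAX)
ciphertext, tag = cipher.encrypt_and_digest(b'secret data')

decipher = AES.new(key, AES.MODE_EAX, nonce=cipher.nonce)
assert decipher.decrypt_and_verify(ciphertext, tag) == b'secret data'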
Example #18
    def process_noun(self, word):
        l = len(word)

        ces = self.case_endings_by_len[LemmatizerBase.CE_NOUN]

        for i in ces.keys():
            postfix = word[(l - i):l]
            check = postfix in ces[i]
            Log.debugdebug(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Check ' +
                str(check) + ' for "' + str(postfix) + '" in ' + str(ces[i]))
            if check:
                return word[0:(l - i)]
        return None
Example #19
 def filter_sentence_by_pos_tag_english(
     self,
     word_list,
     keep_tags=DEFAULT_KEEP_TAGS_ENG,
 ):
     if type(word_list) is str:
         word_list = word_tokenize(text=word_list, language='english')
     words_postags = pos_tag(word_list)
     sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
     Log.debugdebug(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': POS TAGs: ' + str(words_postags))
     Log.debugdebug(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Filtered sentence: ' + str(sent_filtered))
     return sent_filtered
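A minimal standalone illustration of the same English POS filter (assumes nltk with its tokenizer and tagger data installed; the keep_tags set here is illustrative, DEFAULT_KEEP_TAGS_ENG may differ):

from nltk import pos_tag, word_tokenize

keep_tags = {'NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBZ'}
words_postags = pos_tag(word_tokenize('The quick brown fox jumps over the lazy dog'))
print([w for w, t in words_postags if t in keep_tags])   # nouns and verbs only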
Example #20
    def convert_ascii_string_to_other_alphabet(
        ascii_char_string,
        # Default to CJK Unicode Block
        unicode_range=BLOCK_CHINESE,
        # If the characters come from the hexdigest of a hash, we can compress 4 characters at a time,
        # otherwise for a random ASCII string we can only compress 2 characters into 1 Chinese character.
        group_n_char=2):
        uni_len = unicode_range[1] - unicode_range[0] + 1

        r = len(ascii_char_string) % 4
        if r != 0:
            # Append 0's
            ascii_char_string = ascii_char_string + '0' * (4 - r)
        # raise Exception('Hash length ' + str(len(hash_hex_string))
        #                 + ' for "' + str(hash_hex_string) + '" not 0 modulo-4')

        hash_zh = ''

        len_block = int(len(ascii_char_string) / group_n_char)
        for i in range(0, len_block, 1):
            idx_start = group_n_char * i
            idx_end = idx_start + group_n_char
            s = ascii_char_string[idx_start:idx_end]

            # Convert to Chinese, Korean, etc
            if group_n_char == 2:
                ord_arr = np.array([ord(x) for x in s])
                val = ord_arr * np.array(
                    [2**(8 * (x - 1)) for x in range(len(ord_arr), 0, -1)])
                val = np.sum(val)
                Log.debug('Index start=' + str(idx_start) + ', end=' +
                          str(idx_end) + ', s=' + str(s) + ', ordinal=' +
                          str(ord_arr) + ', val=' + str(hex(val)))
                cjk_unicode = (val % uni_len) + unicode_range[0]
                hash_zh += chr(cjk_unicode)
            elif group_n_char == 4:
                Log.debug('Index start=' + str(idx_start) + ', end=' +
                          str(idx_end) + ', s=' + str(s))
                n = int('0x' + str(s), 16)
                cjk_unicode = (n % uni_len) + unicode_range[0]
                hash_zh += chr(cjk_unicode)
                Log.debugdebug('From ' + str(idx_start) + ': ' + str(s) +
                               ', n=' + str(n) + ', char=' +
                               str(chr(cjk_unicode)))

        return hash_zh
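A worked example of the 2-character grouping above, assuming BLOCK_CHINESE spans the CJK Unified Ideographs range 0x4E00..0x9FFF (the actual constant may differ):

uni_start, uni_end = 0x4E00, 0x9FFF
uni_len = uni_end - uni_start + 1            # 20992 code points
val = ord('a') * 256 + ord('b')              # the pair 'ab' -> 0x6162 = 24930
print(chr((val % uni_len) + uni_start))      # a single CJK character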
Example #21
    def transform_input_for_model(
            self,
            # For the model to interpret and transform in to x usable for model input
            # (e.g. map using one-hot dictionaries)
            x_input,
            word_freq_model = None,
    ):
        try:
            Log.debugdebug('***** x input: ' + str(x_input))
            # We expect x_input to be an np array of words
            if type(x_input) is np.ndarray:
                x_input = x_input.tolist()
            if type(x_input) not in (list, tuple):
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string)
                    + '". Expect list/tuple type, got type "' + str(type(x_input))
                    + '" for x input: ' + str(x_input)
                )
            if self.x_one_hot_dict_inverse is None:
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string) + '" x one hot not yet initialized!'
                )
            x = []

            for i in range(len(x_input)):
                word = x_input[i]
                if word in self.x_one_hot_dict_inverse.keys():
                    x.append(self.x_one_hot_dict_inverse[word])
                else:
                    Log.warning(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Model "' + str(self.identifier_string) + '", could not map input value "' + str(word)
                        + '" to code x. Not in x one hot dictionary.'
                    )

            # TODO Pad with 0's to satisfy the neural network input length
            input_shape = self.network.layers[0].input_shape
            input_len = input_shape[1]
            Log.debugdebug('***** INPUT SHAPE ' + str(input_shape) + ', len ' + str(input_len) + ', x = ' + str(x))
            while len(x) < input_len:
                x = [0] + x
            Log.debugdebug('  ***** padded x: ' + str(x))

            x = np.array(x)
            x_transformed = NumpyUtil.convert_dimension(arr=x, to_dim=2)
            Log.debugdebug('  ***** transformed x: ' + str(x_transformed))

            return x_transformed
        except Exception as ex:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string) + '", exception transforming ' + str(x_input)
                + '. Exception: ' + str(ex)
            )
Example #22
    def test_textcluster_chinese(self):
        res = ut.ResultObj(count_ok=0, count_fail=0)
        lang = lf.LangFeatures.LANG_ZH

        self.txt_preprocessor.stopwords_list = []
        text = [
            # Article 1
            '人工智能 : 英 、 中 、 美 上演 “ 三国演义 ”',
            '英国 首相 特里莎·梅 周四 (1月 25日) 在 瑞士 达沃斯 世界 经济 论坛 上 宣布 , 英国 在 人工智能 ( AI ) 领域 要 争 当 世界 领头羊。',
            '一周 后 , 她 将 率 英国 经贸 代表团 访 华 , 到 北京 和 上海 展开 " 历史性 访问 "。 一周 前 , 中国 发表 《 人工智能 标准化 白皮书 》。',
            '中国 媒体 把 2017 年 称为 " AI 年 ", 2018 则 是 AI 从 学术 飞入 产业 、 普及 应用 的 关键 年 。',
            '围绕 AI , 中美 正 胶着 于 争霸 竞赛 ,而 中英 在 科技 、工商 和 金融界 的 互动 将 产生 怎样 的 结果 ,引 人 关注'
            '。',
            # Article 2
            '叙利亚 俄军 遇袭 恐怖分子 用 无人机 “ 群攻 ”',
            '俄军 在 叙利亚 军事基地 遭到 攻击 后 , 俄罗斯 国防部 警告 说 , 恐怖分子 已 获得 先进 无人机 技术 , 能够 在 全世界 发动 攻击 。',
            '俄罗斯 总参谋部 无人机 部门 负责人 亚历山大 · 维科夫 少将 说 , 恐怖分子 使用 无人机 发动 攻击 的 威胁 已经 不再 是 不可能 的 事情,',
            '恐怖分子 已经 利用 无人机 攻击 俄军 在 叙利亚 的 克 美 明 空军基地 和 塔尔图斯 的 一个 港口',
            '他 还 说 , 在 1月 6日 发动 攻击 的 技术 评估 显示 ," 在 世界 所有 其他 地方 使用 无人机 发动 恐怖 攻击 已经 成为 现实 威胁"'
            # Article 3
        ]

        text_sentences_arr = self.txt_preprocessor.preprocess_list_all_langs(
            sentences_list=text)
        Log.debugdebug('PRE-PROCESSED ' + str(lang) + ' SENTENCES:\n\r' +
                       str(text_sentences_arr))

        # This example is too small in sample size to weigh by IDF (which will instead lower the accuracy)
        # do_clustering(text=text, stopwords=stopwords, ncenters=2, freq_measure='tf', weigh_idf=False, verbose=0)
        res_cluster = self.do_clustering(
            text=text_sentences_arr,
            ncenters=2,
            expected_clusters=((0, 1, 2, 3, 4), (5, 6, 7, 8, 9)),
            test_threshold_inside=0.7,
            test_threshold_outside=0.3,
            test_description='1. ' + str(lang) + ' normalized, no IDF',
            freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
        )
        res.update(other_res_obj=res_cluster)
        # do_clustering(text=text, stopwords=stopwords, ncenters=2, freq_measure='frequency', weigh_idf=False, verbose=0)

        return res
Example #23
    def xor_bytes(self, b1, b2):
        t12 = zip(b1, b2)

        res_xor = []
        for x in t12:
            byte_xor = x[0] ^ x[1]
            Log.debugdebug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': XOR "' +
                str(hex(x[0])) + '" and "' + str(hex(x[1])) + '" = ' +
                str(hex(byte_xor)))
            res_xor.append(byte_xor)

        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': XOR between "' +
            str(self.hexdigest(b1)) + '" and "' + str(self.hexdigest(b2)) +
            '" = "' + str(self.hexdigest(res_xor)) + '"')

        return res_xor
Example #24
    def to_data_frame(
        self,
        # Pre-processed sentences are hard to read, so we use original sentences
        sentences_list_no_preprocessing,
        # List of dictionary (if word-weights) representing a topic
        topic_words,
        # numpy ndarray
        doc_labels,
    ):
        # Convenient data frame for the topics & original documents
        df_classified = pd.DataFrame()
        for doc_idx in range(np.max(doc_labels) + 1):
            Log.debugdebug(str(self.__class__) + ': Cluster #' + str(doc_idx))
            Log.debugdebug(
                str(self.__class__) + ': Word-Value Center: ' +
                str(topic_words[doc_idx]))
            Log.debugdebug(
                str(self.__class__) + ': Words Center: ' +
                str(topic_words[doc_idx].keys()))
            cluster_words = str(topic_words[doc_idx].keys())
            topic_sentences = []
            for j in range(len(sentences_list_no_preprocessing)):
                if doc_labels[j] == doc_idx:
                    # print('\t\t' + str(sentences_list_no_preprocessing[j]))
                    topic_sentences.append(sentences_list_no_preprocessing[j])
            df_topic = pd.DataFrame({
                'ClusterNo': doc_idx,
                'ClusterTopWords': cluster_words,
                'Sentence': topic_sentences,
            })
            # DataFrame.append was removed in recent pandas versions; concat is the equivalent
            df_classified = pd.concat([df_classified, df_topic])

        return df_classified
Example #25
    def get_freq_feature_vector(
            self,
            # A word array. e.g. ['this','is','a','sentence','or','just','any','word','array','.']
            text_list,
            feature_as_presence_only = False,
            # Log base has no effect on LogFreqNormalized & LogFreqProbability as it is just a constant factor
            log_base = DEFAULT_LOG_BASE,
    ):
        counter = col.Counter(text_list)
        # Order the counter
        counter = counter.most_common()

        symbols = [x[0] for x in counter]
        freqs = np.array( [x[1] for x in counter] )
        # lg.Log.debugdebug(
        #     str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        #     + ': Symbols ' + str(symbols)
        #     + ', Frequencies ' + str(freqs)
        #     + ', Presence ' + str(presence)
        # )

        # If <feature_as_presence_only> flag set, we don't count frequency, but presence
        if feature_as_presence_only:
            presence = (freqs >= 1) * 1
            freqs = presence
        df_counter = pd.DataFrame({
            self.COL_SYMBOL: symbols,
            self.COL_FREQUENCY: freqs
        })
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converted text "' + str(text_list) + '" to ' +  str(df_counter.values)
        )

        df_merge = self.get_freq_feature_vector_df(
            df_text_counter = df_counter,
            log_base = log_base,
        )
        return df_merge
Example #26
    def calculate(self):
        losses = []
        # The losses of each class have already been conveniently broken up by the categorical format
        for real_prob, given_probs in zip(self.p_real_prob_labels,
                                          self.q_given_probs):
            # Just to be sure in case numbers don't sum up to 1 for probabilities
            given_probs_normalized = given_probs / given_probs.sum(
                axis=-1, keepdims=True)
            assert abs(np.sum(given_probs_normalized) -
                       1.0) < CategoricalCrossEntropy.SMALL_NUMBER
            Log.debugdebug('Given Probs: ' + str(given_probs_normalized))

            # Calculate the number of bits required to represent this information
            info_bits = -np.log(
                np.maximum(CategoricalCrossEntropy.SMALL_NUMBER,
                           given_probs_normalized))
            Log.debugdebug('Information Bits: ' + str(info_bits))

            # If the label is categorical, the loss is only the loss of the single non-zero category usually
            loss = np.sum(real_prob * info_bits, axis=-1, keepdims=False)
            losses.append(loss)
        Log.debugdebug('Losses: ' + str(losses))
        # We can actually ignore the constant N term if we wish
        return np.sum(losses) * (1 / self.N)
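A minimal numeric check of the same categorical cross-entropy formula, loss = (1/N) * sum_n sum_c -p_real(c) * log(q(c)) (values below are illustrative):

import numpy as np

p_real = np.array([[1.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0]])     # one-hot labels
q_given = np.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1]])    # predicted probabilities
loss = np.mean(np.sum(-p_real * np.log(q_given), axis=-1))
print(loss)   # average of -log(0.7) and -log(0.8) ≈ 0.29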
Example #27
    def search_close_words(
            self,
            word,
            # Cost can be any measure of edit distance, e.g. Levenshtein, Damerau-Levenshtein, etc.
            max_cost=2,
            edit_distance_algo=EditDistance.EDIT_DIST_ALGO_DAMLEV):
        # Returns tuples of (word, edit-distance)
        # E.g. from word bg to [('be',1), ('big',1), ('bag',1), ('brag',2)]
        results = TrieNode.search_close_words(
            trie=self.trie,
            word=word,
            max_cost=max_cost,
            edit_distance_algo=edit_distance_algo)
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': For word "' +
            str(word) + '", found trie node matches ' + str(results))
        if (results is None) or (len(results) == 0):
            return None

        #
        # Any word weighting scheme can be used
        #
        corrected_words = []
        edit_distances = []
        eidf_values = []
        for obj in results:
            # The corrected word returned in tuple
            cor_word = obj[0]
            # The edit distance returned in tuple
            edit_dist = obj[1]
            if self.use_word_weighting:
                eidf_val = self.eidf_value[self.eidf_words == cor_word]
                if len(eidf_val) != 1:
                    Log.debugdebug(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': No EIDF value found for corrected word "' +
                        str(cor_word) + '"')
                    continue
                else:
                    eidf_values.append(round(eidf_val[0], 2))
            else:
                eidf_values.append(None)
            corrected_words.append(cor_word)
            edit_distances.append(edit_dist)

        df = pd.DataFrame({
            SpellCheckWord.COL_CORRECTED_WORD: corrected_words,
            SpellCheckWord.COL_EDIT_DISTANCE: edit_distances,
            SpellCheckWord.COL_EIDF_VALUE: eidf_values
        })

        df = df.sort_values(by=[
            SpellCheckWord.COL_EDIT_DISTANCE, SpellCheckWord.COL_EIDF_VALUE
        ],
                            ascending=True)
        df = df.reset_index(drop=True)
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Corrected words and eidf values: ' + str(df))
        return df
Example #28
    def process_text_training_data(self, ):
        # The algorithm to segment words works as follows:
        #   If segmented text returned from DB is None or shorter than text, we will process the text.
        #   However if the flag self.reprocess_all_text == True, we segment no matter what.

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': START SEGMENT & STEM DB TRAINING DATA, FORCE RESEGMENT ALL = ' +
            str(self.reprocess_all_text))

        td_total_rows = self.df_training_data.shape[0]
        count = 0

        for idx_row in self.df_training_data.index:
            count = count + 1
            text_from_db = str(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx_row])
            text_processed_from_db = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx_row]
            intent_td_id = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TRAINING_DATA_ID].loc[idx_row]
            intent_id = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx_row]
            intent_name = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx_row]
            # Internal Counter
            internal_counter = self.df_training_data[
                TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx_row]

            Log.debugdebug('Processing index row "' + str(idx_row) + '" ' +
                           str(self.df_training_data.loc[idx_row]) + '"')

            if type(text_from_db) is not str:
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Text from DB "' + str(text_from_db) +
                    '" not string type.')
                text_from_db = str(text_from_db)
            # When a text is updated in DB/storage, this field should be cleared in DB to NULL
            if text_processed_from_db is None:
                text_processed_from_db = ''

            possible_langs = self.lang_detect.detect(text=text_from_db)
            # Empty list
            if not possible_langs:
                lang_detected = self.language_main
            else:
                lang_detected = possible_langs[0]

            # If detected language not supported
            if lang_detected not in [self.language_main
                                     ] + self.languages_additional:
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': For "' +
                    str(self.model_identifier) + '", detected lang "' +
                    str(lang_detected) + '" not in languages supported')
                lang_detected = self.language_main
            # Update data frame with language detected
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_LANG].at[idx_row] = \
                lang_detected

            #if lang_detected != self.language_main:
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Lang "' +
                str(lang_detected) + '" main lang "' +
                str(self.language_main) + '" for text "' + str(text_from_db) +
                '".')

            #
            # Sanity check only. Should not happen since after every training data update,
            # NULL would be written back to the TextSegmented column.
            # Because we don't want to reprocess all text (which takes time), we guess first
            #
            is_likely_processed_text_changed = len(
                text_processed_from_db) < len(text_from_db)
            # If a language has verb conjugation, we cannot just compare length as the original text could be longer
            if self.lang_have_verb_conj[lang_detected]:
                # So we just hardcode
                is_likely_processed_text_changed = len(
                    text_processed_from_db) <= 8

            if is_likely_processed_text_changed:
                if (intent_td_id is not None) and (intent_td_id > 0):
                    # Warn only if it is not our own inserted data
                    Log.warning(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) + ': Text "' +
                        str(text_from_db) +
                        '" likely has incorrect segmentation "' +
                        str(text_processed_from_db) + '".')

            #
            # We only reprocess the text if there is some likelihood of change
            #
            if self.reprocess_all_text or is_likely_processed_text_changed:
                processed_text_str = self.txt_preprocessor[
                    lang_detected].process_text(inputtext=text_from_db,
                                                return_as_string=True)
                Log.debug(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Text "' +
                    str(text_from_db) + '" processed text "' +
                    str(processed_text_str) + '".')

                is_text_processed_changed = not (text_processed_from_db
                                                 == processed_text_str)
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': No ' +
                    str(count) + ' of ' + str(td_total_rows) +
                    ': Tr Data ID "' + str(intent_td_id) +
                    '". Force segment = ' + str(self.reprocess_all_text) +
                    '\n\r   Text "' + str(text_from_db) + '". Processed to "' +
                    str(processed_text_str) + '"' + ', changed = ' +
                    str(is_text_processed_changed))

                # Training ID 0 are those we inserted ourselves so no need to update anything
                if is_text_processed_changed:
                    # Update the column
                    self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].at[idx_row] = \
                        processed_text_str

                    # For intent name we inserted, no need to warn
                    if (intent_td_id is not None) and (intent_td_id > 0):
                        Log.warning(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Processed text different. Text "' +
                            str(text_from_db) + '\n\r   new processed text "' +
                            str(processed_text_str) + '"' +
                            '\n\r   old processed text "' +
                            str(text_processed_from_db) + '"')

                        row_changed = self.__get_row_to_append_to_training_data(
                            intent_id=intent_id,
                            intent_name=intent_name,
                            text=text_from_db,
                            text_id=intent_td_id,
                            processed_text=processed_text_str,
                            lang_detected=lang_detected,
                            internal_counter=internal_counter)
                        self.list_of_rows_with_changed_processed_text.append(
                            row_changed)
                        Log.important(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Appended changed row: ' + str(row_changed))
                    else:
                        Log.important(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Processed text ' + str(count) + ' ok "' +
                            str(processed_text_str) + '" from "' +
                            str(text_from_db) + '"')
            else:
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Training data ID ' + str(intent_td_id) + ': No ' +
                    str(count) + ' of ' + str(td_total_rows) +
                    ': Nothing to do, OK segmented/processed from DB "' +
                    str(text_processed_from_db) + '"')
        return
Example #29
    def preprocess_training_data_text(self):
        # Just add intent names into the training data, no text processing
        self.add_intent_name_to_training_data()
        self.process_text_training_data()
        self.add_latin_form_to_training_data()

        try:
            from nwae.ml.text.TxtTransform import TxtTransform
            # Conversion to padded docs
            res = TxtTransform(docs=list(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED]),
                               labels=list(self.df_training_data[
                                   DaehuaTrainDataModel.COL_TDATA_INTENT_ID]),
                               langs=list(self.df_training_data[
                                   DaehuaTrainDataModel.COL_TDATA_TEXT_LANG])
                               ).create_padded_docs()
            Log.debug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Padded Docs: ' +
                str(res.padded_encoded_docs) + ', Labels: ' +
                str(res.encoded_labels))
            Log.debug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Labels Categorical: ' + str(res.encoded_labels_categorical))

            self.embedding_params = EmbeddingParams(
                x=res.padded_encoded_docs,
                x_original=res.original_docs,
                y=np.array(res.encoded_labels),
                y_original=res.y_original,
                x_one_hot_dict=res.x_one_hot_dict,
                y_one_hot_dict=res.y_one_hot_dict,
                max_sent_len=res.max_x_length,
                max_label_val=max(res.encoded_labels),
                vocab_size=res.vocabulary_dimension)
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Converted ' +
                str(len(self.embedding_params.x)) +
                ' rows padded docs. Max sentence length = ' +
                str(self.embedding_params.max_sent_len) +
                ', max label value = ' +
                str(self.embedding_params.max_label_val) +
                ', vocabulary size = ' +
                str(self.embedding_params.vocab_size) + ', x one hot dict: ' +
                str(self.embedding_params.x_one_hot_dict))
            Log.debugdebug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Original docs:\n\r' +
                str(self.embedding_params.x_original) +
                '\n\rEncoded padded docs\n\r:' + str(self.embedding_params.x) +
                '\n\rOriginal labels\n\r' +
                str(self.embedding_params.y_original) +
                '\n\rEncoded labels\n\r' + str(self.embedding_params.y))
        except Exception as ex_embed:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Error converting training text to embedding params: ' + str(ex_embed)
            Log.warning(errmsg)
            # Don't raise error
            # raise Exception(errmsg)

        return (self.df_training_data, self.embedding_params)
Example #30
    def unit_test_predict_classes(
        self,
        word_freq_model,
        include_match_details=False,
        top=5,
    ):
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Test predict classes using model "' + str(self.model_name) +
            '".')

        # Unit test using direct text (PredictClass.py) is in PredictClass.py itself
        model_obj = ModelHelper.get_model(
            model_name=self.model_name,
            model_params=None,
            identifier_string=self.identifier_string,
            dir_path_model=self.ut_params.dirpath_model,
            training_data=None)
        model_obj.start()
        model_obj.wait_for_model()
        #model_obj.load_model_parameters()

        test_x = UnitTestMetricSpaceModel.DATA_TEST_X
        test_x_name = UnitTestMetricSpaceModel.DATA_TEST_X_NAME
        model_x_name = model_obj.get_model_features()
        if model_x_name is None:
            model_x_name = UnitTestMetricSpaceModel.DATA_X_NAME

        word_freq_model_mapped = WordFreqDocMatrix.map_to_feature_vect_word_freq_measure(
            freq_measure=word_freq_model)
        if word_freq_model_mapped in [
                WordFreqDocMatrix.BY_SIGMOID_FREQ,
                WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM
        ]:
            test_x = 2 * ((1 / (1 + np.exp(-test_x))) - 0.5)
        elif word_freq_model_mapped in [
                WordFreqDocMatrix.BY_LOG_FREQ,
                WordFreqDocMatrix.BY_LOG_FREQ_NORM
        ]:
            test_x = np.log(1 + test_x)
        else:
            pass
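        # Illustrative note on the two transforms above (values are examples only):
        #   sigmoid freq: f -> 2 * (1 / (1 + exp(-f)) - 0.5), so f = 0 -> 0, f = 1 -> ~0.462
        #   log freq    : f -> log(1 + f),                    so f = 0 -> 0, f = 1 -> ~0.693
        # Both squash raw word counts so that very frequent words do not dominate the vector.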
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Mapped word freq model to "' + str(word_freq_model_mapped) +
            '", test x = ' + str(test_x))

        if model_x_name.ndim == 2:
            model_x_name = model_x_name[0]
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Model x_name: ' +
            str(model_x_name))

        # Reorder by model x_name
        df_x_name = pd.DataFrame(data={
            'word': model_x_name,
            'target_order': range(0, len(model_x_name), 1)
        })
        df_test_x_name = pd.DataFrame(
            data={
                'word': test_x_name,
                'original_order': range(0, len(test_x_name), 1)
            })
        # Log.debug('**** Target Order: ' + str(model_x_name))
        # Log.debug('**** Original order: ' + str(test_x_name))
        # Left join to ensure the order follows target order and target symbols
        df_x_name = df_x_name.merge(df_test_x_name, how='left')
        # Log.debug('**** Merged Order: ' + str(df_x_name))
        # Then order by original order
        df_x_name = df_x_name.sort_values(by=['target_order'], ascending=True)
        # The reorder indices we need are then in the original_order column
        reorder = np.array(df_x_name['original_order'])
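        # Worked example of the reorder above (illustrative values only):
        # if model_x_name = ['b', 'a'] and test_x_name = ['a', 'b'], the left join
        # maps 'b' -> original_order 1 and 'a' -> original_order 0, giving
        # reorder = [1, 0], so test_x column 1 is moved to position 0 below.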
        self.res_final.update_bool(res_bool=UnitTest.assert_true(
            observed=reorder.tolist(),
            expected=self.REORDER_FEATURE_NAMES_WITH_UNK.tolist(),
            test_comment='Test reorder of feature names ' + str(reorder)))

        test_x_transpose = test_x.transpose()
        Log.debugdebug(test_x_transpose)

        reordered_test_x = np.zeros(shape=test_x_transpose.shape)
        Log.debugdebug(reordered_test_x)

        for i in range(0, reordered_test_x.shape[0], 1):
            reordered_test_x[i] = test_x_transpose[reorder[i]]

        reordered_test_x = reordered_test_x.transpose()
        Log.debugdebug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Reordered test x = ' + str(reordered_test_x))

        x_classes_expected = self.y
        # Just the top predicted ones
        all_y_observed_top = []
        all_y_observed = []
        mse = 0
        count_all = reordered_test_x.shape[0]

        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Predict classes for x:\n\r' + str(reordered_test_x))
        prf_start = prf.Profiling.start()

        for i in range(reordered_test_x.shape[0]):
            v = npUtil.NumpyUtil.convert_dimension(arr=reordered_test_x[i],
                                                   to_dim=2)
            Log.debugdebug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Testing x: ' +
                str(v))
            if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
                predict_result = model_obj.predict_class(
                    x=v, include_match_details=include_match_details, top=top)
            else:
                predict_result = model_obj.predict_class(x=v)
            y_observed = predict_result.predicted_classes
            all_y_observed_top.append(y_observed[0])
            all_y_observed.append(y_observed)
            top_class_distance = predict_result.top_class_distance
            match_details = predict_result.match_details

            Log.debugdebug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Point v ' +
                str(v) + ', predicted ' + str(y_observed) +
                ', Top Class Distance: ' + str(top_class_distance) +
                ', Match Details:\n\r' + str(match_details))

            if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
                metric = top_class_distance
                mse += metric**2

        prf_dur = prf.Profiling.get_time_dif(prf_start, prf.Profiling.stop())
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': PROFILING ' + str(count_all) + ' calculations: ' +
            str(round(1000 * prf_dur, 0)) + ' milliseconds, or ' +
            str(round(1000 * prf_dur / count_all, 2)) +
            ' milliseconds per calculation')

        # Compare with expected
        compare_top_x = {}

        for t in range(1, top + 1, 1):
            # True or '1' means an error, i.e. the expected class is not in the top t
            compare_top_x[t] = np.array([True] * len(all_y_observed))
            for i in range(len(all_y_observed)):
                matches_i = all_y_observed[i]
                if x_classes_expected[i] in matches_i[0:t]:
                    # False or '0' means no error
                    compare_top_x[t][i] = False
                    self.res_final.count_ok += 1 * (t == 1)
                else:
                    self.res_final.count_fail += 1 * (t == 1)
            Log.info(compare_top_x[t])
            Log.info('Total Errors (compare top #' + str(t) + ') = ' +
                     str(np.sum(compare_top_x[t] * 1)))
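        # compare_top_x[t] marks a test row True when the expected class is absent
        # from the top t predictions; only the t == 1 comparison updates the
        # ok/fail counters in self.res_final.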

        # Note: mse above was accumulated as a plain sum of squared top class
        # distances; average it over the sample count before logging
        if count_all > 0:
            mse = mse / count_all
        Log.info('mse = ' + str(mse))

        if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
            predict_result = model_obj.predict_classes(
                x=reordered_test_x,
                include_match_details=include_match_details,
                top=top)
            Log.info('Predicted Classes:\n\r' +
                     str(predict_result.predicted_classes))
            Log.info('Top class distance:\n\r' +
                     str(predict_result.top_class_distance))
            Log.info('Match Details:\n\r' + str(predict_result.match_details))
            Log.info('MSE = ' + str(predict_result.mse))

        model_obj.join()

        #
        # Test using PredictClass
        #
        from nwae.lang.LangFeatures import LangFeatures
        from nwae.ml.PredictClass import PredictClass
        predict = PredictClass(
            model_name=ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE,
            identifier_string=UnitTestMetricSpaceModel.IDENTIFIER_STRING,
            dir_path_model=self.ut_params.dirpath_model,
            lang=LangFeatures.LANG_KO,
            dir_wordlist=self.ut_params.dirpath_wordlist,
            postfix_wordlist=self.ut_params.postfix_wordlist,
            dir_wordlist_app=self.ut_params.dirpath_app_wordlist,
            postfix_wordlist_app=self.ut_params.postfix_app_wordlist,
            dirpath_synonymlist=self.ut_params.dirpath_synonymlist,
            postfix_synonymlist=self.ut_params.postfix_synonymlist,
            word_freq_model=word_freq_model_mapped,
            do_spelling_correction=False,
            do_profiling=True)
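        # The loop below runs each test text through PredictClass twice: once with
        # the word freq model under test (asserting only on the top-1 prediction),
        # then again after switching to the sigmoid frequency measure, where the
        # match details are only logged for comparison.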

        for i in range(len(self.DATA_TEXTS)):
            label = self.DATA_Y[i]
            text_arr = self.DATA_TEXTS[i]
            text = ' '.join(text_arr)
            # Return all results in the top 5
            res = predict.predict_class_text_features(
                inputtext=text,
                match_pct_within_top_score=0,
                include_match_details=True,
                top=5,
            )
            self.res_final.update_bool(res_bool=UnitTest.assert_true(
                observed=res.predict_result.predicted_classes[0],
                expected=label,
                test_comment='Test "' + str(text) + '" label ' + str(label)))
            Log.debug(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) + ': ' + str(i) +
                '. Match Details word freq model "' +
                str(predict.word_freq_model) + '" ' +
                str(res.predict_result.match_details))
            predict.word_freq_model = WordFreqDocMatrix.map_to_feature_vect_word_freq_measure(
                freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ)
            res = predict.predict_class_text_features(
                inputtext=text,
                match_pct_within_top_score=0,
                include_match_details=True,
                top=5,
            )
            Log.debug(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) + ': ' + str(i) +
                '. Match Details word freq model "' +
                str(predict.word_freq_model) + '" ' +
                str(res.predict_result.match_details))

        # Kill any background jobs
        predict.stop_model_thread()

        return