Example #1
    def xor_string(self, s1, s2):
        Log.debug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': XOR between "' + str(s1) + '" and "' + str(s2) + '".')

        len_s1 = len(s1)
        len_s2 = len(s2)
        len_max = max(len(s1), len(s2))

        # Append to the shorter one, in a repeat manner
        for i in range(len(s1), len_max, 1):
            s1 += s1[(i - len_s1)]
        for i in range(len(s2), len_max, 1):
            s2 += s2[(i - len_s2)]

        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': After appending, XOR between "' + str(s1) + '" and "' +
            str(s2) + '".')

        Log.debugdebug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': s1 "' + str(s1) +
            '", s2 "' + str(s2) + '"')

        b1 = bytes(s1, encoding=Obfuscate.STRING_ENCODING)
        b2 = bytes(s2, encoding=Obfuscate.STRING_ENCODING)

        bytes_xor = self.xor_bytes(b1=b1, b2=b2)

        return bytes_xor
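A dependency-free sketch of the same repeating-key XOR idea (the helper name xor_strings and the UTF-8 encoding are assumptions; Log, Obfuscate and xor_bytes from the snippet are not needed here):

    def xor_strings(s1, s2, encoding='utf-8'):
        # Assumes non-empty inputs; encode both operands to bytes first
        b1 = s1.encode(encoding)
        b2 = s2.encode(encoding)
        # Cycle the shorter operand so both have equal length, as above
        length = max(len(b1), len(b2))
        b1 = (b1 * (length // len(b1) + 1))[:length]
        b2 = (b2 * (length // len(b2) + 1))[:length]
        # Byte-wise XOR
        return bytes(a ^ b for a, b in zip(b1, b2))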
Example #2
    def scrape_url(
        self,
        url,
        parser='html.parser',
        tag_to_find='p',
    ):
        try:
            sents = []
            resp = requests.get(url=url)
            soup = BeautifulSoup(resp.content, parser)
            contents_tag = soup.find_all(tag_to_find)
            for cont in contents_tag:
                txt = StringUtils.trim(cont.get_text())
                sent_list = txt.split('。')
                sent_list = [StringUtils.trim(s) for s in sent_list if s]
                if len(sent_list):
                    sents += sent_list
                Log.debug('Split "' + str(txt) + '" into:' + str(sent_list))
                # [Log.debug('\t"' + str(s) + '"') for s in sent_list]

            return sents
        except Exception as ex:
            Log.error(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Error scraping url "' + str(url) + '", exception: ' +
                str(ex))
Example #3
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        lang = LangFeatures.LANG_TH
        test_sent = [
            # Case words segmented correctly to ['มี', 'เงน', 'ที่', 'ไหน'] and 'เงน' corrected to 'เงิน'
            ['มีเงนที่ไหน', ['มี', 'เงิน', 'ที่', 'ไหน']],
            # ['การแพร่ระบาดของเชื้อไวรัสโควิด-19',
            #  ['การ', 'แพร่', 'ระบาด', 'ของ', 'เชื้อ', 'ไวรัส', 'โค', 'วิด', '-19']],
            # ['ในทั่วโลกยังเพิ่มขึ้นไม่หยุด',
            #  ['ใน', 'ทั่ว', 'โลก', 'ยัง', 'เพิ่ม', 'ขึ้น', 'ไม่', 'หยุด']]
        ]

        for obj in test_sent:
            s = obj[0]
            arr_expected = obj[1]

            seg = self.word_segmenter[lang].segment_words(
                text=s, return_array_of_split_words=True)
            Log.debug('"' + s + '" segmented to ' + str(seg))

            arr_cor = self.spell_corr[lang].check(text_segmented_arr=seg)
            Log.debug('Corrections array: ' + str(arr_cor))
            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=arr_cor,
                                                 expected=arr_expected,
                                                 test_comment='Test "' +
                                                 str(s) + '" to ' +
                                                 str(arr_cor)))

        return res_final
Example #4
    def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_JAP,
    ):
        try:
            import nagisa
        except Exception as ex:
            raise Exception(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unable to load nagisa: ' + str(ex))
        if type(word_list) in [list, tuple]:
            text = ' '.join(word_list)
        else:
            text = word_list
        words_postags_obj = nagisa.tagging(text)
        txt_sym_tok = words_postags_obj.words
        txt_sym_postags = words_postags_obj.postags
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Japanese segmentation ' + str(txt_sym_tok) +
            ', word & POS tags: ' + str(txt_sym_postags))

        words_postags = list(zip(txt_sym_tok, txt_sym_postags))
        sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': POS TAGs: ' + str(words_postags))
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Filtered sentence: ' + str(sent_filtered))
        return sent_filtered
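A minimal standalone sketch of the same nagisa calls (the tag list stands in for DEFAULT_KEEP_TAGS_JAP, which is not shown in this snippet, so treat it as an assumption):

    import nagisa

    # Keep only nouns, verbs and adjectives (assumed filter tags)
    keep_tags = ['名詞', '動詞', '形容詞']
    tagged = nagisa.tagging('今日はいい天気です')
    filtered = [w for w, t in zip(tagged.words, tagged.postags) if t in keep_tags]
    print(filtered)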
Example #5
    def __convert_preprocessed_text_to_training_data_model_for_nn_dense(
            tr_data_preprocessor
    ):
        x = tr_data_preprocessor.embedding_x
        y = tr_data_preprocessor.embedding_y
        x_one_hot_dict = tr_data_preprocessor.embedding_x_one_hot_dict
        n_rows = len(x)
        max_sentence_len = tr_data_preprocessor.embedding_max_sentence_len
        max_label_value = tr_data_preprocessor.embedding_max_label_val
        vocabulary_size = tr_data_preprocessor.embedding_vocab_size

        Log.debug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Padded Docs: ' + str(x)
        )

        # print('Padded docs: ' + str(res.padded_docs))
        # print('List labels: ' + str(res.list_labels))

        return tdm.TrainingDataModel(
            x = x,
            y = y,
            x_one_hot_dict = x_one_hot_dict,
            is_map_points_to_hypersphere = False
        )
Example #6
 def convert_format(self, filepath, to_format='wav'):
     file_extension = self.get_audio_filepath_extension(filepath=filepath)
     filepath_converted = re.sub(pattern='[.][a-zA-Z0-9]+$',
                                 repl='.' + to_format,
                                 string=filepath)
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Convert "' +
         str(filepath) + '" with extension "' + str(file_extension) +
         '" New filepath "' + str(filepath_converted) + '"')
     try:
         track = AudioSegment.from_file(file=filepath,
                                        format=file_extension)
         Log.debug(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Converting "' +
             str(filepath) + '" to "' + str(filepath_converted) + '"..')
         file_handle = track.export(filepath_converted, format=to_format)
         file_handle.close()
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Successful Conversion from "' + str(filepath) + '" to "' +
             str(filepath_converted) + '"..')
         return filepath_converted
     except Exception as ex:
         raise Exception(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Exception converting "' + str(filepath) + '" to "' +
             str(filepath_converted) + '": ' + str(ex))
Example #7
    def run_unit_test(self):
        dt = LangDetect()
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        start_all_time = Profiling.start()

        for text, expected in LangDetectUnitTest.TEST_TEXT_LANG:
            start_time = Profiling.start()
            observed = dt.detect(text=text)
            ms = round(
                1000 * Profiling.get_time_dif_secs(start=start_time,
                                                   stop=Profiling.stop()), 2)
            Log.debug('Took ' + str(ms) + ' ms')

            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=observed,
                                                 expected=expected,
                                                 test_comment='test lang "' +
                                                 str(expected) + '", text "' +
                                                 str(text) + '"'))

        end_all_time = Profiling.stop()
        avg_per_text_ms = 1000 * Profiling.get_time_dif_secs(
            start=start_all_time, stop=end_all_time) / len(
                LangDetectUnitTest.TEST_TEXT_LANG)
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Average ' +
            str(round(avg_per_text_ms, 2)) + 'ms per text (total ' +
            str(len(LangDetectUnitTest.TEST_TEXT_LANG)) + ' sentences)')

        return res_final
Example #8
    def encode(
            self,
            # E.g. {'china': 1, 'russia': 2, ..}
            word_list,
            # E.g. [('china', 'dimsum'), ('russia', 'xleb'), ..]
            word_tuples_list,
    ):
        oh_enc = OneHotEncoder()
        self.words_onehot = oh_enc.encode(
            feature_list = word_list
        )
        self.word_index_dict = oh_enc.get_feature_index_dict()
        self.index_word_dict = {v:k for k,v in self.word_index_dict.items()}
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unique word dictionary, length ' + str(len(self.word_index_dict))
            + ': ' + str(self.word_index_dict)
        )

        X = []
        Y = []
        for t in word_tuples_list:
            root_word = t[0]
            root_word_index = self.word_index_dict[root_word]
            close_word = t[1]
            close_word_index = self.word_index_dict[close_word]
            X.append(self.words_onehot[root_word_index])
            Y.append(self.words_onehot[close_word_index])

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': X: ' + str(X) + '\nY: ' + str(Y)
        )

        return np.array(X), np.array(Y)
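Each (root, close) tuple becomes one training row: X holds the one-hot vector of the root word and Y the one-hot vector of its context word. A dependency-free sketch of that mapping (a plain identity matrix stands in for the OneHotEncoder used above):

    import numpy as np

    word_list = ['china', 'dimsum', 'russia', 'xleb']
    word_tuples_list = [('china', 'dimsum'), ('russia', 'xleb')]

    # Index each unique word and build its one-hot row
    word_index = {w: i for i, w in enumerate(sorted(set(word_list)))}
    one_hot = np.eye(len(word_index))

    X = np.array([one_hot[word_index[root]] for root, close in word_tuples_list])
    Y = np.array([one_hot[word_index[close]] for root, close in word_tuples_list])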
Example #9
    def __init__(
            self,
            # 16 or 32 byte key
            key,
            nonce=None,
            mode=AES_MODE_EAX,
            text_encoding='utf-8'):
        self.key = key
        Log.debug('Using key ' + str(self.key) + '. Size = ' +
                  str(len(self.key)) + '.')
        self.cipher_mode_str = mode
        if self.cipher_mode_str == AES_Encrypt.AES_MODE_EAX:
            self.cipher_mode = AES.MODE_EAX
        elif self.cipher_mode_str == AES_Encrypt.AES_MODE_CBC:
            self.cipher_mode = AES.MODE_CBC
        else:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unsupported AES mode "' + str(self.cipher_mode_str) + '"')
        if nonce is None:
            # Must be 16 bytes
            # nonce = key[0:16]
            nonce = AES_Encrypt.generate_random_bytes(
                size=AES_Encrypt.SIZE_NONCE, printable=True)

        self.nonce = nonce
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Using nonce "' +
            str(self.nonce) + '". Size = ' + str(len(self.nonce)))

        self.text_encoding = text_encoding
        return
Example #10
    def add_parent(self, parent):
        if parent.dead_node:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Parent "' + str(parent.name)
                + '" is dead node (cant have children), not adding parent for node "' + str(self.name) + '"'
            )
            return

        assert type(parent) is MultiTreeNode
        if parent.name in self.parent_names:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': For node "' + str(self.name) + '" parent "' + str(parent.name) + '" already exists'
            )
        else:
            # Don't add if already exists as parent, anywhere higher up the tree hierarchy
            if self.is_higher_level(node=parent, supposed_child_node=self):
                return
            # Update for both parent and child
            self.parents.append(parent)
            self.update()
            parent.children.append(self)
            parent.update()
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': For node "' + str(self.name) + '" successfully added parent "' + str(parent.name) + '"'
            )
Example #11
 def check_prediction_stats(
     self,
     X,
     Y,
     y_predicted,
 ):
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Checking prediction stats..')
     # print(y_predicted)
     # print(type(y_predicted))
     # print(y_predicted.shape)
     # print(np.sum(y_predicted, axis=1).tolist())
     # Compare some data
     count_correct = 0
     for i in range(X.shape[0]):
         data_i = X[i]
         label_i = Y[i]
         prob_distribution = y_predicted[i]
         top_x = NumpyUtil.get_top_indexes(data=prob_distribution,
                                           ascending=False,
                                           top_x=5)
         if top_x[0] == label_i:
             count_correct += 1
         Log.debug(
             str(i) + '. ' + str(data_i) + ': Label=' + str(label_i) +
             ', predicted=' + str(top_x))
     Log.important('Boosting Accuracy = ' +
                   str(100 * count_correct / X.shape[0]) + '%.')
     return
Example #12
    def __init__(self, format, n_channels, frame_rate, n_frames, sample_width,
                 data_bytes):
        self.format = format
        self.n_channels = n_channels
        self.frame_rate = frame_rate
        self.n_frames = n_frames
        self.sample_width = sample_width

        # total_bytes_per_frame = sample_width * n_channels
        self.bytes_per_frame = int(self.n_channels * self.sample_width)

        # Anything above 8 bits is signed; only 8-bit samples are unsigned
        if self.sample_width == 1:
            self.data_type = np.uint8
        elif self.sample_width == 2:
            self.data_type = np.int16
        else:
            raise Exception('Wrong sample width ' + str(self.sample_width) +
                            ' > 2')

        # total_bytes = total_bytes_per_frames * total_frames
        self.data_bytes_len = int(self.bytes_per_frame * self.n_frames)
        self.data_bytes = data_bytes

        #
        # Extract channel raw values
        #
        audio_as_np = np.frombuffer(buffer=self.data_bytes,
                                    dtype=self.data_type)
        self.np_data = audio_as_np.astype(np.float32)

        # Normalise float32 array so that values are between -1.0 and +1.0
        n_bits = 8 * self.sample_width - 1
        self.np_data_normalized = self.np_data / (2**n_bits)

        # Now add additional dimension for channel
        self.np_data_by_channel = np.zeros(shape=(self.n_channels,
                                                  self.n_frames),
                                           dtype=self.data_type,
                                           order='C')
        # Just an array 0,1,2,3,... to symbolize indexes
        n_sample = np.array(list(range(len(self.np_data_normalized))))
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Normalized data by channel shape: ' +
            str(self.np_data_by_channel.shape))
        if self.n_channels > 1:
            for chnl in range(self.n_channels):
                # Pick the correct indexes for this channel
                indexes = n_sample % self.n_channels == chnl
                channel_n_frames = np.sum(indexes * 1)
                assert channel_n_frames == self.n_frames,\
                    'Channel ' + str(chnl) + ' with ' + str(channel_n_frames) + ' frames not ' + str(self.n_frames)
                # Assign channel data
                self.np_data_by_channel[chnl] = self.np_data[indexes]
        else:
            self.np_data_by_channel[0] = self.np_data.copy()

        return
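For frame-interleaved PCM the per-channel split can also be written as a single reshape; a self-contained numpy sketch of that alternative (the sample bytes below are invented for illustration):

    import numpy as np

    # Interleaved 16-bit stereo samples: [L0, R0, L1, R1, L2, R2]
    data_bytes = np.array([0, 0, 100, -100, 200, -200], dtype=np.int16).tobytes()
    n_channels = 2

    interleaved = np.frombuffer(data_bytes, dtype=np.int16)
    # (n_frames, n_channels), then transpose to (n_channels, n_frames)
    by_channel = interleaved.reshape(-1, n_channels).T
    # Normalise signed 16-bit values to [-1.0, +1.0]
    normalized = by_channel.astype(np.float32) / 2 ** 15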
Example #13
 def calculate_metric(
     self,
     x,
     prd_attrs,
     # For larger matrices, computing the normalization is very slow
     force_normalization,
     metric,
 ):
     if force_normalization:
         # For larger matrices, this computation is very slow
         x_new = self.normalize_euclidean(x=x)
         prd_attrs_new = self.normalize_euclidean(x=prd_attrs)
         Log.debug(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': x normalized: ' +
             str(x_new) + '\n\rp normalized: ' + str(prd_attrs_new))
     else:
         x_new = x
         prd_attrs_new = prd_attrs
     """
     Суммирование по последней оси
     """
     # sum_axis = 1 + 1 * (ref_dna.shape[0] > 1)
     sum_axis = len(x_new.shape) - 1
     if metric == self.METRIC_COSINE:
         # Fast method just like NN layer
         distances = np.matmul(x_new, prd_attrs_new.transpose())
         # nan can occur for nan product with 0-vector
         condition_nan = np.isnan(distances)
         distances[condition_nan] = -1
         if sum_axis == 1:
             distances = np.reshape(distances,
                                    newshape=(prd_attrs_new.shape[0]))
             indxs_dist_sort = np.flip(np.argsort(distances), axis=0)
         else:
             distances = np.reshape(distances,
                                    newshape=(x_new.shape[0],
                                              prd_attrs_new.shape[0]))
             indxs_dist_sort = np.flip(np.argsort(distances), axis=1)
     elif metric == self.METRIC_EUCLIDEAN:
         # Slow, but more accurate for certain situations
         diff = x_new - prd_attrs_new
         distances = np.sqrt(np.sum((diff)**2, axis=sum_axis))
         # nan can occur for nan product with 0-vector
         condition_nan = np.isnan(distances)
         distances[condition_nan] = np.inf
         indxs_dist_sort = np.argsort(distances)
     else:
         raise Exception(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': No such metric "' + str(metric) + '" supported')
     Log.debug(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Distances: ' +
         str(distances) + ' indexes sorted: ' + str(indxs_dist_sort))
     # Return the filtered data frame
     return indxs_dist_sort
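The two metrics in isolation: on unit-normalized rows the matrix product gives cosine similarities (sorted descending), while Euclidean distances are sorted ascending. A small self-contained numpy illustration, not the class API:

    import numpy as np

    x = np.array([[1.0, 0.0], [0.6, 0.8]])                  # query vectors, unit length
    refs = np.array([[0.0, 1.0], [0.8, 0.6], [1.0, 0.0]])   # reference vectors, unit length

    # Cosine: larger dot product means closer, so sort descending
    cos_sim = np.matmul(x, refs.T)
    rank_cos = np.flip(np.argsort(cos_sim, axis=1), axis=1)

    # Euclidean: smaller distance means closer, so sort ascending
    dist = np.sqrt(np.sum((x[:, np.newaxis, :] - refs[np.newaxis, :, :]) ** 2, axis=2))
    rank_euc = np.argsort(dist, axis=1)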
Example #14
    def predict_class_features(
            self,
            # This is the point given in feature format, instead of standard array format
            x_transformed,
            top=MATCH_TOP,
            match_pct_within_top_score=CONSTANT_PERCENT_WITHIN_TOP_SCORE,
            include_match_details=False,
            # Any relevant ID for logging purpose only
            id=None):
        self.wait_for_model_to_be_ready()
        self.wait_for_all_initializations_to_be_done()

        self.count_predict_calls = self.count_predict_calls + 1

        starttime_predict_class = prf.Profiling.start()

        predict_result = self.model.predict_class(
            x=x_transformed,
            top=top,
            include_match_details=include_match_details)

        #
        # Choose which scores to keep, we only have scores if we included the match details
        #
        if include_match_details:
            df_match = predict_result.match_details
            if df_match is not None:
                top_score = float(
                    df_match[ModelInterface.TERM_SCORE].loc[df_match.index[0]])
                df_match_keep = df_match[
                    df_match[ModelInterface.TERM_SCORE] >= top_score *
                    match_pct_within_top_score]
                df_match_keep = df_match_keep.reset_index(drop=True)
                # Overwrite data frame
                predict_result.match_details = df_match_keep

        y_observed = predict_result.predicted_classes
        top_class_distance = predict_result.top_class_distance

        Log.debug(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno) +
            ': Input x: ' + str(x_transformed) + ', observed class: ' +
            str(y_observed) + ', top distance: ' + str(top_class_distance))

        if self.do_profiling:
            Log.debug(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) + ': ID="' + str(id) +
                '", x="' + str(x_transformed) + '"' +
                ' PROFILING predict class: ' + prf.Profiling.get_time_dif_str(
                    starttime_predict_class, prf.Profiling.stop()))

        return predict_result
Example #15
 def preprocess_text(
         self
 ):
     self.sentences_cleaned = [
         self.txt_pp.process_text(inputtext=s, return_as_string=False)
         for s in self.training_text_list
     ]
     Log.debug(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
         + ': Processed sentences: ' + str(self.sentences_cleaned)
     )
     return self.sentences_cleaned
Example #16
 def __init__(
     self,
     # A list of text sentences in list type, already in lowercase and cleaned of None or ''.
     # Preprocessing assumed to be done and no text processing will be done here.
     sentences_list,
 ):
     self.sentences_list = sentences_list
     Log.debug(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Sentences list (before filter):\n\r' + str(self.sentences_list))
     return
Example #17
    def __init__(
            self,
            form,
            text_list_confirm_words=DEFAULT_OK,
            text_confirm_question='Please confirm answer ' + str(DEFAULT_OK),
            text_ask_field_value_prefix='Please provide',
            text_newline_char='<br/>',
            text_space_char='&nbsp',
            text_html_font_start_tag='<font color="blue">',
            text_html_font_end_tag='</font>',
            # For deserializing old objects so the old state is maintained
            error_count_quit_threshold=2,
            form_state=None,
            fill_form_continuous_err_count=0,
            conv_current_field_index=None,
            conv_current_field_name=None):
        if type(form) is not daehua_form.Form:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Wrong form type "' + str(type(form)) +
                '". Expected type "' + str(daehua_form.Form))
        # Keep the original form, and extended params
        self.form = form
        self.text_list_confirm_words = [
            str(s) for s in text_list_confirm_words
        ]
        self.text_confirm_question = str(text_confirm_question)
        self.text_ask_field_value_prefix = str(text_ask_field_value_prefix)
        self.text_newline_char = str(text_newline_char)
        self.text_space_char = str(text_space_char)
        self.text_html_font_start_tag = str(text_html_font_start_tag)
        self.text_html_font_end_tag = str(text_html_font_end_tag)
        self.error_count_quit_threshold = error_count_quit_threshold

        self.text_form_title = self.form.get_title_text()

        self.mex_expressions = self.form.mex_form_model
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Mex Expressions: ' +
            str(self.mex_expressions) + '.')

        self.form_state = form_state
        self.fill_form_continuous_err_count = fill_form_continuous_err_count
        self.conv_current_field_index = conv_current_field_index
        self.conv_current_field_name = conv_current_field_name

        if self.form_state is None:
            self.reset()
        return
Example #18
    def word_tokenize(self, sentences_list):
        sentences_segmt = [s.split(' ') for s in sentences_list]

        # Remove basic punctuations stuck to word
        sentences_cleanpunc = [
            BasicPreprocessor.clean_punctuations(sentence=s)
            for s in sentences_segmt
        ]
        for i in range(len(sentences_cleanpunc)):
            Log.debug(
                #str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno),
                #+ ': Text "' + str(sentences_segmt[i])
                #+ '" clean punctuations to: ' + str(sentences_cleanpunc[i])
                sentences_cleanpunc[i])

        # Return the tokenized, punctuation-cleaned sentences
        return sentences_cleanpunc
Example #19
    def get_pct_intersection_with_common_words(
            self,
            word_list,
            # In the case of Vietnamese, we might have to form words from the syllables
            max_word_n_tuple = 1
    ):
        if max_word_n_tuple == 1:
            lang_intersection = set(word_list).intersection(self.get_common_words())
            pct_intersection = len(lang_intersection) / len(set(word_list))
        else:
            # Means we are looking not just at the current token, but form a word from
            # continuous tokens up to max_word_n_tuple (usually not more than 2)
            len_word_list = len(word_list)
            count_int = 0
            cur_index = 0
            actual_word_count = 0
            # Loop by each token in the word list (or rather token list)
            while cur_index < len_word_list:
                max_n_tuple_lookforward = min(max_word_n_tuple, len_word_list-cur_index)
                for j in range(max_n_tuple_lookforward,0,-1):
                    # Look from j tokens ahead
                    end_index = cur_index+j
                    # For the j-tuple word
                    w = ' '.join(word_list[cur_index:end_index])
                    Log.debugdebug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Test word "' + str(w) + '", cur_index=' + str(cur_index) + ', j=' + str(j))
                    if w in self.get_common_words():
                        count_int += 1
                        # Move forward to the end of the token from the word found
                        cur_index += j-1
                        Log.debugdebug(
                            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                            + ': Found word "' + str(w) + '"')
                        break
                cur_index += 1
                actual_word_count += 1

            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Count Intersection = ' + str(count_int) + ', actual word count = ' + str(actual_word_count)
            )
            pct_intersection = count_int / actual_word_count

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.lang) + '" intersection = ' + str(pct_intersection)
        )
        return pct_intersection
Example #20
    def train_from_partial_models(
            self,
            write_model_to_storage=True,
            write_training_data_to_storage=False,
            # Log training events
            logs=None):
        #
        # Load EIDF first
        # TODO How to ensure there are no missing words?
        #
        x_name = self.training_data.get_x_name()
        try:
            if type(logs) is list:
                self.logs_training = logs
            else:
                self.logs_training = []

            Log.info(str(self.__class__) + ' ' +
                     str(getframeinfo(currentframe()).lineno) +
                     ': Initializing IDF object.. try to read from file first',
                     log_list=self.logs_training)
            # Try to read from file
            df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                dir_path_model=self.dir_path_model,
                identifier_string=self.identifier_string,
                x_name=x_name,
                log_training=self.logs_training)
            Log.debug(str(self.__class__) + ' ' +
                      str(getframeinfo(currentframe()).lineno) +
                      ': Successfully Read EIDF from file',
                      log_list=self.logs_training)
            self.model_data.idf = np.array(
                df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
        except Exception as ex_eidf:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': No EIDF from file available. Exception ' + str(ex_eidf)
            Log.critical(errmsg, log_list=self.logs_training)
            raise Exception(errmsg)

        # Standardize to at least 2-dimensional, easier when weighting x
        self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
            arr=self.model_data.idf, to_dim=2)

        #
        # Combines
        #
        self.model_data.load_model_from_partial_trainings_data(
            td_latest=self.training_data, log_training=self.logs_training)
        return self.logs_training
Example #21
    def train(
            self,
            X,
            Y
    ):
        # Defining the size of the embedding
        embed_size = 2

        # Defining the neural network
        inp = Input(shape=(X.shape[1],))
        Log.debug('Input shape: ' + str(X.shape))
        # Middle layer is the embedding vector we seek to extract
        # "linear" because this will serve as the word definition, to be input to other neural networks
        x = Dense(units=embed_size, activation='linear')(inp)
        # Standard softmax final layer
        x = Dense(units=Y.shape[1], activation='softmax')(x)
        model = Model(inputs=inp, outputs=x)
        Log.debug('Output shape: ' + str(Y.shape))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        model.summary()

        # Optimizing the network weights
        model.fit(
            x=X,
            y=Y,
            batch_size=256,
            epochs=100
        )

        # Obtaining the weights from the neural network.
        # These are the so called word embeddings

        # The input layer (embedding weights)
        weights = model.get_weights()[0]
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Weights extracted as embedding layer: ' + str(weights)
        )
        print(len(weights))

        # Creating a dictionary to store the embeddings in. The key is a unique word and
        # the value is the numeric vector
        embedding_dict = {}
        for word in self.word_index_dict.keys():
            embedding_dict.update({
                word: weights[self.word_index_dict.get(word)]
            })
        return embedding_dict
Example #22
    def convert_ascii_string_to_other_alphabet(
        ascii_char_string,
        # Default to CJK Unicode Block
        unicode_range=BLOCK_CHINESE,
        # If the characters come from a hexdigest from a hash, we can compress 4 times,
        # otherwise for a random ascii string, we can only compress 2 characters to 1 chinese.
        group_n_char=2):
        uni_len = unicode_range[1] - unicode_range[0] + 1

        r = len(ascii_char_string) % 4
        if r != 0:
            # Append 0's
            ascii_char_string = ascii_char_string + '0' * (4 - r)
        # raise Exception('Hash length ' + str(len(hash_hex_string))
        #                 + ' for "' + str(hash_hex_string) + '" not 0 modulo-4')

        hash_zh = ''

        len_block = int(len(ascii_char_string) / group_n_char)
        for i in range(0, len_block, 1):
            idx_start = group_n_char * i
            idx_end = idx_start + group_n_char
            s = ascii_char_string[idx_start:idx_end]

            # Convert to Chinese, Korean, etc
            if group_n_char == 2:
                ord_arr = np.array([ord(x) for x in s])
                val = ord_arr * np.array(
                    [2**(8 * (x - 1)) for x in range(len(ord_arr), 0, -1)])
                val = np.sum(val)
                Log.debug('Index start=' + str(idx_start) + ', end=' +
                          str(idx_end) + ', s=' + str(s) + ', ordinal=' +
                          str(ord_arr) + ', val=' + str(hex(val)))
                cjk_unicode = (val % uni_len) + unicode_range[0]
                hash_zh += chr(cjk_unicode)
            elif group_n_char == 4:
                Log.debug('Index start=' + str(idx_start) + ', end=' +
                          str(idx_end) + ', s=' + str(s))
                n = int('0x' + str(s), 16)
                cjk_unicode = (n % uni_len) + unicode_range[0]
                hash_zh += chr(cjk_unicode)
                Log.debugdebug('From ' + str(idx_start) + ': ' + str(s) +
                               ', n=' + str(n) + ', char=' +
                               str(chr(cjk_unicode)))

        return hash_zh
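A worked example of the group_n_char=2 packing: each pair of ASCII characters is read as a 16-bit big-endian value, reduced modulo the size of the target Unicode block and offset into it (the CJK block bounds below are assumptions, chosen to be consistent with the test vectors in Example #25):

    import numpy as np

    BLOCK_START, BLOCK_END = 0x4E00, 0x9FFF        # assumed BLOCK_CHINESE range
    uni_len = BLOCK_END - BLOCK_START + 1          # 20992 code points

    s = 'ab'                                       # one 2-character group
    ord_arr = np.array([ord(c) for c in s])        # [97, 98]
    val = int(np.sum(ord_arr * np.array([256, 1])))  # 97*256 + 98 = 24930
    print(chr((val % uni_len) + BLOCK_START))        # 24930 % 20992 + 0x4E00 -> '嵢'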
Example #23
 def __remove_stopwords(
         self,
         word_list
 ):
     if self.stopwords_list:
         word_list_remove = []
         for w in word_list:
             if w not in self.stopwords_list:
                 word_list_remove.append(w)
         Log.debug(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Lang "' + str(self.lang) + '", Word list "' + str(word_list)
             + '", removed stopwords to "' + str(word_list_remove) + '".'
         )
         return word_list_remove
     else:
         return word_list
Example #24
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        test_data = [
            # 0: words to compare, 1: expected dist using Damerau-Levenshtein, 2: expected distance using Levenshtein
            (('เงน', 'เงิน'), 1, 1),
            (('ถนอ', 'ถอน'), 1, 2),
            (('ธรรมะ', 'ธรา'), 3, 3),
            # For Lev, 3 edit distance by
            #   1. Deleting 'ธ' to get 'รรมะ'
            #   2. Inserting 'ธ' to get 'รธมะ'
            #   3. Replacing 'ะ' with 'ร' to get 'รธมร'
            (('ธรรมะ', 'รธมร'), 3, 3),
        ]
        test_algos = [
            EditDistance.EDIT_DIST_ALGO_DAMLEV, EditDistance.EDIT_DIST_ALGO_LEV
        ]

        for use_numpy in [True, False]:
            for i in range(len(test_data)):
                word1, word2 = test_data[i][0]
                expected_dist = [test_data[i][1], test_data[i][2]]

                for j_algo in range(len(test_algos)):
                    algo = test_algos[j_algo]

                    start = time.time()
                    retc = EditDistance(algo=algo).calculate(
                        word_1=word1,
                        word_2=word2,
                        use_np=use_numpy,
                    )
                    dist = retc.optimal_cost
                    end = time.time()
                    Log.debug('Calculated distance: ' + str(dist))
                    Log.debug("Search took " +
                              str(round(1000 * (end - start), 2)) + 'ms.')

                    res_final.update_bool(res_bool=ut.UnitTest.assert_true(
                        observed=dist,
                        expected=expected_dist[j_algo],
                        test_comment='numpy= ' + str(use_numpy) +
                        ', test word ' + str(i) + ' "' + str(word1) +
                        '" and "' + str(word2) + '"'))

        return res_final
Example #25
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        s = '니는 먹고 싶어'
        tests_set_1 = [[Hash.ALGO_SHA1, '蔮膫圈嫩慁覕邜蹋妡狿'],
                       [Hash.ALGO_SHA256, '葶杊閹翔綐僤徼戻髯鼚胦嘭藃诠灑浽'],
                       [Hash.ALGO_SHA512, '詐鏙仟墍例嵝烐檦蝡溲薑珇鸦東燢爻纷欜陲囚劚攠菜槑茹輀濯偑袁蓣质簨'],
                       [Hash.ALGO_SHA3_256, '厥驹踸鸨揱澯鑢擠鳰僸覑儽悃徵絨控'],
                       [
                           Hash.ALGO_SHA3_512,
                           '醜怅僒础衺菼惓隔鮚腋釔晞鏙屜咖龩檵因伖蘦惌灱騾凊纅弪鮾蕏解铦欪臓'
                       ]]
        for x in tests_set_1:
            algo = x[0]
            expected = x[1]
            # In Linux command line, echo -n "$s" | shasum -a 1 (or 256,512)
            Log.debug('Using algo "' + str(algo) + '":')
            hstr = Hash.hash(string=s, algo=algo)
            Log.debug('Hash: ' + str(hstr))
            observed = Hash.convert_ascii_string_to_other_alphabet(
                ascii_char_string=hstr,
                # unicode_range   = Hash.BLOCK_KOREAN_SYL,
                group_n_char=4)
            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=observed,
                                                 expected=expected,
                                                 test_comment='test string "' +
                                                 str(hstr) + '" got "' +
                                                 str(observed) + '"'))

        tests_set_2 = [['abc/ii{}.!&%[][\\+=', '嵢弯敩睽簡琥坝坜礽縰'],
                       ['8829amsf)(*&^%^*./', '蘸耹嵭潦眨砦娥娪簯縰']]
        for x in tests_set_2:
            ascii_string = x[0]
            expected = x[1]
            observed = Hash.convert_ascii_string_to_other_alphabet(
                ascii_char_string=ascii_string)
            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=observed,
                                                 expected=expected,
                                                 test_comment='test string "' +
                                                 str(ascii_string) +
                                                 '" got "' + str(observed) +
                                                 '"'))

        return res_final
Example #26
    def __group_case_endings_by_len(self, endings_list, part_of_speech):
        endings_by_len = {}
        maxlen = 0
        for s in endings_list:
            maxlen = max(maxlen, len(s))
        # Longest to shortest
        for i in range(maxlen, 0, -1):
            endings_by_len[i] = []
        # Put them in the groups
        for s in endings_list:
            endings_by_len[len(s)].append(s)

        Log.debug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': ' + str(part_of_speech) + ' case endings by length: ' +
            str(endings_by_len))

        return endings_by_len
Example #27
 def build_tree_roots(self):
     # Find root tree nodes
     self.tree_roots = {}
     for name in self.tree_nodes.keys():
         Log.debug(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Checking if ' + str(name) + ' is a tree root...'
         )
         node = self.tree_nodes[name]
         if not node.is_dead_node():
             if node.is_tree_root():
                 self.tree_roots[name] = node
                 self.tree_roots_depth[name] = self.calculate_tree_depth(node=node)
     Log.important(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
         + ': Found ' + str(len(self.tree_roots)) + ' tree roots'
     )
     return
Example #28
 def segment_ko_ja(
         self,
         text,
         return_array_of_split_words = False
 ):
     try:
         if self.lang in [lf.LangFeatures.LANG_JA]:
             words_postags = nagisa.tagging(text)
             txt_sym_tok = words_postags.words
             txt_sym_postags = words_postags.postags
             Log.debug(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': Japanese segmentation "' + str(txt_sym_tok) + '", word & POS tags: ' + str(words_postags)
             )
             if return_array_of_split_words:
                 return txt_sym_tok
             else:
                 return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
         elif self.lang in [lf.LangFeatures.LANG_KO]:
             self.warn_korean()
             words_postags = self.kkma.pos(
                 phrase = text
             )
             txt_sym_tok = [wp[0] for wp in words_postags]
             txt_sym_postags = [wp[1] for wp in words_postags]
             Log.debug(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': Korean segmentation "' + str(txt_sym_tok) + '", word & POS tags: ' + str(words_postags)
             )
             if return_array_of_split_words:
                 return txt_sym_tok
             else:
                 return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
         else:
             raise Exception(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': No external library supported for language "' + str(self.lang) + '"'
             )
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error segmenting lang "' + str(self.lang) + '", text "' + str(text) \
                  + '", exception: ' + str(ex)
         Log.error(errmsg)
         raise Exception(errmsg)
Example #29
 def is_higher_level(self, node, supposed_child_node):
     Log.debug(
         '***** check if "' + str(supposed_child_node.name) + '" is higher level than "'
         + str(node.name) + '", parents: ' + str(node.parent_names)
     )
     if supposed_child_node.name in node.parent_names:
         Log.warning(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Node "' + str(self.name) + '" cannot add "' + str(supposed_child_node.name)
             + '" as child. Node "' + str(supposed_child_node.name)
             + '" is already a higher level parent node to "' + str(self.name) + '"'
         )
         return True
     for par in node.parents:
         if self.is_higher_level(node=par, supposed_child_node=supposed_child_node):
             return True
     return False
Example #30
 def get_stats_lang_detect(
     self,
     sentences_list,
     langs_real,
     langs_detected,
 ):
     correct_count = 0
     total_count = len(langs_real)
     for i in range(total_count):
         lang_det = langs_detected[i]
         lang_real = langs_real[i]
         correct_result = lang_real == lang_det
         if not correct_result:
             Log.debug('Detected "' + str(lang_det) + '" for supposed "' +
                       str(lang_real) + '" sent "' +
                       str(sentences_list[i]) + '"')
         correct_count += 1 * (correct_result)
     correct_pct = round(100 * correct_count / total_count, 2)
     return correct_pct, correct_count, total_count