def normalize(
    self,
    string: str,
    check_english: bool = True,
    normalize_text: bool = True,
    normalize_entity: bool = True,
    normalize_url: bool = False,
    normalize_email: bool = False,
    normalize_year: bool = True,
    normalize_telephone: bool = True,
    logging: bool = False,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check a word in an english dictionary.
    normalize_text: bool, (default=True)
        if True, will try to replace shortforms with internal corpus.
    normalize_entity: bool, (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` pattern strings.
    normalize_url: bool, (default=False)
        if True, replace `://` with empty string and `.` with `dot`.
        `https://huseinhouse.com` -> `https huseinhouse dot com`.
    normalize_email: bool, (default=False)
        if True, replace `@` with `di` and `.` with `dot`.
        `husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
    normalize_year: bool, (default=True)
        if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
        if True, `1970-an` -> `sembilan belas tujuh puluh an`.
        if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
    normalize_telephone: bool, (default=True)
        if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`.
    logging: bool, (default=False)
        if True, will log index and token queue using `logging.warn`.

    Returns
    -------
    result: Dict
        `{'normalize': normalized string, 'date': date entities, 'money': money entities}`.
    """
    string = ' '.join(self._tokenizer(string))
    string = groupby(string)

    if normalize_text:
        string = replace_laugh(string)
        string = replace_mengeluh(string)
        string = _replace_compoud(string)

    # collapse elongated words, skipping titlecased words, `ke-` ordinals
    # and numbers
    if hasattr(self._speller, 'normalize_elongated'):
        string = [
            self._speller.normalize_elongated(word)
            if len(re.findall(r'(.)\1{1}', word))
            and not word[0].isupper()
            and not word.lower().startswith('ke-')
            and not _is_number_regex(word)
            else word
            for word in string.split()
        ]
        string = ' '.join(string)

    result, normalized = [], []

    tokenized = self._tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        word_lower = word.lower()
        word_upper = word.upper()
        first_c = word[0].isupper()

        if logging:
            s = f'index: {index}, word: {word}, queue: {result}'
            warn(s)

        # pass punctuation through untouched
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word_lower, word_lower))

        if word_lower in ignore_words:
            result.append(word)
            index += 1
            continue

        # titlecased words: apply the rules corpus or title normalization,
        # except money patterns and keywords handled further below
        if first_c and not len(re.findall(_money, word_lower)):
            if word_lower in rules_normalizer and normalize_text:
                result.append(case_of(word)(rules_normalizer[word_lower]))
                index += 1
                continue
            elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(_normalize_title(word) if normalize_text else word)
                index += 1
                continue

        if check_english:
            if word_lower in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue

        if word_lower in MALAY_WORDS and word_lower not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue

        # consonant + `e` ending -> consonant + `a`
        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        # `x` prefix negation, eg `xsuka` -> `tak suka`
        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''

        # `ke-N` and `ke-<roman numeral>` ordinals
        if word_lower == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                    '.*(V|X|I|L|D)', tokenized[index + 2]):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        # numeric ranges, eg `2 - 3` -> `dua hingga tiga`
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        # `pada D/M` dates
        if word_lower == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue

        # `tahun YYYY`, optionally followed by `-an`
        if (
            word_lower in ['tahun', 'thun']
            and index < (len(tokenized) - 1)
            and normalize_year
        ):
            if (
                _is_number_regex(tokenized[index + 1])
                and len(tokenized[index + 1]) == 4
            ):
                t = tokenized[index + 1]
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                if (
                    index < (len(tokenized) - 3)
                    and tokenized[index + 2] == '-'
                    and tokenized[index + 3].lower() == 'an'
                ):
                    end = 'an'
                    plus = 4
                else:
                    end = ''
                    plus = 2

                result.append(f'tahun {c}{end}')
                index += plus
                continue

        # fractions `N/M` and decades `YYYY-an`
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue

            if (
                tokenized[index + 1] == '-'
                and tokenized[index + 2].lower() == 'an'
                and normalize_year
                and len(word) == 4
            ):
                t = word
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                result.append(f'{c}an')
                index += 3
                continue

        if re.findall(_money, word_lower):
            money_, _ = money(word)
            result.append(money_)
            if index < (len(tokenized) - 1):
                if tokenized[index + 1].lower() in ('sen', 'cent'):
                    index += 2
                else:
                    index += 1
            else:
                index += 1
            continue

        if re.findall(_date, word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['time'], word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['hashtag'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['url'], word_lower):
            if normalize_url:
                word = word.replace('://', ' ').replace('.', ' dot ')
                word = put_spacing_num(word)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['email'], word_lower):
            if normalize_email:
                word = (
                    word.replace('://', ' ')
                    .replace('.', ' dot ')
                    .replace('@', ' di ')
                )
                word = put_spacing_num(word)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['phone'], word_lower):
            if normalize_telephone:
                splitted = word.split('-')
                left = put_spacing_num(splitted[0])
                right = put_spacing_num(splitted[1])
                word = f'{left}, {right}'
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['user'], word_lower):
            result.append(word)
            index += 1
            continue

        # digit + unit patterns, eg temperature, distance, volume,
        # duration, weight
        if (
            re.findall(_expressions['temperature'], word_lower)
            or re.findall(_expressions['distance'], word_lower)
            or re.findall(_expressions['volume'], word_lower)
            or re.findall(_expressions['duration'], word_lower)
            or re.findall(_expressions['weight'], word_lower)
        ):
            word = word.replace(' ', '')
            result.append(digit_unit(word))
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue

        # fall back to sounds corpus, rules corpus, then spelling correction
        word, end_result_string = _remove_postfix(word)
        word, repeat = check_repeat(word)

        if normalize_text:
            if word in sounds:
                selected = sounds[word]
            elif word in rules_normalizer:
                selected = rules_normalizer[word]
            elif self._speller:
                selected = self._speller.correct(
                    word, string=' '.join(tokenized), index=index
                )
            else:
                selected = word
        else:
            selected = word

        selected = '-'.join([selected] * repeat)
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    if normalize_entity:
        dates_, money_ = normalized_entity(normalized)
    else:
        dates_, money_ = {}, {}

    return {'normalize': result, 'date': dates_, 'money': money_}
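
# A minimal usage sketch for the method above, assuming this class is the
# normalizer returned by `malaya.normalize.normalizer()`; the sample sentence
# and the comments on each key are illustrative, not taken from the source.
#
#     normalizer = malaya.normalize.normalizer()
#     out = normalizer.normalize('tahun 1970-an, harga barang tu RM 5.50')
#     out['normalize']  # spelled-out Malay text
#     out['date']       # date / datetime entities from `normalized_entity`
#     out['money']      # money entities from `normalized_entity`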
def normalize(
    self,
    string: str,
    check_english: bool = True,
    normalize_entity: bool = True,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check a word in an english dictionary.
    normalize_entity: bool, (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` pattern strings.

    Returns
    -------
    result: Dict
        `{'normalize': normalized string, 'date': date entities, 'money': money entities}`.
    """
    string = groupby(string)
    string = replace_laugh(string)
    string = replace_mengeluh(string)
    string = _replace_compoud(string)

    # collapse elongated words, skipping titlecased words and `ke-` ordinals
    if hasattr(self._speller, 'normalize_elongated'):
        string = [
            self._speller.normalize_elongated(word)
            if len(re.findall(r'(.)\1{1}', word))
            and not word[0].isupper()
            and not word.lower().startswith('ke-')
            else word
            for word in string.split()
        ]
        string = ' '.join(string)

    result, normalized = [], []

    tokenized = self._tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        word_lower = word.lower()
        word_upper = word.upper()
        first_c = word[0].isupper()

        # pass punctuation through untouched
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word_lower, word_lower))

        if word_lower in ignore_words:
            result.append(word)
            index += 1
            continue

        if first_c and not len(re.findall(_money, word_lower)):
            if word_lower in rules_normalizer:
                result.append(case_of(word)(rules_normalizer[word_lower]))
                index += 1
                continue
            elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(_normalize_title(word))
                index += 1
                continue

        if check_english:
            if word_lower in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue

        if word_lower in MALAY_WORDS and word_lower not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue

        # consonant + `e` ending -> consonant + `a`
        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        # `x` prefix negation, eg `xsuka` -> `tak suka`
        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''

        # `ke-N` and `ke-<roman numeral>` ordinals
        if word_lower == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                    '.*(V|X|I|L|D)', tokenized[index + 2]):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        # numeric ranges, eg `2 - 3` -> `dua hingga tiga`
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        # `pada D/M` dates
        if word_lower == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue
            else:
                result.append('pada')
                index += 1
                continue

        # fractions `N/M`
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(
                    tokenized[index + 2]):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue

        if re.findall(_money, word_lower):
            money_, _ = money(word)
            result.append(money_)
            if index < (len(tokenized) - 1):
                if tokenized[index + 1].lower() in ('sen', 'cent'):
                    index += 2
                else:
                    index += 1
            else:
                index += 1
            continue

        if re.findall(_date, word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['time'], word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['hashtag'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['url'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['user'], word_lower):
            result.append(word)
            index += 1
            continue

        # digit + unit patterns
        if (
            re.findall(_expressions['temperature'], word_lower)
            or re.findall(_expressions['distance'], word_lower)
            or re.findall(_expressions['volume'], word_lower)
            or re.findall(_expressions['duration'], word_lower)
            or re.findall(_expressions['weight'], word_lower)
        ):
            word = word.replace(' ', '')
            result.append(digit_unit(word))
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue

        # fall back to sounds corpus, rules corpus, then spelling correction
        word, end_result_string = _remove_postfix(word)
        word, repeat = check_repeat(word)

        if word in sounds:
            selected = sounds[word]
        elif word in rules_normalizer:
            selected = rules_normalizer[word]
        else:
            selected = self._speller.correct(
                word, string=' '.join(tokenized), index=index
            )

        selected = ' - '.join([selected] * repeat)
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    if normalize_entity:
        dates_, money_ = normalized_entity(normalized)
    else:
        dates_, money_ = {}, {}

    return {'normalize': result, 'date': dates_, 'money': money_}
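
# A minimal usage sketch for this simpler variant (hypothetical input; it
# assumes the same construction path as above). Unlike the variant above,
# it always rewrites shortforms and leaves URLs, emails and phone numbers
# untouched:
#
#     out = normalizer.normalize('ke-2, xsuka pada 1/5')
#     out['normalize']  # `ke-2` spelled as an ordinal, `x` prefix -> `tak`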