def correct_text(self, text: str, batch_size: int = 20):
    """
    Correct all the words within a text, returning the corrected text.

    Parameters
    ----------
    text: str
    batch_size: int, optional (default=20)
        batch size to insert into model.

    Returns
    -------
    result: str
    """
    if batch_size < 1:
        raise ValueError('batch_size must be bigger than 0')
    text = re.sub('[^a-zA-Z]+', ' ', text)
    string = re.sub(r'[ ]+', ' ', text).strip()
    strings = []
    for no, word in enumerate(string.split()):
        if not word[0].isupper():
            word = case_of(word)(
                self.correct(word.lower(), string, no, batch_size=batch_size)
            )
        strings.append(word)
    return ' '.join(strings)
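# Usage sketch (hedged): any speller object exposing `correct_text` works the
# same way; the loader name below is an assumption for illustration, not a
# confirmed API of a specific release.
import malaya  # assumed available in this project's environment

model = malaya.spell.probability()  # hypothetical loader name
print(model.correct_text('krajaan patut bagi pencen awal skt kpd warga emas'))
# -> a corrected string; capitalized words are passed through untouched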
def stem(self, string: str, beam_search: bool = False):
    """
    Stem a string; this also includes lemmatization.

    Parameters
    ----------
    string : str
    beam_search : bool, optional (default=False)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result: str
    """
    tokenized = self._tokenizer(string)
    result, batch, actual, mapping = [], [], [], {}
    for no, word in enumerate(tokenized):
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
        elif (
            re.findall(_money, word.lower())
            or re.findall(_date, word.lower())
            or re.findall(_expressions['time'], word.lower())
            or re.findall(_expressions['hashtag'], word.lower())
            or re.findall(_expressions['url'], word.lower())
            or re.findall(_expressions['user'], word.lower())
        ):
            result.append(word)
        else:
            # only real words reach the model; remember where they came from
            mapping[len(batch)] = no
            result.append('REPLACE-ME')
            actual.append(word)
            batch.append(word.lower())

    if len(batch):
        batch = self._bpe.bpe.encode(batch, output_type=self._bpe.mode)
        batch = [i + [1] for i in batch]
        batch = pad_sentence_batch(batch, 0)[0]
        output = 'beam' if beam_search else 'greedy'
        r = self._execute(
            inputs=[batch],
            input_labels=['Placeholder'],
            output_labels=[output],
        )
        output = r[output].tolist()
        for no, o in enumerate(output):
            predicted = list(dict.fromkeys(o))
            predicted = (
                self._bpe.bpe.decode(predicted)[0]
                .replace('<EOS>', '')
                .replace('<PAD>', '')
            )
            predicted = case_of(actual[no])(predicted)
            result[mapping[no]] = predicted

    return ' '.join(result)
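# Usage sketch: tokens matching money/date/time/hashtag/url/user patterns and
# bare punctuation bypass the model, so only real words reach the BPE batch.
# `stemmer` is assumed to be an instance of the class above.
stemmer.stem('Husein telah memakannya')                     # greedy decoder (default)
stemmer.stem('Husein telah memakannya', beam_search=True)   # beam search decoder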
def correct_word(self, word: str, string: str, batch_size: int = 20):
    """
    Spell-correct a word, preserving proper upper/lower/title case.
    """
    return case_of(word)(
        self.correct(word.lower(), string, batch_size=batch_size)
    )
def correct_match(self, match):
    """
    Spell-correct word in match, and preserve proper upper/lower/title case.
    """
    word = match.group()
    if word[0].isupper():
        return word
    return case_of(word)(self.correct(word.lower()))
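# `correct_match` is shaped as a `re.sub` replacement callback: the matched
# word is corrected in place while capitalized words are left alone.
# A sketch, assuming `speller` is an instance of this class.
import re

corrected = re.sub('[a-zA-Z]+', speller.correct_match, 'krajaan patut bagi pencen')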
def stem(self, string: str, beam_search: bool = True):
    """
    Stem a string.

    Parameters
    ----------
    string : str
    beam_search : bool, optional (default=True)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result: str
    """
    tokenized = self._tokenizer(string)
    result, batch, actual, mapping = [], [], [], {}
    for no, word in enumerate(tokenized):
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
        elif (
            re.findall(_money, word.lower())
            or re.findall(_date, word.lower())
            or re.findall(_expressions['time'], word.lower())
            or re.findall(_expressions['hashtag'], word.lower())
            or re.findall(_expressions['url'], word.lower())
            or re.findall(_expressions['user'], word.lower())
        ):
            result.append(word)
        else:
            mapping[len(batch)] = no
            result.append('REPLACE-ME')
            actual.append(word)
            batch.append(word.lower())

    if len(batch):
        batch = self._bpe.encode(batch, output_type=self._subword_mode)
        batch = [i + [1] for i in batch]
        batch = pad_sentence_batch(batch, 0)[0]
        output = self._beam if beam_search else self._greedy
        output = self._sess.run(output, feed_dict={self._X: batch})
        output = output.tolist()
        for no, o in enumerate(output):
            predicted = list(dict.fromkeys(o))
            predicted = self._bpe.decode(predicted)[0].replace('<EOS>', '')
            predicted = case_of(actual[no])(predicted)
            result[mapping[no]] = predicted

    return ' '.join(result)
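# Note: this variant runs the TensorFlow session directly (`self._sess.run`)
# and defaults to beam search, unlike the `_execute`-based variant above which
# defaults to greedy decoding. A sketch, assuming `stemmer` is an instance of
# this class; beam search trades speed for search quality.
stemmer.stem('Husein telah memakannya')                      # beam search (default)
stemmer.stem('Husein telah memakannya', beam_search=False)   # greedy, faster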
def correct_word(self, word: str):
    """
    Spell-correct a word, preserving proper upper/lower/title case.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: str
    """
    return case_of(word)(self.correct(word.lower()))
def correct_match(self, match):
    """
    Spell-correct the word in `match`, preserving proper upper/lower/title case.

    Parameters
    ----------
    match: re.Match
        regex match object whose group is the word to correct.

    Returns
    -------
    result: str
    """
    word = match.group()
    if word[0].isupper():
        return word
    return case_of(word)(self.correct(word.lower()))
def correct_word(self, word: str, string: str, batch_size: int = 20):
    """
    Spell-correct a word, preserving proper upper/lower/title case.

    Parameters
    ----------
    word: str
    string: str
        Entire string; `word` must be a word inside `string`.
    batch_size: int, optional (default=20)
        batch size to insert into model.

    Returns
    -------
    result: str
    """
    return case_of(word)(
        self.correct(word.lower(), string, batch_size=batch_size)
    )
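# Context-aware correction sketch: unlike the single-argument variant, this one
# needs the surrounding sentence so the model can score candidates in context.
# `speller` is assumed to be an instance of this class; output is illustrative.
sentence = 'krajaan patut bagi pencen awal'
speller.correct_word('krajaan', sentence)  # e.g. 'kerajaan', case preserved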
def normalize_elongated(self, word):
    # Pick the best non-elongated candidate, preserving the original casing.
    return case_of(word)(self.best_elong_candidate(word.lower()))
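# Sketch: collapse an elongated word while keeping its casing. Assumes
# `speller` implements `best_elong_candidate` as referenced above; the output
# shown is illustrative, not guaranteed.
speller.normalize_elongated('Gilaaa')  # e.g. 'Gila', title case preserved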
def correct_word(self, word: str):
    """
    Spell-correct a word, preserving proper upper/lower/title case.
    """
    return case_of(word)(self.correct(word.lower()))
def normalize(
    self,
    string: str,
    check_english: bool = True,
    normalize_text: bool = True,
    normalize_entity: bool = True,
    normalize_url: bool = False,
    normalize_email: bool = False,
    normalize_year: bool = True,
    normalize_telephone: bool = True,
    logging: bool = False,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check a word in english dictionary.
    normalize_text: bool, (default=True)
        if True, will try to replace shortforms with internal corpus.
    normalize_entity: bool, (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` patterns.
    normalize_url: bool, (default=False)
        if True, replace `://` with empty and `.` with `dot`.
        `https://huseinhouse.com` -> `https huseinhouse dot com`.
    normalize_email: bool, (default=False)
        if True, replace `@` with `di`, `.` with `dot`.
        `husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
    normalize_year: bool, (default=True)
        if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
        if True, `1970-an` -> `sembilan belas tujuh puluh an`.
        if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
    normalize_telephone: bool, (default=True)
        if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`.
    logging: bool, (default=False)
        if True, will log index and token queue using `logging.warn`.

    Returns
    -------
    string: normalized string
    """
    string = ' '.join(self._tokenizer(string))
    string = groupby(string)

    if normalize_text:
        string = replace_laugh(string)
        string = replace_mengeluh(string)
        string = _replace_compoud(string)

    if hasattr(self._speller, 'normalize_elongated'):
        string = [
            self._speller.normalize_elongated(word)
            if len(re.findall(r'(.)\1{1}', word))
            and not word[0].isupper()
            and not word.lower().startswith('ke-')
            and not _is_number_regex(word)
            else word
            for word in string.split()
        ]
        string = ' '.join(string)

    result, normalized = [], []
    tokenized = self._tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        word_lower = word.lower()
        word_upper = word.upper()
        first_c = word[0].isupper()

        if logging:
            s = f'index: {index}, word: {word}, queue: {result}'
            warn(s)

        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word_lower, word_lower))

        if word_lower in ignore_words:
            result.append(word)
            index += 1
            continue

        if first_c and not len(re.findall(_money, word_lower)):
            if word_lower in rules_normalizer and normalize_text:
                result.append(case_of(word)(rules_normalizer[word_lower]))
                index += 1
                continue
            elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(
                    _normalize_title(word) if normalize_text else word
                )
                index += 1
                continue

        if check_english:
            if word_lower in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue

        if word_lower in MALAY_WORDS and word_lower not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue

        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''

        if word_lower == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                '.*(V|X|I|L|D)', tokenized[index + 2]
            ):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        if word_lower == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue

        if (
            word_lower in ['tahun', 'thun']
            and index < (len(tokenized) - 1)
            and normalize_year
        ):
            if (
                _is_number_regex(tokenized[index + 1])
                and len(tokenized[index + 1]) == 4
            ):
                t = tokenized[index + 1]
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                if (
                    index < (len(tokenized) - 3)
                    and tokenized[index + 2] == '-'
                    and tokenized[index + 3].lower() == 'an'
                ):
                    end = 'an'
                    plus = 4
                else:
                    end = ''
                    plus = 2
                result.append(f'tahun {c}{end}')
                index += plus
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            if (
                tokenized[index + 1] == '-'
                and tokenized[index + 2].lower() == 'an'
                and normalize_year
                and len(word) == 4
            ):
                t = word
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                result.append(f'{c}an')
                index += 3
                continue

        if re.findall(_money, word_lower):
            money_, _ = money(word)
            result.append(money_)
            if index < (len(tokenized) - 1):
                if tokenized[index + 1].lower() in ('sen', 'cent'):
                    index += 2
                else:
                    index += 1
            else:
                index += 1
            continue

        if re.findall(_date, word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['time'], word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['hashtag'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['url'], word_lower):
            if normalize_url:
                word = word.replace('://', ' ').replace('.', ' dot ')
                word = put_spacing_num(word)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['email'], word_lower):
            if normalize_email:
                word = (
                    word.replace('://', ' ')
                    .replace('.', ' dot ')
                    .replace('@', ' di ')
                )
                word = put_spacing_num(word)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['phone'], word_lower):
            if normalize_telephone:
                splitted = word.split('-')
                left = put_spacing_num(splitted[0])
                right = put_spacing_num(splitted[1])
                word = f'{left}, {right}'
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['user'], word_lower):
            result.append(word)
            index += 1
            continue

        if (
            re.findall(_expressions['temperature'], word_lower)
            or re.findall(_expressions['distance'], word_lower)
            or re.findall(_expressions['volume'], word_lower)
            or re.findall(_expressions['duration'], word_lower)
            or re.findall(_expressions['weight'], word_lower)
        ):
            word = word.replace(' ', '')
            result.append(digit_unit(word))
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue

        word, end_result_string = _remove_postfix(word)
        word, repeat = check_repeat(word)

        if normalize_text:
            if word in sounds:
                selected = sounds[word]
            elif word in rules_normalizer:
                selected = rules_normalizer[word]
            elif self._speller:
                selected = self._speller.correct(
                    word, string=' '.join(tokenized), index=index
                )
            else:
                selected = word
        else:
            selected = word

        selected = '-'.join([selected] * repeat)
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    if normalize_entity:
        dates_, money_ = normalized_entity(normalized)
    else:
        dates_, money_ = {}, {}

    return {'normalize': result, 'date': dates_, 'money': money_}
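# Usage sketch for this normalizer. The constructor wiring is an assumption for
# illustration; any object carrying `_tokenizer` and `_speller` behaves the same.
normalizer = malaya.normalize.normalizer(speller)  # hypothetical wiring
out = normalizer.normalize('Husein berumur 25 tahun dan tinggi 189 cm')
print(out['normalize'], out['date'], out['money'])
# returns a dict: {'normalize': <string>, 'date': <dict>, 'money': <dict>}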
def normalize(
    self,
    string: str,
    normalize_text: bool = True,
    normalize_entity: bool = True,
    normalize_url: bool = False,
    normalize_email: bool = False,
    normalize_year: bool = True,
    normalize_telephone: bool = True,
    normalize_date: bool = True,
    normalize_time: bool = True,
    check_english_func=is_english,
    check_malay_func=is_malay,
    **kwargs,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    normalize_text: bool, (default=True)
        if True, will try to replace shortforms with internal corpus.
    normalize_entity: bool, (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` patterns.
    normalize_url: bool, (default=False)
        if True, replace `://` with empty and `.` with `dot`.
        `https://huseinhouse.com` -> `https huseinhouse dot com`.
    normalize_email: bool, (default=False)
        if True, replace `@` with `di`, `.` with `dot`.
        `husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
    normalize_year: bool, (default=True)
        if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
        if True, `1970-an` -> `sembilan belas tujuh puluh an`.
        if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
    normalize_telephone: bool, (default=True)
        if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`.
    normalize_date: bool, (default=True)
        if True, `01/12/2001` -> `satu disember dua ribu satu`.
        if True, `Jun 2017` -> `satu Jun dua ribu tujuh belas`.
        if True, `2017 Jun` -> `satu Jun dua ribu tujuh belas`.
        if False, `2017 Jun` -> `01/06/2017`.
        if False, `Jun 2017` -> `01/06/2017`.
    normalize_time: bool, (default=True)
        if True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
        if False, `pukul 2.30` -> `02:00:00`.
    check_english_func: Callable, (default=malaya.text.is_english)
        function to check a word in english dictionary.
    check_malay_func: Callable, (default=malaya.text.is_malay)
        function to check a word in malay dictionary.

    Returns
    -------
    string: {'normalize', 'date', 'money'}
    """
    tokenized = self._tokenizer(string)
    s = f'tokenized: {tokenized}'
    logger.debug(s)
    string = ' '.join(tokenized)
    string = groupby(string)

    if normalize_text:
        string = replace_laugh(string)
        string = replace_mengeluh(string)
        string = _replace_compound(string)

    if hasattr(self._speller, 'normalize_elongated'):
        string = [
            self._speller.normalize_elongated(word)
            if len(re.findall(r'(.)\1{1}', word))
            and not word[0].isupper()
            and not word.lower().startswith('ke-')
            and not _is_number_regex(word)
            else word
            for word in string.split()
        ]
        string = ' '.join(string)

    result, normalized = [], []
    tokenized = self._tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        word_lower = word.lower()
        word_upper = word.upper()
        first_c = word[0].isupper()
        s = f'index: {index}, word: {word}, queue: {result}'
        logger.debug(s)

        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            s = f'index: {index}, word: {word}, condition punct'
            logger.debug(s)
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word_lower, word_lower))

        if word_lower in ignore_words:
            s = f'index: {index}, word: {word}, condition ignore words'
            logger.debug(s)
            result.append(word)
            index += 1
            continue

        if (
            first_c
            and not len(re.findall(_expressions['money'], word_lower))
            and not len(re.findall(_expressions['date'], word_lower))
        ):
            s = f'index: {index}, word: {word}, condition not in money and date'
            logger.debug(s)
            if word_lower in rules_normalizer and normalize_text:
                result.append(case_of(word)(rules_normalizer[word_lower]))
                index += 1
                continue
            elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(
                    _normalize_title(word) if normalize_text else word
                )
                index += 1
                continue

        if check_english_func is not None:
            s = f'index: {index}, word: {word}, condition check english'
            logger.debug(s)
            if check_english_func(word_lower):
                result.append(word)
                index += 1
                continue

        if check_malay_func is not None:
            s = f'index: {index}, word: {word}, condition check malay'
            logger.debug(s)
            if check_malay_func(word_lower) and word_lower not in ['pada', 'ke']:
                result.append(word)
                index += 1
                continue

        if len(word) > 2 and normalize_text:
            s = f'index: {index}, word: {word}, condition len(word) > 2 and norm text'
            logger.debug(s)
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        if word[0] == 'x' and len(word) > 1 and normalize_text:
            s = f'index: {index}, word: {word}, condition word[0] == `x` and len(word) > 1 and norm text'
            logger.debug(s)
            result_string = 'tak '
            word = word[1:]
        else:
            s = f'index: {index}, word: {word}, condition else for (word[0] == `x` and len(word) > 1 and norm text)'
            logger.debug(s)
            result_string = ''

        if word_lower == 'ke' and index < (len(tokenized) - 2):
            s = f'index: {index}, word: {word}, condition ke'
            logger.debug(s)
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                '.*(V|X|I|L|D)', tokenized[index + 2]
            ):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            s = f'index: {index}, word: {word}, condition hingga'
            logger.debug(s)
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        if word_lower == 'pada' and index < (len(tokenized) - 3):
            s = f'index: {index}, word: {word}, condition pada hari bulan'
            logger.debug(s)
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue

        if (
            word_lower in ['tahun', 'thun']
            and index < (len(tokenized) - 1)
            and normalize_year
        ):
            s = f'index: {index}, word: {word}, condition tahun'
            logger.debug(s)
            if (
                _is_number_regex(tokenized[index + 1])
                and len(tokenized[index + 1]) == 4
            ):
                t = tokenized[index + 1]
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                if (
                    index < (len(tokenized) - 3)
                    and tokenized[index + 2] == '-'
                    and tokenized[index + 3].lower() == 'an'
                ):
                    end = 'an'
                    plus = 4
                else:
                    end = ''
                    plus = 2
                result.append(f'tahun {c}{end}')
                index += plus
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            s = f'index: {index}, word: {word}, condition fraction'
            logger.debug(s)
            if tokenized[index + 1] == '/' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            if (
                tokenized[index + 1] == '-'
                and tokenized[index + 2].lower() == 'an'
                and normalize_year
                and len(word) == 4
            ):
                t = word
                if t[1] != '0':
                    l = to_cardinal(int(t[:2]))
                    r = to_cardinal(int(t[2:]))
                    c = f'{l} {r}'
                else:
                    c = to_cardinal(int(t))
                result.append(f'{c}an')
                index += 3
                continue

        if re.findall(_expressions['money'], word_lower):
            s = f'index: {index}, word: {word}, condition money'
            logger.debug(s)
            money_, _ = money(word)
            result.append(money_)
            if index < (len(tokenized) - 1):
                if tokenized[index + 1].lower() in ('sen', 'cent'):
                    index += 2
                else:
                    index += 1
            else:
                index += 1
            continue

        if re.findall(_expressions['date'], word_lower):
            s = f'index: {index}, word: {word}, condition date'
            logger.debug(s)
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            try:
                s = f'index: {index}, word: {word}, parsing date'
                logger.debug(s)
                parsed = dateparser.parse(word)
                if parsed:
                    word = parsed.strftime('%d/%m/%Y')
                    if normalize_date:
                        day, month, year = word.split('/')
                        day = cardinal(day)
                        month = bulan[int(month)].title()
                        year = cardinal(year)
                        word = f'{day} {month} {year}'
            except Exception as e:
                logger.warning(str(e))
            result.append(word)
            index += 1
            continue

        if (
            re.findall(_expressions['time'], word_lower)
            or re.findall(_expressions['time_pukul'], word_lower)
        ):
            s = f'index: {index}, word: {word}, condition time'
            logger.debug(s)
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            try:
                s = f'index: {index}, word: {word}, parsing time'
                logger.debug(s)
                parsed = dateparser.parse(word.replace('.', ':'))
                if parsed:
                    word = parsed.strftime('%H:%M:%S')
                    if normalize_time:
                        hour, minute, second = word.split(':')
                        hour = cardinal(hour)
                        if int(minute) > 0:
                            minute = cardinal(minute)
                            minute = f'{minute} minit'
                        else:
                            minute = ''
                        if int(second) > 0:
                            second = cardinal(second)
                            second = f'{second} saat'
                        else:
                            second = ''
                        word = f'pukul {hour} {minute} {second}'
                        word = re.sub(r'[ ]+', ' ', word).strip()
            except Exception as e:
                logger.warning(str(e))
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['hashtag'], word_lower):
            s = f'index: {index}, word: {word}, condition hashtag'
            logger.debug(s)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['url'], word_lower):
            s = f'index: {index}, word: {word}, condition url'
            logger.debug(s)
            if normalize_url:
                word = word.replace('://', ' ').replace('.', ' dot ')
                word = put_spacing_num(word)
                word = (
                    word.replace('https', 'HTTPS')
                    .replace('http', 'HTTP')
                    .replace('www', 'WWW')
                )
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['email'], word_lower):
            s = f'index: {index}, word: {word}, condition email'
            logger.debug(s)
            if normalize_email:
                word = (
                    word.replace('://', ' ')
                    .replace('.', ' dot ')
                    .replace('@', ' di ')
                )
                word = put_spacing_num(word)
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['phone'], word_lower):
            s = f'index: {index}, word: {word}, condition phone'
            logger.debug(s)
            if normalize_telephone:
                splitted = word.split('-')
                if len(splitted) == 2:
                    left = put_spacing_num(splitted[0])
                    right = put_spacing_num(splitted[1])
                    word = f'{left}, {right}'
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['user'], word_lower):
            s = f'index: {index}, word: {word}, condition user'
            logger.debug(s)
            result.append(word)
            index += 1
            continue

        if (
            re.findall(_expressions['temperature'], word_lower)
            or re.findall(_expressions['distance'], word_lower)
            or re.findall(_expressions['volume'], word_lower)
            or re.findall(_expressions['duration'], word_lower)
            or re.findall(_expressions['weight'], word_lower)
        ):
            s = f'index: {index}, word: {word}, condition units'
            logger.debug(s)
            word = word.replace(' ', '')
            result.append(digit_unit(word))
            index += 1
            continue

        if re.findall(_expressions['percent'], word_lower):
            s = f'index: {index}, word: {word}, condition percent'
            logger.debug(s)
            word = word.replace('%', '')
            result.append(cardinal(word) + ' peratus')
            index += 1
            continue

        if re.findall(_expressions['ic'], word_lower):
            s = f'index: {index}, word: {word}, condition IC'
            logger.debug(s)
            result.append(digit(word))
            index += 1
            continue

        if (
            re.findall(_expressions['number'], word_lower)
            and word_lower[0] == '0'
            and '.' not in word_lower
        ):
            s = f'index: {index}, word: {word}, condition digit and word[0] == `0`'
            logger.debug(s)
            result.append(digit(word))
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            s = f'index: {index}, word: {word}, condition cardinal'
            logger.debug(s)
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            s = f'index: {index}, word: {word}, condition normalized ke'
            logger.debug(s)
            result.append(normalized_ke)
            index += 1
            continue

        word, end_result_string = _remove_postfix(word)
        if normalize_text:
            word, repeat = check_repeat(word)
        else:
            repeat = 1

        if normalize_text:
            s = f'index: {index}, word: {word}, condition normalize text'
            logger.debug(s)
            if word in sounds:
                selected = sounds[word]
            elif word in rules_normalizer:
                selected = rules_normalizer[word]
            elif self._speller:
                selected = self._speller.correct(
                    word, string=' '.join(tokenized), index=index
                )
            else:
                selected = word
        else:
            selected = word

        selected = '-'.join([selected] * repeat)
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    if normalize_entity:
        dates_, money_ = normalized_entity(normalized)
    else:
        dates_, money_ = {}, {}

    return {'normalize': result, 'date': dates_, 'money': money_}
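# Sketch: in this variant the dictionary checks are pluggable. Passing None for
# both disables them, forcing every token through the normalization rules; any
# Callable[[str], bool] can be swapped in. `normalizer` is assumed as above.
out = normalizer.normalize(
    'Dr sawan tak paham knapa',
    check_english_func=None,
    check_malay_func=None,
)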
def normalize(
    self,
    string: str,
    check_english: bool = True,
    normalize_entity: bool = True,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check a word in english dictionary.
    normalize_entity: bool, (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` patterns.

    Returns
    -------
    string: normalized string
    """
    string = groupby(string)
    string = replace_laugh(string)
    string = replace_mengeluh(string)
    string = _replace_compoud(string)

    if hasattr(self._speller, 'normalize_elongated'):
        string = [
            self._speller.normalize_elongated(word)
            if len(re.findall(r'(.)\1{1}', word))
            and not word[0].isupper()
            and not word.lower().startswith('ke-')
            else word
            for word in string.split()
        ]
        string = ' '.join(string)

    result, normalized = [], []
    tokenized = self._tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        word_lower = word.lower()
        word_upper = word.upper()
        first_c = word[0].isupper()

        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word_lower, word_lower))

        if word_lower in ignore_words:
            result.append(word)
            index += 1
            continue

        if first_c and not len(re.findall(_money, word_lower)):
            if word_lower in rules_normalizer:
                result.append(case_of(word)(rules_normalizer[word_lower]))
                index += 1
                continue
            elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(_normalize_title(word))
                index += 1
                continue

        if check_english:
            if word_lower in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue

        if word_lower in MALAY_WORDS and word_lower not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue

        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''

        if word.lower() == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                '.*(V|X|I|L|D)', tokenized[index + 2]
            ):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        if word.lower() == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue
            else:
                result.append('pada')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue

        if re.findall(_money, word_lower):
            money_, _ = money(word)
            result.append(money_)
            if index < (len(tokenized) - 1):
                if tokenized[index + 1].lower() in ('sen', 'cent'):
                    index += 2
                else:
                    index += 1
            else:
                index += 1
            continue

        if re.findall(_date, word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['time'], word_lower):
            word = word_lower
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['hashtag'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['url'], word_lower):
            result.append(word)
            index += 1
            continue

        if re.findall(_expressions['user'], word_lower):
            result.append(word)
            index += 1
            continue

        if (
            re.findall(_expressions['temperature'], word_lower)
            or re.findall(_expressions['distance'], word_lower)
            or re.findall(_expressions['volume'], word_lower)
            or re.findall(_expressions['duration'], word_lower)
            or re.findall(_expressions['weight'], word_lower)
        ):
            word = word.replace(' ', '')
            result.append(digit_unit(word))
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue

        word, end_result_string = _remove_postfix(word)
        word, repeat = check_repeat(word)

        if word in sounds:
            selected = sounds[word]
        elif word in rules_normalizer:
            selected = rules_normalizer[word]
        else:
            selected = self._speller.correct(
                word, string=' '.join(tokenized), index=index
            )

        selected = ' - '.join([selected] * repeat)
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    if normalize_entity:
        dates_, money_ = normalized_entity(normalized)
    else:
        dates_, money_ = {}, {}

    return {'normalize': result, 'date': dates_, 'money': money_}
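# Sketch: in this older variant the dictionary check is a boolean flag rather
# than a pluggable callable; `check_english=False` skips the English-dictionary
# lookup entirely. `normalizer` is assumed to be an instance of this class.
out = normalizer.normalize('Husein comel', check_english=False)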