def _predict(
    self,
    left_strings: List[str],
    right_strings: List[str],
    aggregation: str = 'mean',
    similarity: str = 'cosine',
    soft: bool = True,
):
    if len(left_strings) != len(right_strings):
        raise ValueError(
            'length of left strings must be equal to length of right strings'
        )
    identical = left_strings == right_strings

    aggregation = aggregation.lower()
    if aggregation == 'mean':
        aggregation_function = np.mean
    elif aggregation == 'min':
        aggregation_function = np.min
    elif aggregation == 'max':
        aggregation_function = np.max
    elif aggregation == 'sum':
        aggregation_function = np.sum
    elif aggregation == 'sqrt':
        aggregation_function = np.sqrt
    else:
        raise ValueError(
            "aggregation only supports 'mean', 'min', 'max', 'sum' and 'sqrt'"
        )

    similarity = similarity.lower()
    if similarity == 'cosine':
        similarity_function = cosine_similarity
    elif similarity == 'euclidean':
        similarity_function = euclidean_distances
    elif similarity == 'manhattan':
        similarity_function = manhattan_distances
    else:
        raise ValueError(
            "similarity only supports 'cosine', 'euclidean', and 'manhattan'"
        )

    left_vectors, right_vectors = [], []
    for i in range(len(left_strings)):
        left_string = left_strings[i]
        right_string = right_strings[i]
        left_tokenized = _tokenizer(left_string)
        if not len(left_tokenized):
            raise ValueError('left string must not be empty')
        right_tokenized = _tokenizer(right_string)
        if not len(right_tokenized):
            raise ValueError('right string must not be empty')

        in_vector = []
        for token in left_tokenized:
            try:
                in_vector.append(self._vectorizer.get_vector_by_name(token))
            except Exception:
                if soft:
                    # replace an out-of-vocabulary token with the vocabulary
                    # word that has the highest Jaro-Winkler similarity
                    arr = np.array([
                        self._jarowinkler.similarity(token, k)
                        for k in self._vectorizer.words
                    ])
                    idx = (-arr).argsort()[0]
                    in_vector.append(
                        self._vectorizer.get_vector_by_name(
                            self._vectorizer.words[idx]
                        )
                    )
                # else: skip out-of-vocabulary tokens
        left_vectors.append(aggregation_function(in_vector, axis=0))

        if not identical:
            in_vector = []
            for token in right_tokenized:
                try:
                    in_vector.append(
                        self._vectorizer.get_vector_by_name(token)
                    )
                except Exception:
                    if soft:
                        arr = np.array([
                            self._jarowinkler.similarity(token, k)
                            for k in self._vectorizer.words
                        ])
                        idx = (-arr).argsort()[0]
                        in_vector.append(
                            self._vectorizer.get_vector_by_name(
                                self._vectorizer.words[idx]
                            )
                        )
                    # else: skip out-of-vocabulary tokens
            right_vectors.append(aggregation_function(in_vector, axis=0))

    if identical:
        similar = similarity_function(left_vectors, left_vectors)
    else:
        similar = similarity_function(left_vectors, right_vectors)

    if similarity == 'cosine':
        # map cosine similarity from [-1, 1] to [0, 1]
        return (similar + 1) / 2
    else:
        # map distances to a (0, 1] similarity score
        return 1 / (similar + 1)
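# Illustration only (not part of the module): the score mapping used at the end
# of `_predict`, shown on toy aggregated token vectors. Assumes numpy and
# scikit-learn are available, as the method above already relies on them.
#
#     import numpy as np
#     from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
#
#     left = np.mean([[1.0, 0.0], [0.8, 0.2]], axis=0, keepdims=True)
#     right = np.mean([[0.9, 0.1]], axis=0, keepdims=True)
#     (cosine_similarity(left, right) + 1) / 2      # cosine mapped to [0, 1]
#     1 / (euclidean_distances(left, right) + 1)    # distance mapped to (0, 1]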
def transformer_augmentation(
    string: str,
    model,
    threshold: float = 0.5,
    top_p: float = 0.8,
    top_k: int = 100,
    temperature: float = 0.8,
    top_n: int = 5,
    cleaning_function: Callable = None,
):
    """
    Augment a string using a transformer + nucleus sampling / top-k sampling.

    Parameters
    ----------
    string: str
    model: object
        transformer interface object. Right now only BERT is supported.
    threshold: float, optional (default=0.5)
        random selection threshold for a word.
    top_p: float, optional (default=0.8)
        cumulative sum of probabilities to sample a word.
        If bigger than 0, the model will use nucleus sampling, else top-k sampling.
    top_k: int, optional (default=100)
        k for top-k sampling.
    temperature: float, optional (default=0.8)
        logits * temperature.
    top_n: int, optional (default=5)
        number of augmented strings returned.
    cleaning_function: function, optional (default=None)
        function to clean text.

    Returns
    -------
    result: list
    """
    if not hasattr(model, 'samples'):
        raise ValueError('model must have a `samples` attribute')
    if not 0 < threshold < 1:
        raise ValueError('threshold must be bigger than 0 and less than 1')
    if not top_p > 0:
        raise ValueError('top_p must be bigger than 0')
    if not top_k > 0:
        raise ValueError('top_k must be bigger than 0')
    if not 0 < temperature < 1:
        raise ValueError('temperature must be bigger than 0 and less than 1')
    if not top_n > 0:
        raise ValueError('top_n must be bigger than 0')
    if top_n > top_k:
        raise ValueError('top_k must be bigger than top_n')

    original_string = string
    if cleaning_function:
        string = cleaning_function(string)
    string = _tokenizer(string)
    results = []
    for token_idx, token in enumerate(string):
        if token in string_function.punctuation:
            continue
        if token[0].isupper():
            continue
        if token.isdigit():
            continue
        if random.random() > threshold:
            results.append(token_idx)

    if not len(results):
        raise ValueError(
            'no words can be augmented, make sure the available words are not punctuation or proper nouns.'
        )

    maskeds, indices, input_masks = [], [], []
    for index in results:
        new = string[:]
        new[index] = '[MASK]'
        mask, ind = to_ids(new, model._tokenizer)
        maskeds.append(mask)
        indices.append(ind)
        input_masks.append([1] * len(mask))

    masked_padded = pad_sequences(maskeds, padding='post')
    input_masks = pad_sequences(input_masks, padding='post')
    batch_indices = np.array([np.arange(len(indices)), indices]).T
    samples = model._sess.run(
        model.samples,
        feed_dict={
            model.X: masked_padded,
            model.MASK: input_masks,
            model.top_p: top_p,
            model.top_k: top_k,
            model.temperature: temperature,
            model.indices: batch_indices,
            model.k: top_n,
        },
    )
    outputs = []
    for i in range(samples.shape[1]):
        sample_i = samples[:, i]
        samples_tokens = model._tokenizer.convert_ids_to_tokens(
            sample_i.tolist()
        )
        new_splitted = ['▁' + w if len(w) > 1 else w for w in string]
        for no, index in enumerate(results):
            new_splitted[index] = samples_tokens[no]
        new = ''.join(model._tokenizer.sp_model.DecodePieces(new_splitted))
        outputs.append(new)
    return outputs
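# Illustration only (not part of the module): a NumPy sketch of the nucleus
# (top-p) sampling that `model.samples` performs inside the graph, so the
# `top_p` / `temperature` feeds above have a concrete reading. The function
# name `nucleus_sample` is hypothetical; only numpy is assumed.
#
#     import numpy as np
#
#     def nucleus_sample(logits, top_p=0.8, temperature=0.8):
#         logits = np.asarray(logits, dtype=float) * temperature  # scaled as documented above
#         probs = np.exp(logits - logits.max())
#         probs /= probs.sum()
#         order = np.argsort(-probs)                 # token ids, most probable first
#         cumulative = np.cumsum(probs[order])
#         keep = order[: max(1, np.searchsorted(cumulative, top_p) + 1)]
#         keep_probs = probs[keep] / probs[keep].sum()
#         return np.random.choice(keep, p=keep_probs)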
def normalize(self, string: str, check_english: bool = True):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check whether a word exists in the English dictionary.

    Returns
    -------
    result: dict with keys 'normalize', 'date' and 'money'
    """
    result, normalized = [], []

    tokenized = _tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue

        normalized.append(rules_normalizer.get(word.lower(), word.lower()))

        if word.lower() in ignore_words:
            result.append(word)
            index += 1
            continue

        if word[0].isupper():
            if word.upper() not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(_normalize_title(word))
                index += 1
                continue

        if check_english:
            if word.lower() in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue

        if word.lower() in MALAY_WORDS and word.lower() not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue

        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'

        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''

        if word.lower() == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                '.*(V|X|I|L|D)', tokenized[index + 2]
            ):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue

        if word.lower() == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue
            else:
                result.append('pada')
                index += 1
                continue

        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(
                tokenized[index + 2]
            ):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue

        if re.findall(_money, word.lower()):
            money_, _ = money(word)
            result.append(money_)
            index += 1
            continue

        if re.findall(_date, word.lower()):
            word = word.lower()
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue

        if re.findall(_expressions['time'], word.lower()):
            word = word.lower()
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue

        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue

        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue

        word, end_result_string = _remove_postfix(word)
        if word in sounds:
            result.append(result_string + sounds[word] + end_result_string)
            index += 1
            continue

        if word in rules_normalizer:
            result.append(
                result_string + rules_normalizer[word] + end_result_string
            )
            index += 1
            continue

        selected = self._speller.correct(
            word, string=' '.join(tokenized), index=index
        )
        result.append(result_string + selected + end_result_string)
        index += 1

    result = ' '.join(result)
    normalized = ' '.join(normalized)

    money_ = re.findall(_money, normalized)
    money_ = [(s, money(s)[1]) for s in money_]
    dates_ = re.findall(_date, normalized)

    past_date_string_ = re.findall(_past_date_string, normalized)
    now_date_string_ = re.findall(_now_date_string, normalized)
    future_date_string_ = re.findall(_future_date_string, normalized)
    yesterday_date_string_ = re.findall(
        _yesterday_tomorrow_date_string, normalized
    )
    depan_date_string_ = re.findall(_depan_date_string, normalized)
    today_time_ = re.findall(_today_time, normalized)
    time_ = re.findall(_expressions['time'], normalized)

    left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetime, normalized)
    ]
    right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetime, normalized)
    ]
    today_left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetodaytime, normalized)
    ]
    today_right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetodaytime, normalized)
    ]
    left_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_left_yesterdaydatetime, normalized)
    ]
    right_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_right_yesterdaydatetime, normalized)
    ]
    left_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_left_yesterdaydatetodaytime, normalized)
    ]
    right_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_right_yesterdaydatetodaytime, normalized)
    ]

    dates_ = (
        dates_
        + past_date_string_
        + now_date_string_
        + future_date_string_
        + yesterday_date_string_
        + depan_date_string_
        + time_
        + today_time_
        + left_datetime_
        + right_datetime_
        + today_left_datetime_
        + today_right_datetime_
        + left_yesterdaydatetime_
        + right_yesterdaydatetime_
        + left_yesterdaydatetodaytime_
        + right_yesterdaydatetodaytime_
    )

    dates_ = [multireplace(s, date_replace) for s in dates_]
    dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
    dates_ = cluster_words(dates_)
    dates_ = {s: dateparser.parse(s) for s in dates_}
    money_ = {s[0]: s[1] for s in money_}
    return {'normalize': result, 'date': dates_, 'money': money_}
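# Illustration only (not part of the module): the shape of the value returned
# by `normalize`. The input sentence and the `normalizer` object below are
# hypothetical; the keys come from the return statement above.
#
#     normalizer.normalize('tak suka makan ayam ke-2')
#     # -> {'normalize': '...', 'date': {matched str: datetime or None, ...},
#     #     'money': {matched str: parsed value, ...}}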
def wordvector_augmentation(
    string: str,
    wordvector,
    threshold: float = 0.5,
    top_n: int = 5,
    soft: bool = False,
    cleaning_function: Callable = None,
):
    """
    Augment a string using a word vector model.

    Parameters
    ----------
    string: str
    wordvector: object
        wordvector interface object.
    threshold: float, optional (default=0.5)
        random selection threshold for a word.
    soft: bool, optional (default=False)
        if True, a word not in the dictionary will be replaced with the word
        that has the nearest Jaro-Winkler ratio.
        if False, it will throw an exception if a word is not in the dictionary.
    top_n: int, optional (default=5)
        number of nearest neighbors returned.
    cleaning_function: function, optional (default=None)
        function to clean text.

    Returns
    -------
    result: list
    """
    if not hasattr(wordvector, 'batch_n_closest'):
        raise ValueError('wordvector must have a `batch_n_closest` method')
    if not hasattr(wordvector, '_dictionary'):
        raise ValueError('wordvector must have a `_dictionary` attribute')

    if cleaning_function:
        string = cleaning_function(string)
    string = _tokenizer(string)
    original_string = string[:]
    selected = []
    for no, w in enumerate(string):
        if w in string_function.punctuation:
            continue
        if w[0].isupper():
            continue
        if random.random() > threshold:
            selected.append((no, w))

    if not len(selected):
        raise ValueError(
            'no words can be augmented, make sure the available words are not punctuation or proper nouns.'
        )

    indices, words = [i[0] for i in selected], [i[1] for i in selected]
    batch_parameters = list(
        inspect.signature(wordvector.batch_n_closest).parameters.keys()
    )
    if 'soft' in batch_parameters:
        results = wordvector.batch_n_closest(
            words, num_closest=top_n, soft=soft
        )
    else:
        results = wordvector.batch_n_closest(words, num_closest=top_n)

    augmented = []
    for i in range(top_n):
        string_ = string[:]
        for no in range(len(results)):
            string_[indices[no]] = results[no][i]
        augmented.append(
            _make_upper(' '.join(string_), ' '.join(original_string))
        )
    return augmented
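# Illustration only (not part of the module): how the candidate sentences are
# assembled once `batch_n_closest` returns `top_n` neighbours per selected
# word. The tokens and the neighbour table below are made up.
#
#     tokens = ['saya', 'suka', 'makan', 'ayam']
#     indices = [1, 3]                                  # positions picked above threshold
#     results = [['gemar', 'sayang', 'cinta'],          # neighbours of 'suka'
#                ['itik', 'daging', 'ikan']]            # neighbours of 'ayam'
#     augmented = []
#     for i in range(3):                                # top_n = 3
#         candidate = tokens[:]
#         for no, idx in enumerate(indices):
#             candidate[idx] = results[no][i]
#         augmented.append(' '.join(candidate))
#     # augmented -> ['saya gemar makan itik', 'saya sayang makan daging',
#     #               'saya cinta makan ikan']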