def segmenter(max_split_length: int = 20, validate: bool = True):
    """
    Load Segmenter class.

    Parameters
    ----------
    max_split_length: int, (default=20)
        max length of words in a sentence to segment.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Segmenter : malaya.preprocessing._Segmenter class
    """
    if validate:
        check_file(PATH_PREPROCESSING[1], S3_PATH_PREPROCESSING[1])
        check_file(PATH_PREPROCESSING[2], S3_PATH_PREPROCESSING[2])
    else:
        if not check_available(PATH_PREPROCESSING[1]) or not check_available(
            PATH_PREPROCESSING[2]
        ):
            raise Exception(
                'preprocessing is not available, please `validate = True`'
            )
    return _Segmenter(max_split_length=max_split_length)
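
# Usage sketch (not part of the original source). It assumes the returned
# _Segmenter exposes a `segment` method that splits joined words; verify the
# method name against malaya.preprocessing._Segmenter before relying on it.
#
#     model = segmenter(max_split_length=20)
#     model.segment('sayasukamakannasiayam')
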
def probability(sentence_piece: bool = False, validate: bool = True, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SpellCorrector class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None
    if sentence_piece:
        if validate:
            check_file(
                PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece']
            )
        else:
            if not check_available(PATH_NGRAM['sentencepiece']):
                raise Exception(
                    'sentence piece is not available, please `validate = True`'
                )
        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _SpellCorrector(corpus, tokenizer)
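
# Usage sketch (not part of the original source). It assumes the returned
# _SpellCorrector exposes a `correct` method that takes a single word; verify
# against malaya.spell._SpellCorrector before relying on it.
#
#     corrector = probability(sentence_piece=False)
#     corrector.correct('suke')
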
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    validate: bool = True,
):
    """
    Augment a formal word into social media form: purposely typo, purposely delete
    some vowels, purposely replace some subwords with slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    result: list
    """
    if not 0 < prob_delete_vowel < 1:
        raise Exception(
            'prob_delete_vowel must be bigger than 0 and less than 1'
        )
    word = simple_textcleaning(word)
    if not len(word):
        raise Exception('word is too short to augment shortform.')

    if validate:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'])
    else:
        if not check_available(PATH_NGRAM['sentencepiece']):
            raise Exception(
                'sentence piece is not available, please `validate = True`'
            )

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # consonant / vowel substitutions used to simulate typos
    replace_consonants = {
        'n': 'm',
        't': 'y',
        'r': 't',
        'g': 'f',
        'j': 'k',
        'k': 'l',
        'd': 'f',
        'b': 'n',
    }
    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    # common Malay social-media spelling shortcuts
    if len(word) > 1:
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants.items():
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            # delete vowels with probability `prob_delete_vowel`, as documented
            if c in vowels and random.random() < prob_delete_vowel:
                continue
            deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    # keep only candidates that still look like plausible words to sentencepiece
    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
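
# Usage sketch (not part of the original source). `shortform` returns a plain
# list, so nothing beyond the module-level `consonants`, `vowels`, `random` and
# sentencepiece helpers is assumed; output varies per call because vowel
# deletion is random.
#
#     shortform('makan', prob_delete_vowel=0.5)
#     # e.g. ['makan', 'mkn', 'makam', ...] (unordered, run-dependent)
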
def preprocessing(
    normalize: List[str] = [
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    annotate: List[str] = [
        'allcaps',
        'elongated',
        'repeated',
        'emphasis',
        'censored',
        'hashtag',
    ],
    lowercase: bool = True,
    fix_unidecode: bool = True,
    expand_hashtags: bool = True,
    expand_english_contractions: bool = True,
    translate_english_to_bm: bool = True,
    remove_postfix: bool = True,
    maxlen_segmenter: int = 20,
    validate: bool = True,
    speller=None,
):
    """
    Load Preprocessing class.

    Parameters
    ----------
    normalize: list
        tokens to normalize, check all supported normalizations at malaya.preprocessing.get_normalize().
    annotate: list
        annotate tokens <open></open>, only accept ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'].
    lowercase: bool
    fix_unidecode: bool
    expand_hashtags: bool
        expand hashtags using Viterbi algorithm, #mondayblues == monday blues.
    expand_english_contractions: bool
        expand english contractions.
    translate_english_to_bm: bool
        translate english words to bahasa malaysia words.
    remove_postfix: bool
        remove postfix from a word, faster way to get root word.
    speller: object
        spelling correction object, must have a `correct` method.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Preprocessing : malaya.preprocessing._Preprocessing class
    """
    if any([e not in _normalize for e in normalize]):
        raise ValueError(
            'normalize element not recognized, check supported normalizations at get_normalize()'
        )
    if any([e not in _annotate for e in annotate]):
        raise ValueError(
            "annotate only accept ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']"
        )
    if speller is not None:
        if not hasattr(speller, 'correct') and not hasattr(
            speller, 'normalize_elongated'
        ):
            raise ValueError(
                'speller must have a `correct` or `normalize_elongated` method'
            )

    if expand_hashtags:
        if validate:
            check_file(PATH_PREPROCESSING[1], S3_PATH_PREPROCESSING[1])
            check_file(PATH_PREPROCESSING[2], S3_PATH_PREPROCESSING[2])
        else:
            if not check_available(
                PATH_PREPROCESSING[1]
            ) or not check_available(PATH_PREPROCESSING[2]):
                raise Exception(
                    'preprocessing is not available, please `validate = True`'
                )

    if translate_english_to_bm:
        if validate:
            check_file(
                PATH_PREPROCESSING['english-malay'],
                S3_PATH_PREPROCESSING['english-malay'],
            )
        else:
            if not check_available(PATH_PREPROCESSING['english-malay']):
                raise Exception(
                    'translator english-malay is not available, please `validate = True`'
                )
        with open(PATH_PREPROCESSING['english-malay']['model']) as fopen:
            translator = json.load(fopen)
    else:
        translator = None

    return _Preprocessing(
        normalize=normalize,
        annotate=annotate,
        lowercase=lowercase,
        fix_unidecode=fix_unidecode,
        expand_hashtags=expand_hashtags,
        expand_english_contractions=expand_english_contractions,
        remove_postfix=remove_postfix,
        maxlen_segmenter=maxlen_segmenter,
        translator=translator,
        speller=speller,
    )
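
# Usage sketch (not part of the original source). It assumes the returned
# _Preprocessing object exposes a `process` method that tokenizes and
# normalizes a raw string; verify against malaya.preprocessing._Preprocessing
# before relying on it.
#
#     preprocessor = preprocessing(expand_hashtags=False, translate_english_to_bm=False)
#     preprocessor.process('CANTTTT wait for the new season #mondayblues')
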