def deep_model(**kwargs):
    """
    Load deep learning language detection model.

    Parameters
    ----------
    **kwargs :
        Forwarded unchanged to ``check_file``.

    Returns
    -------
    result : malaya.model.tf.DEEP_LANG class

    Raises
    ------
    ValueError
        If the pickled vectorizer cannot be opened or unpickled.
    """
    import os

    deep_paths = PATH_LANG_DETECTION['deep']
    check_file(deep_paths, S3_PATH_LANG_DETECTION['deep'], **kwargs)

    try:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # cached file comes from a trusted source -- confirm.
        with open(deep_paths['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception as e:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit propagate; the original cause is kept via chaining.
        raise ValueError(
            "model corrupted due to some reasons, please run malaya.clear_cache('language-detection/deep') and try again"
        ) from e

    # Imported lazily, only once the vectorizer has loaded successfully.
    from malaya.text.bpe import load_yttm

    bpe, subword_mode = load_yttm(deep_paths['bpe'])

    # DEEP_LANG receives the directory that contains the model file,
    # not the model file path itself.
    return DEEP_LANG(
        os.path.dirname(deep_paths['model']),
        vector,
        lang_labels,
        bpe,
        subword_mode,
    )
def multinomial(path, s3_path, class_name, label, **kwargs):
    """
    Load a pickled multinomial model together with its vectorizer and BPE model.

    Parameters
    ----------
    path : dict
        Must contain a ``'multinomial'`` entry with ``'model'``, ``'vector'``
        and ``'bpe'`` file paths.
    s3_path : dict
        Must contain a ``'multinomial'`` entry; passed to ``check_file``.
    class_name : str
        Only used to build the cache name shown in the error message.
    label : list
        Class labels. More than two labels selects ``MULTICLASS_BAYES``,
        otherwise ``BINARY_BAYES``.
    **kwargs :
        Forwarded unchanged to ``check_file``.

    Returns
    -------
    result : MULTICLASS_BAYES or BINARY_BAYES instance

    Raises
    ------
    Exception
        If either pickle file cannot be opened or unpickled.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)

    try:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # cached files come from a trusted source -- confirm.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit propagate; the original cause is kept via chaining.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/multinomial') and try again"
        ) from e

    bpe, subword_mode = load_yttm(files['bpe'])

    # Imported lazily, only once the model files have loaded successfully.
    from malaya.stem import _classification_textcleaning_stemmer

    selected_class = MULTICLASS_BAYES if len(label) > 2 else BINARY_BAYES
    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=_classification_textcleaning_stemmer,
    )
def load(path, s3_path, model, encoder, model_class, quantized=False, **kwargs):
    """
    Load the graph for ``model`` and wrap it, with its encoder, in ``model_class``.

    Parameters
    ----------
    path : dict
        ``path[model]`` must provide ``'model'``, ``'quantized'`` and
        ``'vocab'`` entries as needed.
    s3_path : dict
        ``s3_path[model]`` is passed to ``check_file``.
    model : str
        Key selecting the entry in ``path`` and ``s3_path``.
    encoder : str or object
        ``'subword'`` builds a ``SubwordTextEncoder``, ``'yttm'`` builds a
        ``YTTMEncoder``; any other value is passed through untouched.
    model_class : callable
        Called with keywords ``X``, ``greedy``, ``beam``, ``sess``, ``encoder``.
    quantized : bool, optional (default=False)
        If True the graph is read from the ``'quantized'`` entry instead of
        the ``'model'`` entry.
    **kwargs :
        Forwarded to ``check_file``, ``load_graph`` and ``generate_session``.

    Returns
    -------
    result : instance of ``model_class``
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)

    graph_key = 'quantized' if quantized else 'model'
    graph = load_graph(path[model][graph_key], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    # Tensors are looked up before the session is created, matching the
    # original argument evaluation order.
    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_tensor = graph.get_tensor_by_name('import/greedy:0')
    beam_tensor = graph.get_tensor_by_name('import/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return model_class(
        X=placeholder,
        greedy=greedy_tensor,
        beam=beam_tensor,
        sess=session,
        encoder=encoder,
    )
def multinomial(path, s3_path, class_name, label, sigmoid=False, **kwargs):
    """
    Load a pickled multinomial model together with its vectorizer and BPE model.

    Parameters
    ----------
    path : dict
        Must contain a ``'multinomial'`` entry with ``'model'``, ``'vector'``
        and ``'bpe'`` file paths.
    s3_path : dict
        Must contain a ``'multinomial'`` entry; passed to ``check_file``.
    class_name : str
        Only used to build the cache name shown in the error message.
    label : list
        Class labels. When ``sigmoid`` is False, more than two labels selects
        ``MulticlassBayes``, otherwise ``BinaryBayes``.
    sigmoid : bool, optional (default=False)
        If True, ``MultilabelBayes`` is used regardless of ``len(label)``.
    **kwargs :
        Forwarded unchanged to ``check_file``.

    Returns
    -------
    result : MultilabelBayes, MulticlassBayes or BinaryBayes instance

    Raises
    ------
    Exception
        If either pickle file cannot be opened or unpickled.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)

    try:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # cached files come from a trusted source -- confirm.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit propagate; the original cause is kept via chaining.
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/multinomial')` and try again"
        ) from e

    bpe, subword_mode = load_yttm(files['bpe'])

    # The text-cleaning function is pre-bound to a freshly built naive stemmer.
    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_class = MultilabelBayes
    elif len(label) > 2:
        selected_class = MulticlassBayes
    else:
        selected_class = BinaryBayes

    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """
    Resolve the files for ``model``, load its graph and wrap it in ``model_class``.

    Parameters
    ----------
    module : str
        Passed to ``check_file`` and used to pick the vocab name from
        ``LM_VOCAB``.
    model : str
        Passed to ``check_file`` as ``file``.
    encoder : str or object
        ``'subword'`` builds a ``SubwordTextEncoder``, ``'yttm'`` builds a
        ``YTTMEncoder``; any other value is passed through untouched.
    model_class : callable
        Called with keywords ``input_nodes``, ``output_nodes``, ``sess``
        and ``encoder``.
    quantized : bool, optional (default=False)
        Forwarded to ``check_file``.
    **kwargs :
        Forwarded to ``check_file``, ``load_graph`` and ``generate_session``.

    Returns
    -------
    result : instance of ``model_class``
    """
    wanted_files = {'model': 'model.pb', 'vocab': LM_VOCAB[module]}
    path = check_file(
        file=model,
        module=module,
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], ['greedy', 'beam']
    )
    session = generate_session(graph=graph, **kwargs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        encoder=encoder,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load the LSTM + Bahdanau Attention stemmer; the model also lemmatizes.
    Original size 41.6MB, quantized size 10.6MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class
    """
    # Guard clause: bail out when check_tf_version() reports a value above 1.
    if check_tf_version() > 1:
        raise Exception(
            'Tensorflow 2.0 and above not able to use `deep_model` for stemmer, use Tensorflow 1.15 instead.'
        )

    wanted_files = {'model': 'model.pb', 'vocab': STEMMER_VOCAB}
    path = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['vocab'], id_mode=True)

    # The decoder outputs are supplied as explicit tensor names through
    # `extra`, so the regular outputs list stays empty.
    decoder_tensors = {
        'greedy': 'import/decode_1/greedy:0',
        'beam': 'import/decode_2/beam:0',
    }
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], [], extra=decoder_tensors
    )

    tokenizer = Tokenizer().tokenize
    session = generate_session(graph=graph, **kwargs)

    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        bpe=bpe,
        subword_mode=subword_mode,
        tokenizer=tokenizer,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.
    **kwargs :
        Forwarded to ``check_file``, ``load_graph`` and ``generate_session``.

    Returns
    -------
    result : malaya.model.tf.DeepLang class

    Raises
    ------
    ValueError
        If the pickled vectorizer cannot be opened or unpickled.
    """
    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['bpe'])

    try:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # cached file comes from a trusted source -- confirm.
        with open(path['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception as e:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit propagate; the original cause is kept via chaining.
        raise ValueError(
            "model corrupted due to some reasons, please run `malaya.clear_cache('language-detection/lang-32')` and try again"
        ) from e

    # Two placeholders (X and W), each addressed through its shape, values
    # and indices sub-tensors.
    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        type=subword_mode,
        label=lang_labels,
    )
def load(path, s3_path, model, encoder, model_class, **kwargs):
    """
    Load the graph for ``model`` and wrap it, with its encoder, in ``model_class``.

    Parameters
    ----------
    path : dict
        ``path[model]`` must provide ``'model'`` and ``'vocab'`` entries.
    s3_path : dict
        ``s3_path[model]`` is passed to ``check_file``.
    model : str
        Key selecting the entry in ``path`` and ``s3_path``.
    encoder : str or object
        ``'subword'`` builds a ``SubwordTextEncoder``, ``'yttm'`` builds a
        ``YTTMEncoder``; any other value is passed through untouched.
    model_class : callable
        Called positionally with (placeholder, greedy, beam, session, encoder).
    **kwargs :
        Forwarded to ``check_file``, ``load_graph`` and ``generate_session``.

    Returns
    -------
    result : instance of ``model_class``
    """
    check_file(path[model], s3_path[model], **kwargs)
    graph = load_graph(path[model]['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    # Tensors are looked up before the session is created, matching the
    # original argument evaluation order.
    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_tensor = graph.get_tensor_by_name('import/greedy:0')
    beam_tensor = graph.get_tensor_by_name('import/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return model_class(
        placeholder, greedy_tensor, beam_tensor, session, encoder
    )
def multinomial(**kwargs):
    """
    Load multinomial toxicity model.

    Parameters
    ----------
    **kwargs :
        Forwarded unchanged to ``check_file``.

    Returns
    -------
    result : malaya.model.ml.MULTILABEL_BAYES class

    Raises
    ------
    Exception
        If either pickle file cannot be opened or unpickled.
    """
    import pickle

    files = PATH_TOXIC['multinomial']
    check_file(files, S3_PATH_TOXIC['multinomial'], **kwargs)

    try:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # cached files come from a trusted source -- confirm.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit propagate; the original cause is kept via chaining.
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('toxic/multinomial') and try again"
        ) from e

    # Imported lazily, only once the model files have loaded successfully.
    from malaya.text.bpe import load_yttm
    from malaya.stem import _classification_textcleaning_stemmer, naive

    # The text-cleaning function is pre-bound to a freshly built naive stemmer.
    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    bpe, subword_mode = load_yttm(files['bpe'])

    # NOTE(review): `label` is not defined inside this function; presumably a
    # module-level list of toxicity labels -- confirm it exists at file scope.
    return MULTILABEL_BAYES(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load the LSTM + Bahdanau Attention stemmer; the model also lemmatizes.
    Original size 41.6MB, quantized size 10.6MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DEEP_STEMMER class
    """
    from malaya.preprocessing import _tokenizer

    check_file(
        PATH_STEM['deep'], S3_PATH_STEM['deep'], quantized=quantized, **kwargs
    )

    # The quantized graph is stored under its own key.
    graph_key = 'quantized' if quantized else 'model'
    graph = load_graph(PATH_STEM['deep'][graph_key], **kwargs)
    bpe, subword_mode = load_yttm(PATH_STEM['deep']['bpe'], id_mode=True)

    # Tensors are looked up before the session is created, matching the
    # original argument evaluation order.
    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_tensor = graph.get_tensor_by_name('import/decode_1/greedy:0')
    beam_tensor = graph.get_tensor_by_name('import/decode_2/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return DEEP_STEMMER(
        placeholder,
        greedy_tensor,
        beam_tensor,
        session,
        bpe,
        subword_mode,
        _tokenizer,
    )
def deep_model(**kwargs):
    """
    Load the LSTM + Bahdanau Attention stemming model.

    Parameters
    ----------
    **kwargs :
        Forwarded to ``check_file`` only; ``load_graph`` and
        ``generate_session`` are called without them.

    Returns
    -------
    DEEP_STEMMER: malaya.stem.DEEP_STEMMER class
    """
    from malaya.preprocessing import _tokenizer

    check_file(PATH_STEM['deep'], S3_PATH_STEM['deep'], **kwargs)

    graph = load_graph(PATH_STEM['deep']['model'])
    bpe, subword_mode = load_yttm(PATH_STEM['deep']['bpe'], id_mode=True)

    # Tensors are looked up before the session is created, matching the
    # original argument evaluation order.
    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_tensor = graph.get_tensor_by_name('import/decode_1/greedy:0')
    beam_tensor = graph.get_tensor_by_name('import/decode_2/beam:0')
    session = generate_session(graph=graph)

    return DEEP_STEMMER(
        placeholder,
        greedy_tensor,
        beam_tensor,
        session,
        bpe,
        subword_mode,
        _tokenizer,
    )