def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    """
    Load a similarity Transformer (BERT or XLNET family) from a frozen graph.

    Parameters
    ----------
    model : str
        model architecture key, validated against `_transformer_availability`.
    bert_class : class
        model class instantiated for BERT-family architectures.
    xlnet_class : class
        model class instantiated for XLNET-family architectures.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph instead of the full one.
    siamese : bool, optional (default=False)
        if True, use the pooled summary node as the vectorizer output;
        otherwise use the per-model node from `_vectorizer_mapping`.

    Returns
    -------
    result : `bert_class` or `xlnet_class` instance
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )
    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_SIMILARITY[model][model_path], **kwargs)
    path = PATH_SIMILARITY
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # NOTE(review): 'sequnece' looks like a typo but must match the
            # node name baked into the exported graph — do not "fix" it here.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'
    if not siamese:
        selected_node = _vectorizer_mapping[model]
    return selected_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        vectorizer=g.get_tensor_by_name(selected_node),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def transformer(class_name, model='xlnet', quantized=False, **kwargs):
    """
    Load a frozen-graph tagging Transformer for the given module.

    Parameters
    ----------
    class_name : str
        module name, used to locate downloaded files and the tagging setting.
    model : str, optional (default='xlnet')
        architecture key, e.g. 'bert', 'tiny-bert', 'albert', 'tiny-albert',
        'xlnet', 'alxlnet'.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET class

    Raises
    ------
    Exception
        if the tagging settings JSON cannot be read (corrupted download).
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # Bug fix: the original message interpolated an undefined `size`
        # variable, so building the message itself raised NameError instead
        # of the intended Exception. Also narrowed the bare `except:` and
        # chained the original cause for debuggability.
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = TaggingBERT
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    """
    Load a similarity Transformer from a frozen graph.

    The vectorizer tensor depends on `siamese`: the pooled summary node when
    True, otherwise the per-model node from `_vectorizer_mapping`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )
    downloaded = check_file(
        file=model,
        module='similarity',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(downloaded['model'], **kwargs)
    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        if model in ('bert', 'tiny-bert'):
            tokenizer = sentencepiece_tokenizer_bert(
                downloaded['tokenizer'], downloaded['vocab']
            )
        else:
            tokenizer = AlbertTokenizer(
                vocab_file=downloaded['vocab'], spm_model_file=downloaded['tokenizer']
            )
        selected_class = bert_class
        selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(downloaded['tokenizer'])
        selected_class = xlnet_class
        # NOTE: 'sequnece' matches the node name baked into the exported graph.
        selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'
    if not siamese:
        # Non-siamese models vectorize from a per-architecture node instead.
        selected_node = _vectorizer_mapping[model]
    placeholders = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
    input_nodes, output_nodes = nodes_session(
        graph, placeholders, ['logits'], extra={'vectorizer': selected_node}
    )
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def transformer(path, s3_path, class_name, model='xlnet', quantized=False, **kwargs):
    """
    Load a tagging Transformer described by `path`/`s3_path` mappings.

    Parameters
    ----------
    path : dict
        local file mapping keyed by model name.
    s3_path : dict
        remote file mapping keyed by model name.
    class_name : str
        only used in the corruption error message.
    model : str, optional (default='xlnet')
        architecture key.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : TAGGING_BERT or TAGGING_XLNET class

    Raises
    ------
    Exception
        if the tagging settings JSON cannot be read (corrupted download).
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)
    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # Bug fix: the original message interpolated an undefined `size`
        # variable, so the handler raised NameError instead of this Exception.
        # Narrowed the bare `except:` and chained the cause.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}') and try again"
        ) from e
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from malaya.constituency.available_transformer()'
        )
    check_file(PATH_CONSTITUENCY[model], S3_PATH_CONSTITUENCY[model], **kwargs)
    g = load_graph(PATH_CONSTITUENCY[model]['model'], **kwargs)
    # Decoder dictionary stored as JSON alongside the frozen graph.
    with open(PATH_CONSTITUENCY[model]['dictionary']) as fopen:
        dictionary = json.load(fopen)
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        tokenizer = sentencepiece_tokenizer_bert(
            PATH_CONSTITUENCY[model]['tokenizer'],
            PATH_CONSTITUENCY[model]['vocab'],
        )
        mode = 'bert'
    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_CONSTITUENCY[model]['tokenizer'])
        mode = 'xlnet'
    from malaya.model.tf import CONSTITUENCY

    return CONSTITUENCY(
        input_ids=g.get_tensor_by_name('import/input_ids:0'),
        word_end_mask=g.get_tensor_by_name('import/word_end_mask:0'),
        charts=g.get_tensor_by_name('import/charts:0'),
        tags=g.get_tensor_by_name('import/tags:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
def transformer_squad(class_name, model = 'bert', quantized = False, **kwargs):
    """
    Load a SQUAD-style question-answering Transformer from a frozen graph.

    Parameters
    ----------
    class_name : str
        module name, used to locate the downloaded files.
    model : str, optional (default='bert')
        one of 'bert', 'tiny-bert', 'albert', 'tiny-albert', 'xlnet', 'alxlnet'.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : SQUAD class
    """
    path = check_file(
        file = model,
        module = class_name,
        keys = {
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized = quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']
    if model in ['bert', 'tiny-bert']:
        tokenizer = sentencepiece_tokenizer_bert(
            path['tokenizer'], path['vocab']
        )
    if model in ['albert', 'tiny-albert']:
        tokenizer = AlbertTokenizer(
            vocab_file = path['vocab'], spm_model_file = path['tokenizer']
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        # XLNET-family graphs take one extra placeholder input.
        inputs.append('Placeholder_4')
    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    # Any supported non-BERT model here is XLNET-family.
    mode = 'bert' if 'bert' in model else 'xlnet'
    return SQUAD(
        input_nodes = input_nodes,
        output_nodes = output_nodes,
        sess = generate_session(graph = g, **kwargs),
        tokenizer = tokenizer,
        class_name = class_name,
        mode = mode,
        length = LENGTHS[mode],
    )
def transformer(
    path, s3_path, class_name, model = 'xlnet', size = 'base', **kwargs
):
    """
    Load a tagging Transformer from nested `path[model][size]` mappings.

    Parameters
    ----------
    path : dict
        local file mapping, keyed by model then size.
    s3_path : dict
        remote file mapping, keyed by model then size.
    class_name : str
        only used in the corruption error message.
    model : str, optional (default='xlnet')
        'bert', 'albert' or 'xlnet'.
    size : str, optional (default='base')
        model size key.

    Returns
    -------
    result : TAGGING_BERT or TAGGING_XLNET class

    Raises
    ------
    Exception
        if the settings JSON or the graph cannot be loaded (corrupted download).
    """
    check_file(path[model][size], s3_path[model][size], **kwargs)
    try:
        with open(path[model][size]['setting']) as fopen:
            nodes = json.load(fopen)
        g = load_graph(path[model][size]['model'])
    except:
        # NOTE(review): bare except treats any failure here (bad JSON,
        # unreadable graph) as a corrupted download.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}/{size}') and try again"
        )
    if model in ['albert', 'bert']:
        # This tokenizer variant also returns the cls/sep marker tokens.
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab']
        )
        return TAGGING_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = None,
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            cls = cls,
            sep = sep,
            settings = nodes,
        )
    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            path[model][size]['tokenizer']
        )
        return TAGGING_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = nodes,
        )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer toxicity model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.bert.SIGMOID_BERT or malaya.model.xlnet.SIGMOID_XLNET class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.'
        )
    check_file(
        PATH_TOXIC[model], S3_PATH_TOXIC[model], quantized = quantized, **kwargs
    )
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_TOXIC[model][model_path], **kwargs)
    path = PATH_TOXIC
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(
                path[model]['tokenizer'], path[model]['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file = path[model]['vocab'],
                do_lower_case = False,
                spm_model_file = path[model]['tokenizer'],
            )
        # BERT-family graph here exposes no segment-id placeholder.
        return SIGMOID_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = g.get_tensor_by_name('import/Placeholder_1:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            logits_seq = g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer = g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess = generate_session(graph = g, **kwargs),
            tokenizer = tokenizer,
            label = label,
            attns = _extract_attention_weights_import(
                bert_num_layers[model], g
            ),
            class_name = 'toxic',
        )
    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return SIGMOID_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            logits_seq = g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer = g.get_tensor_by_name('import/transpose_3:0'),
            sess = generate_session(graph = g, **kwargs),
            tokenizer = tokenizer,
            label = label,
            attns = _extract_attention_weights_import(g),
            class_name = 'toxic',
        )
def transformer(model='base', **kwargs):
    """
    Load transformer encoder-decoder model to generate a paraphrase given a string.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'base'`` - transformer Base parameters.
        * ``'tiny'`` - transformer Tiny parameters.
        * ``'tiny-bert'`` - BERT-BERT Tiny parameters.
        * ``'bert'`` - BERT-BERT Base parameters.

    Returns
    -------
    result: malaya.model.tf.PARAPHRASE or malaya.model.bert.PARAPHRASE_BERT class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from malaya.paraphrase.available_transformer()'
        )
    if 'bert' in model:
        # BERT-based paraphrase graph exposes greedy decoding only.
        path = PATH_PARAPHRASE[model]
        s3_path = S3_PATH_PARAPHRASE[model]
        check_file(path, s3_path, **kwargs)
        g = load_graph(path['model'])
        if model in ['bert', 'tiny-bert']:
            from malaya.text.bpe import sentencepiece_tokenizer_bert

            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        from malaya.model.bert import PARAPHRASE_BERT

        return PARAPHRASE_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/greedy:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
        )
    else:
        # Pure encoder-decoder transformer exposes greedy and beam outputs.
        path = PATH_PARAPHRASE['transformer']
        s3_path = S3_PATH_PARAPHRASE['transformer']
        check_file(path[model], s3_path[model], **kwargs)
        g = load_graph(path[model]['model'])
        from malaya.text.t2t import text_encoder
        from malaya.model.tf import PARAPHRASE

        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
        return PARAPHRASE(
            g.get_tensor_by_name('import/Placeholder:0'),
            g.get_tensor_by_name('import/greedy:0'),
            g.get_tensor_by_name('import/beam:0'),
            generate_session(graph=g),
            encoder,
        )
def transformer(
    path,
    s3_path,
    class_name,
    label,
    model='bert',
    quantized=False,
    **kwargs,
):
    """
    Load a frozen-graph classification Transformer (binary or multiclass).

    Parameters
    ----------
    path : dict
        local file mapping keyed by model name.
    s3_path : dict
        remote file mapping keyed by model name.
    class_name : str
        module name; 'relevancy' forces the multiclass head.
    label : list
        output labels; more than two labels selects the multiclass head.
    model : str, optional (default='bert')
        architecture key.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : MULTICLASS_BERT / MULTICLASS_XLNET / BINARY_BERT / BINARY_XLNET class
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)
    # Head selection: >2 labels (or the relevancy module) means multiclass.
    if len(label) > 2 or class_name == 'relevancy':
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = MULTICLASS_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = MULTICLASS_XLNET
            selected_node = 'import/transpose_3:0'
    else:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = BINARY_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = BINARY_XLNET
            selected_node = 'import/transpose_3:0'
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(bert_num_layers[model], g),
            class_name=class_name,
        )
    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name=class_name,
        )
def transformer(model: str = 'bert', **kwargs):
    """
    Load Transformer similarity (siamese) model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'tiny-bert'`` - BERT architecture from google with smaller parameters.
        * ``'albert'`` - ALBERT architecture from google.
        * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'alxlnet'`` - XLNET architecture from google + Malaya.

    Returns
    -------
    result : SIAMESE_BERT or SIAMESE_XLNET class
    """
    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.similarity.available_transformer_model()'
        )
    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs)
    g = load_graph(PATH_SIMILARITY[model]['model'])
    path = PATH_SIMILARITY
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            # NOTE(review): `_extract_attention_weights_import` and
            # `bert_num_layers` are imported but never used in this function.
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        return SIAMESE_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return SIAMESE_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.dependency.available_transformer()`.'
        )
    path = check_file(
        file=model,
        module='dependency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        # BERT-family graph only needs the token-id placeholder.
        inputs = ['Placeholder']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = DependencyBERT
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = DependencyXLNET
    outputs = ['logits', 'heads_seq']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        # `label` is a module-level dependency tag mapping.
        settings=label,
    )
def transformer(model: str = 'bert', quantized: bool = False, **kwargs):
    """
    Load Transformer keyword similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.KeyphraseBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.KeyphraseXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.keyword_extraction.available_transformer()`.'
        )
    downloaded = check_file(
        file=model,
        module='keyword-extraction',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(downloaded['model'], **kwargs)
    if model in ('bert', 'tiny-bert'):
        tokenizer = sentencepiece_tokenizer_bert(
            downloaded['tokenizer'], downloaded['vocab']
        )
        placeholders = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
        ]
        graph_outputs = ['logits', 'bert/summary']
        selected_class = KeyphraseBERT
    elif model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(downloaded['tokenizer'])
        # XLNET-family graph takes two extra placeholder inputs.
        placeholders = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
            'Placeholder_4',
            'Placeholder_5',
        ]
        graph_outputs = ['logits', 'xlnet/summary']
        selected_class = KeyphraseXLNET
    input_nodes, output_nodes = nodes_session(graph, placeholders, graph_outputs)
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : Transformer class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.dependency.available_transformer()`.'
        )
    check_file(PATH_DEPENDENCY[model], S3_PATH_DEPENDENCY[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_DEPENDENCY[model][model_path], **kwargs)
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        from malaya.model.bert import DEPENDENCY_BERT

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_DEPENDENCY[model]['tokenizer'], PATH_DEPENDENCY[model]['vocab'])
        # BERT-family graph here takes only the token-id placeholder.
        return DEPENDENCY_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=None,
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=label,
            heads_seq=g.get_tensor_by_name('import/heads_seq:0'),
        )
    if model in ['xlnet', 'alxlnet']:
        from malaya.model.xlnet import DEPENDENCY_XLNET

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_DEPENDENCY[model]['tokenizer'])
        return DEPENDENCY_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            # `label` is a module-level dependency tag mapping.
            settings=label,
            heads_seq=g.get_tensor_by_name('import/heads_seq:0'),
        )
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'tiny-bert'`` - BERT architecture from google with smaller parameters.
        * ``'albert'`` - ALBERT architecture from google.
        * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'alxlnet'`` - XLNET architecture from google + Malaya.

    Returns
    -------
    result : Transformer class
    """
    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.dependency.available_transformer()'
        )
    check_file(PATH_DEPENDENCY[model], S3_PATH_DEPENDENCY[model], **kwargs)
    g = load_graph(PATH_DEPENDENCY[model]['model'])
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        from malaya.model.bert import DEPENDENCY_BERT

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_DEPENDENCY[model]['tokenizer'],
            PATH_DEPENDENCY[model]['vocab']
        )
        # BERT-family graph here takes only the token-id placeholder.
        return DEPENDENCY_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = None,
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = label,
            heads_seq = g.get_tensor_by_name('import/heads_seq:0'),
        )
    if model in ['xlnet', 'alxlnet']:
        from malaya.model.xlnet import DEPENDENCY_XLNET

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_DEPENDENCY[model]['tokenizer']
        )
        return DEPENDENCY_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            # `label` is a module-level dependency tag mapping.
            settings = label,
            heads_seq = g.get_tensor_by_name('import/heads_seq:0'),
        )
def transformer_ontonotes5(class_name, model='xlnet', quantized=False, **kwargs):
    """
    Load a frozen-graph OntoNotes5 tagging Transformer with a regex word
    tokenizer that keeps hyphenated compounds and common entities intact.

    Parameters
    ----------
    class_name : str
        module name, used to locate downloaded files and the tagging setting.
    model : str, optional (default='xlnet')
        one of 'bert', 'tiny-bert', 'albert', 'tiny-albert', 'xlnet', 'alxlnet'.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET class

    Raises
    ------
    Exception
        if the tagging settings JSON cannot be read (corrupted download).
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    # Hyphenated compounds, with optional spaces around the hyphen, are kept
    # as single tokens; then domain expressions; then single non-space chars.
    hypen = r'\w+(?:-\w+)+'
    hypen_left = r'\w+(?: -\w+)+'
    hypen_right = r'\w+(?:- \w+)+'
    hypen_both = r'\w+(?: - \w+)+'
    pipeline = [
        hypen,
        hypen_left,
        hypen_right,
        hypen_both,
        _expressions['percent'],
        _expressions['money'],
        _expressions['time'],
        _expressions['date'],
        _expressions['repeat_puncts'],
        _expressions['number'],
        _expressions['word'],
    ]
    pipeline.append(r'(?:\S)')
    compiled = re.compile(r'({})'.format('|'.join(pipeline)))

    def tok(string):
        # findall returns tuples because the alternation has nested groups;
        # the full match is always the first element.
        tokens = compiled.findall(string)
        return [t[0] for t in tokens]

    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # Bug fix: the original message interpolated an undefined `size`
        # variable, so the handler itself raised NameError instead of the
        # intended Exception. Also narrowed the bare `except:` and chained
        # the cause.
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        # Removed dead `inputs`/`vectorizer`/`Model` assignments: this branch
        # returns directly with explicit tensors. NOTE(review): the XLNET
        # branch below uses the node-session interface and does not pass
        # `tok` — looks half-refactored; confirm against TaggingXLNET.
        return TaggingBERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
            tok=tok,
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
def transformer(
    class_name,
    label,
    model='bert',
    sigmoid=False,
    quantized=False,
    **kwargs,
):
    """Load a classification transformer and wrap it in the matching model class.

    Selects the wrapper by head type (sigmoid vs softmax), label count, and
    architecture family, then wires graph nodes via ``nodes_session``.

    Parameters
    ----------
    class_name : str
        Module key used to locate packaged model files; also forwarded to the
        wrapper. ``'relevancy'`` forces the multiclass wrappers.
    label : list
        Output labels; ``len(label) > 2`` selects the multiclass wrappers.
    model : str, optional (default='bert')
        Architecture: BERT family, XLNET family, or BigBird family.
    sigmoid : bool, optional (default=False)
        If True, use the Sigmoid* (multilabel) wrappers regardless of label count.
    quantized : bool, optional (default=False)
        If True, download/load the 8-bit quantized graph.

    Returns
    -------
    One of SigmoidBERT / SigmoidXLNET / MulticlassBERT / MulticlassXLNET /
    MulticlassBigBird / BinaryBERT / BinaryXLNET.
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    # Pick the wrapper class first; note sigmoid has no BigBird variant here.
    if sigmoid:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = SigmoidBERT
        if model in ['xlnet', 'alxlnet']:
            selected_class = SigmoidXLNET
    else:
        if len(label) > 2 or class_name == 'relevancy':
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = MulticlassBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = MulticlassXLNET
            if model in ['bigbird', 'tiny-bigbird']:
                selected_class = MulticlassBigBird
        else:
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = BinaryBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = BinaryXLNET

    # Per-family setup: tokenizer, placeholder names, vectorizer node, and
    # attention extraction (imports are architecture-specific, hence local).
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = _extract_attention_weights_import(bert_num_layers[model], g)

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        attention = _extract_attention_weights_import(g)

    if model in ['bigbird', 'tiny-bigbird']:
        # BigBird takes a single input placeholder and exposes no attention.
        inputs = ['Placeholder']
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = None

    outputs = ['logits', 'logits_seq']
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra=vectorizer,
        attention={'attention': attention},
    )

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=label,
        class_name=class_name,
    )
def _transformer(model, bert_class, xlnet_class, **kwargs):
    """Load a legacy similarity transformer graph and wrap it.

    Parameters
    ----------
    model : str
        Architecture name; must be in ``_availability``.
    bert_class : class
        Wrapper used for the BERT family.
    xlnet_class : class
        Wrapper used for the XLNET family.

    Returns
    -------
    ``bert_class`` or ``xlnet_class`` instance with labels
    ``['not similar', 'similar']``.

    Raises
    ------
    ValueError
        If ``model`` is not supported.
    """
    model = model.lower()
    if model not in _availability:
        # ValueError (was bare Exception) — consistent with the newer loader
        # in this file; still caught by callers catching Exception.
        raise ValueError(
            'model not supported, please check supported models from malaya.similarity.available_transformer()'
        )
    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs)
    g = load_graph(PATH_SIMILARITY[model]['model'], **kwargs)
    path = PATH_SIMILARITY

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        # The original imported _extract_attention_weights_import and
        # bert_num_layers here but never used them; removed as dead imports.
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            # was `from albert import tokenization`, inconsistent with every
            # sibling loader in this file and not resolvable as packaged
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        return bert_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return xlnet_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class
    """
    # NOTE: docstring previously claimed default='bert'; the actual signature
    # default is 'xlnet', corrected above.
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )
    path = check_file(
        file=model,
        module='constituency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': CONSTITUENCY_SETTING,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    # Label/production dictionary shared by all architectures.
    with open(path['setting']) as fopen:
        dictionary = json.load(fopen)

    # `mode` tells the Constituency wrapper how to build model inputs.
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        mode = 'bert'
    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        mode = 'xlnet'

    inputs = ['input_ids', 'word_end_mask']
    outputs = ['charts', 'tags']
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': _vectorizer_mapping[model]})

    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
def transformer(model: str = 'bert', size: str = 'base', **kwargs):
    """
    Load a Transformer siamese similarity model.

    NOTE(review): the original docstring described this as a "sentiment" model
    returning BINARY_BERT, but the code loads PATH_SIMILARITY files and returns
    SIAMESE_BERT / SIAMESE_XLNET with labels ['not similar', 'similar'] —
    docstring corrected to match the code; confirm the intended module.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'albert'`` - ALBERT architecture from google.
    size : str, optional (default='base')
        Model size supported. Allowed values:

        * ``'base'`` - BASE size.
        * ``'small'`` - SMALL size.

    Returns
    -------
    SIAMESE_BERT or SIAMESE_XLNET class, labels ['not similar', 'similar'].
    """
    model = model.lower()
    size = size.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.sentiment.available_transformer_model()'
        )
    if size not in _availability[model]:
        raise Exception(
            'size not supported, please check supported models from malaya.sentiment.available_transformer_model()'
        )
    check_file(PATH_SIMILARITY[model][size], S3_PATH_SIMILARITY[model][size], **kwargs)
    # NOTE(review): load_graph/generate_session are called without **kwargs
    # here, unlike the newer loaders — presumably intentional for this legacy
    # API; verify before forwarding kwargs.
    g = load_graph(PATH_SIMILARITY[model][size]['model'])

    if model in ['albert', 'bert']:
        if model == 'bert':
            from ._transformer._bert import _extract_attention_weights_import
        if model == 'albert':
            from ._transformer._albert import _extract_attention_weights_import
        # this legacy tokenizer helper returns (tokenizer, cls token, sep token)
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            PATH_SIMILARITY[model][size]['tokenizer'],
            PATH_SIMILARITY[model][size]['vocab'],
        )
        return SIAMESE_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
            cls=cls,
            sep=sep,
        )
    if model in ['xlnet']:
        from ._transformer._xlnet import _extract_attention_weights_import

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_SIMILARITY[model][size]['tokenizer'])
        return SIAMESE_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
def transformer(path, s3_path, class_name, label, model='bert', size='base', **kwargs):
    """Load a legacy classification transformer (binary or multiclass).

    Parameters
    ----------
    path, s3_path : dict
        Nested path dictionaries keyed by ``[model][size]``.
    class_name : str
        Task name; ``'relevancy'`` forces the multiclass wrappers.
    label : list
        Output labels; ``len(label) > 2`` selects the multiclass wrappers.
    model : str, optional (default='bert')
        'bert', 'albert' or 'xlnet'.
    size : str, optional (default='base')
        Model size key.

    Returns
    -------
    BINARY_* or MULTICLASS_* wrapper instance.
    """
    check_file(path[model][size], s3_path[model][size], **kwargs)
    g = load_graph(path[model][size]['model'])

    if len(label) > 2 or class_name == 'relevancy':
        if model in ['albert', 'bert']:
            selected_class = MULTICLASS_BERT
        if model in ['xlnet']:
            selected_class = MULTICLASS_XLNET
    else:
        if model in ['albert', 'bert']:
            selected_class = BINARY_BERT
        if model in ['xlnet']:
            selected_class = BINARY_XLNET

    if model in ['albert', 'bert']:
        if model == 'bert':
            from .._transformer._bert import _extract_attention_weights_import
        if model == 'albert':
            from .._transformer._albert import _extract_attention_weights_import
        # FIX 1: this legacy helper returns (tokenizer, cls, sep) — see the
        # sibling siamese loader; the original bound only `tokenizer` and then
        # referenced undefined `cls`/`sep`, raising NameError.
        # FIX 2: tokenizer/vocab paths were indexed path[model][...] without
        # [size], inconsistent with check_file and the xlnet branch below.
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab']
        )
        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=None,
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,
            cls=cls,
            sep=sep,
            # NOTE(review): bert_num_layers is not imported in this function;
            # assumed to be a module-level name — confirm.
            attns=_extract_attention_weights_import(bert_num_layers[size], g),
            class_name=class_name,
        )
    if model in ['xlnet']:
        from .._transformer._xlnet import _extract_attention_weights_import

        tokenizer = sentencepiece_tokenizer_xlnet(
            path[model][size]['tokenizer'])
        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name=class_name,
        )