Example #1
0
def _transformer(model,
                 bert_class,
                 xlnet_class,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    """Load a similarity Transformer graph and wrap it in the right class.

    Parameters
    ----------
    model : str
        Architecture key, validated against `_transformer_availability`.
    bert_class : class
        Wrapper class instantiated for BERT/ALBERT-family models.
    xlnet_class : class
        Wrapper class instantiated for XLNET-family models.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.
    siamese : bool, optional (default=False)
        If True, use the siamese pooled/summary node as the vectorizer.

    Returns
    -------
    result : instance of `bert_class` or `xlnet_class`

    Raises
    ------
    ValueError
        If `model` is not a supported architecture.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    check_file(PATH_SIMILARITY[model],
               S3_PATH_SIMILARITY[model],
               quantized=quantized,
               **kwargs)
    # Quantized graphs live under a separate key in the path mapping.
    model_path = 'quantized' if quantized else 'model'
    graph = load_graph(PATH_SIMILARITY[model][model_path], **kwargs)

    paths = PATH_SIMILARITY

    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        if model in ('bert', 'tiny-bert'):
            tokenizer = sentencepiece_tokenizer_bert(
                paths[model]['tokenizer'], paths[model]['vocab'])
        else:
            # albert / tiny-albert use the upstream ALBERT tokenizer.
            tokenizer = tokenization.FullTokenizer(
                vocab_file=paths[model]['vocab'],
                do_lower_case=False,
                spm_model_file=paths[model]['tokenizer'],
            )

        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(paths[model]['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # NOTE(review): 'sequnece' typo matches the node name baked into
            # the exported graph — do not "fix" without re-exporting.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    return selected_class(
        X=graph.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=graph.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=graph.get_tensor_by_name('import/Placeholder_2:0'),
        logits=graph.get_tensor_by_name('import/logits:0'),
        vectorizer=graph.get_tensor_by_name(selected_node),
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #2
0
def transformer(class_name, model='xlnet', quantized=False, **kwargs):
    """Load a Transformer tagging model for the given module.

    Parameters
    ----------
    class_name : str
        Module name, used for file lookup and the tagging-settings key.
    model : str, optional (default='xlnet')
        Architecture key; BERT/ALBERT and XLNET families are supported.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET instance

    Raises
    ------
    Exception
        If the downloaded setting file cannot be read or parsed.
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # BUG FIX: the original used a bare `except:` and interpolated an
        # undefined `size` variable into the f-string, so any failure raised
        # NameError instead of the intended message. Chain the real cause.
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])

        if model in ['albert', 'tiny-albert']:
            # ALBERT family uses the upstream FullTokenizer.
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )

        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = TaggingBERT

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        # XLNET graphs expose an extra placeholder.
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET

    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g,
                                              inputs,
                                              outputs,
                                              extra=vectorizer)

    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
Example #3
0
def _transformer(model,
                 bert_class,
                 xlnet_class,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    """Load a similarity Transformer and wrap it via `nodes_session`.

    Parameters
    ----------
    model : str
        Architecture key, validated against `_transformer_availability`.
    bert_class : class
        Class used to wrap BERT/ALBERT-family graphs.
    xlnet_class : class
        Class used to wrap XLNET-family graphs.
    quantized : bool, optional (default=False)
        If True, download/load the 8-bit quantized graph.
    siamese : bool, optional (default=False)
        If True, use the siamese summary node as the vectorizer output.

    Returns
    -------
    result : instance of `bert_class` or `xlnet_class`

    Raises
    ------
    ValueError
        If `model` is not a supported architecture.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='similarity',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        if model in ('bert', 'tiny-bert'):
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])
        else:
            tokenizer = AlbertTokenizer(vocab_file=path['vocab'],
                                        spm_model_file=path['tokenizer'])
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # NOTE(review): 'sequnece' typo matches the exported graph node.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    placeholder_names = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
    output_names = ['logits']
    input_nodes, output_nodes = nodes_session(
        graph,
        placeholder_names,
        output_names,
        extra={'vectorizer': selected_node},
    )

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #4
0
def transformer(path,
                s3_path,
                class_name,
                model='xlnet',
                quantized=False,
                **kwargs):
    """Load a tagging Transformer from explicit local/S3 path mappings.

    Parameters
    ----------
    path : dict
        Local path mapping, indexed as ``path[model]``.
    s3_path : dict
        Remote path mapping with the same structure.
    class_name : str
        Module name, used only in the corruption error message.
    model : str, optional (default='xlnet')
        Architecture key; BERT/ALBERT and XLNET families are supported.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.

    Returns
    -------
    result : TAGGING_BERT or TAGGING_XLNET instance

    Raises
    ------
    Exception
        If the downloaded setting file cannot be read or parsed.
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    # Quantized graphs live under a separate key in the path mapping.
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)

    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # BUG FIX: the original bare `except:` interpolated an undefined
        # `size` variable, so any failure surfaced as NameError instead of
        # the intended message. Chain the real cause for debuggability.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}') and try again"
        ) from e

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            # ALBERT family uses the upstream FullTokenizer.
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
Example #5
0
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from malaya.constituency.available_transformer()'
        )

    paths = PATH_CONSTITUENCY[model]
    check_file(paths, S3_PATH_CONSTITUENCY[model], **kwargs)
    graph = load_graph(paths['model'], **kwargs)

    # The label dictionary ships alongside the frozen graph.
    with open(paths['dictionary']) as fopen:
        dictionary = json.load(fopen)

    if model in ('bert', 'tiny-bert', 'albert', 'tiny-albert'):
        tokenizer = sentencepiece_tokenizer_bert(
            paths['tokenizer'],
            paths['vocab'],
        )
        mode = 'bert'

    if model == 'xlnet':
        tokenizer = sentencepiece_tokenizer_xlnet(paths['tokenizer'])
        mode = 'xlnet'

    from malaya.model.tf import CONSTITUENCY

    return CONSTITUENCY(
        input_ids=graph.get_tensor_by_name('import/input_ids:0'),
        word_end_mask=graph.get_tensor_by_name('import/word_end_mask:0'),
        charts=graph.get_tensor_by_name('import/charts:0'),
        tags=graph.get_tensor_by_name('import/tags:0'),
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
Example #6
0
def transformer_squad(class_name, model='bert', quantized=False, **kwargs):
    """Load a SQUAD-style question-answering Transformer.

    Parameters
    ----------
    class_name : str
        Module name, used for file lookup and passed through to the model.
    model : str, optional (default='bert')
        Architecture key; BERT/ALBERT and XLNET families are supported.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.

    Returns
    -------
    result : SQUAD class
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )

    graph = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']

    if model in ('bert', 'tiny-bert'):
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
    if model in ('albert', 'tiny-albert'):
        tokenizer = AlbertTokenizer(vocab_file=path['vocab'],
                                    spm_model_file=path['tokenizer'])
    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        # XLNET graphs expose one extra placeholder.
        inputs.append('Placeholder_4')

    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    input_nodes, output_nodes = nodes_session(graph, inputs, outputs)

    mode = 'bert' if 'bert' in model else 'xlnet'
    return SQUAD(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        class_name=class_name,
        mode=mode,
        length=LENGTHS[mode],
    )
Example #7
0
def transformer(
    path, s3_path, class_name, model = 'xlnet', size = 'base', **kwargs
):
    """Load a sized tagging Transformer (BERT/ALBERT or XLNET).

    Parameters
    ----------
    path : dict
        Local path mapping, indexed as ``path[model][size]``.
    s3_path : dict
        Remote path mapping with the same structure.
    class_name : str
        Module name, used only in the corruption error message.
    model : str, optional (default='xlnet')
    size : str, optional (default='base')

    Returns
    -------
    result : TAGGING_BERT or TAGGING_XLNET instance

    Raises
    ------
    Exception
        If the setting file or frozen graph cannot be loaded.
    """
    check_file(path[model][size], s3_path[model][size], **kwargs)

    try:
        with open(path[model][size]['setting']) as fopen:
            nodes = json.load(fopen)
        g = load_graph(path[model][size]['model'])
    except Exception as e:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and discarded the original traceback.
        # Catch Exception and chain the cause instead.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}/{size}') and try again"
        ) from e

    if model in ['albert', 'bert']:
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab']
        )
        return TAGGING_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = None,
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            cls = cls,
            sep = sep,
            settings = nodes,
        )

    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            path[model][size]['tokenizer']
        )
        return TAGGING_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = nodes,
        )
Example #8
0
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer toxicity model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model. 
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.bert.SIGMOID_BERT or SIGMOID_XLNET class
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.'
        )

    # Download model files (quantized variant if requested) before loading.
    check_file(
        PATH_TOXIC[model], S3_PATH_TOXIC[model], quantized = quantized, **kwargs
    )
    # Quantized graphs are stored under a separate key in the path mapping.
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_TOXIC[model][model_path], **kwargs)

    path = PATH_TOXIC

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            # Attention-extraction helpers differ per architecture, so the
            # import is deferred until the family is known.
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(
                path[model]['tokenizer'], path[model]['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file = path[model]['vocab'],
                do_lower_case = False,
                spm_model_file = path[model]['tokenizer'],
            )

        # Multi-label (sigmoid) head over the BERT pooled output.
        return SIGMOID_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = g.get_tensor_by_name('import/Placeholder_1:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            logits_seq = g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer = g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess = generate_session(graph = g, **kwargs),
            tokenizer = tokenizer,
            label = label,
            attns = _extract_attention_weights_import(
                bert_num_layers[model], g
            ),
            class_name = 'toxic',
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        # XLNET family takes three placeholders and a different vectorizer node.
        return SIGMOID_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            logits_seq = g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer = g.get_tensor_by_name('import/transpose_3:0'),
            sess = generate_session(graph = g, **kwargs),
            tokenizer = tokenizer,
            label = label,
            attns = _extract_attention_weights_import(g),
            class_name = 'toxic',
        )
Example #9
0
def transformer(model='base', **kwargs):
    """
    Load transformer encoder-decoder model to generate a paraphrase given a string.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'base'`` - transformer Base parameters.
        * ``'tiny'`` - transformer Tiny parameters.
        * ``'tiny-bert'`` - BERT-BERT Tiny parameters.
        * ``'bert'`` - BERT-BERT Base parameters.

    Returns
    -------
    result: malaya.model.tf.PARAPHRASE class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from malaya.paraphrase.available_transformer()'
        )

    if 'bert' not in model:
        # Pure encoder-decoder transformer ('base' / 'tiny') path.
        path = PATH_PARAPHRASE['transformer']
        s3_path = S3_PATH_PARAPHRASE['transformer']

        check_file(path[model], s3_path[model], **kwargs)
        graph = load_graph(path[model]['model'])

        from malaya.text.t2t import text_encoder
        from malaya.model.tf import PARAPHRASE

        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
        return PARAPHRASE(
            graph.get_tensor_by_name('import/Placeholder:0'),
            graph.get_tensor_by_name('import/greedy:0'),
            graph.get_tensor_by_name('import/beam:0'),
            generate_session(graph=graph),
            encoder,
        )

    # BERT-to-BERT paraphrase path.
    path = PATH_PARAPHRASE[model]
    s3_path = S3_PATH_PARAPHRASE[model]

    check_file(path, s3_path, **kwargs)
    graph = load_graph(path['model'])

    if model in ('bert', 'tiny-bert'):
        from malaya.text.bpe import sentencepiece_tokenizer_bert

        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])

    from malaya.model.bert import PARAPHRASE_BERT

    return PARAPHRASE_BERT(
        X=graph.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=graph.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=graph.get_tensor_by_name('import/Placeholder_2:0'),
        logits=graph.get_tensor_by_name('import/greedy:0'),
        sess=generate_session(graph=graph),
        tokenizer=tokenizer,
    )
Example #10
0
def transformer(
    path,
    s3_path,
    class_name,
    label,
    model='bert',
    quantized=False,
    **kwargs,
):
    """Load a classification Transformer (binary or multiclass head).

    Parameters
    ----------
    path : dict
        Local path mapping, indexed as ``path[model]``.
    s3_path : dict
        Remote path mapping with the same structure.
    class_name : str
        Module name; ``'relevancy'`` forces the multiclass head.
    label : list
        Output label names; more than two labels selects the multiclass head.
    model : str, optional (default='bert')
        Architecture key; BERT/ALBERT and XLNET families are supported.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.

    Returns
    -------
    result : BINARY_BERT / MULTICLASS_BERT / BINARY_XLNET / MULTICLASS_XLNET instance
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    # Quantized graphs are stored under a separate key in the path mapping.
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)

    # Pick the wrapper class and vectorizer node: multiclass when there are
    # more than two labels or for the relevancy module, binary otherwise.
    if len(label) > 2 or class_name == 'relevancy':
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = MULTICLASS_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = MULTICLASS_XLNET
            selected_node = 'import/transpose_3:0'

    else:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = BINARY_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = BINARY_XLNET
            selected_node = 'import/transpose_3:0'

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            # Attention-extraction helpers differ per architecture, so the
            # import is deferred until the family is known.
            from malaya.transformers.bert import (
                _extract_attention_weights_import, )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import, )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(bert_num_layers[model], g),
            class_name=class_name,
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import, )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import, )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        # XLNET family takes three placeholders and a different attention API.
        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name=class_name,
        )
Example #11
0
def transformer(model: str = 'bert', **kwargs):
    """
    Load Transformer similarity model (siamese sentence-pair classifier).

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'tiny-bert'`` - BERT architecture from google with smaller parameters.
        * ``'albert'`` - ALBERT architecture from google.
        * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'alxlnet'`` - XLNET architecture from google + Malaya.

    Returns
    -------
    result : SIAMESE_BERT or SIAMESE_XLNET class with
        label ['not similar', 'similar']
    """

    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.similarity.available_transformer_model()'
        )

    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs)
    g = load_graph(PATH_SIMILARITY[model]['model'])

    path = PATH_SIMILARITY

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            # NOTE(review): `_extract_attention_weights_import` and
            # `bert_num_layers` are imported here but never used in this
            # function — possibly leftover from a copied classifier loader.
            from malaya.transformers.bert import (
                _extract_attention_weights_import, )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import, )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return SIAMESE_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import, )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import, )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return SIAMESE_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
Example #12
0
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        * if `bert` in model, returns `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, returns `malaya.model.xlnet.DependencyXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.dependency.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='dependency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    if model in ('bert', 'tiny-bert', 'albert', 'tiny-albert'):
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
        input_names = ['Placeholder']
        extra_nodes = {'vectorizer': 'import/dense/BiasAdd:0'}
        model_class = DependencyBERT

    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        # XLNET graphs expose three placeholders.
        input_names = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        extra_nodes = {'vectorizer': 'import/transpose_3:0'}
        model_class = DependencyXLNET

    input_nodes, output_nodes = nodes_session(
        graph,
        input_names,
        ['logits', 'heads_seq'],
        extra=extra_nodes,
    )

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        settings=label,
    )
Example #13
0
def transformer(model: str = 'bert', quantized: bool = False, **kwargs):
    """
    Load Transformer keyword similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        * if `bert` in model, returns `malaya.model.bert.KeyphraseBERT`.
        * if `xlnet` in model, returns `malaya.model.xlnet.KeyphraseXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.keyword_extraction.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='keyword-extraction',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    output_names = ['logits']

    if model in ('bert', 'tiny-bert'):
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
        # BERT keyphrase graphs take four placeholders.
        input_names = ['Placeholder'] + [
            'Placeholder_%d' % i for i in range(1, 4)
        ]
        output_names.append('bert/summary')
        model_class = KeyphraseBERT

    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        # XLNET keyphrase graphs take six placeholders.
        input_names = ['Placeholder'] + [
            'Placeholder_%d' % i for i in range(1, 6)
        ]
        output_names.append('xlnet/summary')
        model_class = KeyphraseXLNET

    input_nodes, output_nodes = nodes_session(graph, input_names, output_names)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #14
0
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : Transformer class
        `malaya.model.bert.DEPENDENCY_BERT` for BERT-family models,
        `malaya.model.xlnet.DEPENDENCY_XLNET` for XLNET-family models.
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.dependency.available_transformer()`.'
        )

    # Download / verify checkpoint files before restoring the graph.
    check_file(PATH_DEPENDENCY[model],
               S3_PATH_DEPENDENCY[model],
               quantized=quantized,
               **kwargs)

    # Quantized and full-precision frozen graphs live under different keys
    # in the path mapping.
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_DEPENDENCY[model][model_path], **kwargs)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        from malaya.model.bert import DEPENDENCY_BERT

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_DEPENDENCY[model]['tokenizer'],
            PATH_DEPENDENCY[model]['vocab'])

        return DEPENDENCY_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            # This exported BERT dependency graph only wires a single input
            # placeholder, so segment ids / masks are intentionally None.
            segment_ids=None,
            input_masks=None,
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=label,
            heads_seq=g.get_tensor_by_name('import/heads_seq:0'),
        )

    if model in ['xlnet', 'alxlnet']:
        from malaya.model.xlnet import DEPENDENCY_XLNET

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_DEPENDENCY[model]['tokenizer'])

        return DEPENDENCY_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=label,
            heads_seq=g.get_tensor_by_name('import/heads_seq:0'),
        )
Пример #15
0
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'tiny-bert'`` - BERT architecture from google with smaller parameters.
        * ``'albert'`` - ALBERT architecture from google.
        * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'alxlnet'`` - XLNET architecture from google + Malaya.

    Returns
    -------
    result : Transformer class
        DEPENDENCY_BERT or DEPENDENCY_XLNET depending on the architecture.
    """

    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.dependency.available_transformer()'
        )

    check_file(PATH_DEPENDENCY[model], S3_PATH_DEPENDENCY[model], **kwargs)
    # NOTE(review): **kwargs is not forwarded to load_graph / generate_session
    # here, unlike newer loaders in this file — confirm whether that is
    # intentional for this version.
    g = load_graph(PATH_DEPENDENCY[model]['model'])

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        from malaya.model.bert import DEPENDENCY_BERT

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_DEPENDENCY[model]['tokenizer'], PATH_DEPENDENCY[model]['vocab']
        )

        return DEPENDENCY_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            # Single-placeholder export: no segment ids / masks wired in.
            segment_ids = None,
            input_masks = None,
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = label,
            heads_seq = g.get_tensor_by_name('import/heads_seq:0'),
        )

    if model in ['xlnet', 'alxlnet']:
        from malaya.model.xlnet import DEPENDENCY_XLNET

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_DEPENDENCY[model]['tokenizer']
        )

        return DEPENDENCY_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = label,
            heads_seq = g.get_tensor_by_name('import/heads_seq:0'),
        )
Пример #16
0
def transformer_ontonotes5(class_name,
                           model='xlnet',
                           quantized=False,
                           **kwargs):
    """
    Load an Ontonotes5-style Transformer tagging model.

    Parameters
    ----------
    class_name : str
        module name, used to locate checkpoint files and the tagging settings.
    model : str, optional (default='xlnet')
        one of 'bert', 'tiny-bert', 'albert', 'tiny-albert', 'xlnet', 'alxlnet'.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET class

    Raises
    ------
    Exception
        if the downloaded settings file cannot be read / parsed.
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    # Tokenizer regex pipeline: hyphenated words (with optional spaces around
    # the hyphen) stay single tokens, then the shared expression classes, then
    # any remaining non-space character as a fallback.
    hypen = r'\w+(?:-\w+)+'
    hypen_left = r'\w+(?: -\w+)+'
    hypen_right = r'\w+(?:- \w+)+'
    hypen_both = r'\w+(?: - \w+)+'

    pipeline = [
        hypen,
        hypen_left,
        hypen_right,
        hypen_both,
        _expressions['percent'],
        _expressions['money'],
        _expressions['time'],
        _expressions['date'],
        _expressions['repeat_puncts'],
        _expressions['number'],
        _expressions['word'],
    ]
    # raw string to avoid an invalid-escape-sequence warning on '\S'
    pipeline.append(r'(?:\S)')
    compiled = re.compile(r'({})'.format('|'.join(pipeline)))

    def tok(string):
        # NOTE(review): `t[0]` assumes findall returns tuples, i.e. that the
        # sub-patterns in `_expressions` contain capturing groups — confirm.
        tokens = compiled.findall(string)
        return [t[0] for t in tokens]

    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    # Bug fix: the old message interpolated an undefined `size` variable,
    # so a corrupted file raised NameError instead of the intended message;
    # also narrowed the bare `except:` and chained the original cause.
    except Exception as e:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])

        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )

        # Legacy constructor path: this branch returns directly instead of
        # going through nodes_session like the XLNET branch below.  The dead
        # `inputs` / `vectorizer` / `Model` assignments that used to sit here
        # were removed — they were never read before the return.
        return TaggingBERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
            tok=tok,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET

    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g,
                                              inputs,
                                              outputs,
                                              extra=vectorizer)

    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
Пример #17
0
def transformer(
    class_name,
    label,
    model='bert',
    sigmoid=False,
    quantized=False,
    **kwargs,
):
    """
    Shared loader for classification Transformers.

    Parameters
    ----------
    class_name : str
        module name; used to locate checkpoint files and forwarded to the
        model class.  'relevancy' forces the multiclass wrapper.
    label : list
        output labels; more than two labels also selects the multiclass
        wrapper.
    model : str, optional (default='bert')
        one of 'bert', 'tiny-bert', 'albert', 'tiny-albert', 'xlnet',
        'alxlnet', 'bigbird', 'tiny-bigbird'.
    sigmoid : bool, optional (default=False)
        if True, use the Sigmoid (multilabel) wrappers instead.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.

    Returns
    -------
    result : one of SigmoidBERT / SigmoidXLNET / MulticlassBERT /
        MulticlassXLNET / MulticlassBigBird / BinaryBERT / BinaryXLNET
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    # Pick the wrapper class first, based only on sigmoid / label count.
    # NOTE(review): sigmoid=True with a bigbird model, or a binary bigbird,
    # leaves `selected_class` unbound and raises NameError later — confirm
    # callers never request those combinations.
    if sigmoid:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = SigmoidBERT
        if model in ['xlnet', 'alxlnet']:
            selected_class = SigmoidXLNET
    else:
        if len(label) > 2 or class_name == 'relevancy':
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = MulticlassBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = MulticlassXLNET
            if model in ['bigbird', 'tiny-bigbird']:
                selected_class = MulticlassBigBird

        else:
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = BinaryBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = BinaryXLNET

    # Per-family setup: tokenizer, graph input names, vectorizer tensor and
    # attention-weight extraction (imported lazily per architecture).
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import, )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import, )
            from malaya.transformers.albert import bert_num_layers
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )

        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = _extract_attention_weights_import(bert_num_layers[model],
                                                      g)

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import, )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import, )

        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        attention = _extract_attention_weights_import(g)

    if model in ['bigbird', 'tiny-bigbird']:
        # BigBird export only takes a single input and has no attention
        # extraction helper.
        inputs = ['Placeholder']
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = None

    outputs = ['logits', 'logits_seq']
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra=vectorizer,
        attention={'attention': attention},
    )

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=label,
        class_name=class_name,
    )
Пример #18
0
def _transformer(model, bert_class, xlnet_class, **kwargs):
    """
    Shared similarity-model loader: download the checkpoint, restore the
    frozen graph and wrap it with the matching BERT- or XLNET-family class.
    """
    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.similarity.available_transformer()'
        )

    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs)
    graph = load_graph(PATH_SIMILARITY[model]['model'], **kwargs)

    paths = PATH_SIMILARITY

    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        if model in ('bert', 'tiny-bert'):
            from malaya.transformers.bert import (
                _extract_attention_weights_import, )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(
                paths[model]['tokenizer'], paths[model]['vocab'])
        else:
            # albert / tiny-albert use the upstream FullTokenizer directly.
            from malaya.transformers.albert import (
                _extract_attention_weights_import, )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=paths[model]['vocab'],
                do_lower_case=False,
                spm_model_file=paths[model]['tokenizer'],
            )

        return bert_class(
            X=graph.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=graph.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=graph.get_tensor_by_name('import/Placeholder_2:0'),
            logits=graph.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=graph, **kwargs),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )

    if model in ('xlnet', 'alxlnet'):
        if model == 'xlnet':
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import, )
        else:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import, )

        tokenizer = sentencepiece_tokenizer_xlnet(paths[model]['tokenizer'])

        return xlnet_class(
            X=graph.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=graph.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=graph.get_tensor_by_name('import/Placeholder_2:0'),
            logits=graph.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=graph, **kwargs),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
Пример #19
0
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='constituency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': CONSTITUENCY_SETTING,
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    # Parser settings (vocabulary / label dictionary) ship as a JSON file.
    with open(path['setting']) as fopen:
        dictionary = json.load(fopen)

    if model == 'xlnet':
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        mode = 'xlnet'
    elif model in ('bert', 'tiny-bert', 'albert', 'tiny-albert'):
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
        mode = 'bert'

    input_nodes, output_nodes = nodes_session(
        graph,
        ['input_ids', 'word_end_mask'],
        ['charts', 'tags'],
        extra={'vectorizer': _vectorizer_mapping[model]},
    )

    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
Пример #20
0
def transformer(model: str = 'bert', size: str = 'base', **kwargs):
    """
    Load Transformer siamese similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'albert'`` - ALBERT architecture from google.
    size : str, optional (default='base')
        Model size supported. Allowed values:

        * ``'base'`` - BASE size.
        * ``'small'`` - SMALL size.

    Returns
    -------
    result : SIAMESE_BERT or SIAMESE_XLNET class
        with labels ['not similar', 'similar'].
    """

    # NOTE(review): the error messages below mention `malaya.sentiment` but
    # every lookup here goes through PATH_SIMILARITY and the function returns
    # SIAMESE_* classes — the messages look copy-pasted; confirm with callers.
    model = model.lower()
    size = size.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from malaya.sentiment.available_transformer_model()'
        )
    if size not in _availability[model]:
        raise Exception(
            'size not supported, please check supported models from malaya.sentiment.available_transformer_model()'
        )

    check_file(PATH_SIMILARITY[model][size], S3_PATH_SIMILARITY[model][size],
               **kwargs)
    g = load_graph(PATH_SIMILARITY[model][size]['model'])

    if model in ['albert', 'bert']:
        if model == 'bert':
            from ._transformer._bert import _extract_attention_weights_import
        if model == 'albert':
            from ._transformer._albert import _extract_attention_weights_import

        # In this version the BERT tokenizer helper returns the tokenizer
        # plus the CLS / SEP token ids.
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            PATH_SIMILARITY[model][size]['tokenizer'],
            PATH_SIMILARITY[model][size]['vocab'],
        )

        return SIAMESE_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
            cls=cls,
            sep=sep,
        )

    if model in ['xlnet']:
        from ._transformer._xlnet import _extract_attention_weights_import

        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_SIMILARITY[model][size]['tokenizer'])

        return SIAMESE_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=['not similar', 'similar'],
        )
Пример #21
0
def transformer(path,
                s3_path,
                class_name,
                label,
                model='bert',
                size='base',
                **kwargs):
    """
    Shared loader for classification Transformers (sized checkpoints).

    Parameters
    ----------
    path : dict
        local path mapping, indexed as path[model][size][...].
    s3_path : dict
        remote path mapping, same shape as `path`.
    class_name : str
        module name forwarded to the model class; 'relevancy' forces the
        multiclass wrapper.
    label : list
        output labels; more than two labels also selects the multiclass
        wrapper.
    model : str, optional (default='bert')
        one of 'bert', 'albert', 'xlnet'.
    size : str, optional (default='base')
        checkpoint size key.

    Returns
    -------
    result : BINARY_BERT / MULTICLASS_BERT / BINARY_XLNET / MULTICLASS_XLNET
    """
    check_file(path[model][size], s3_path[model][size], **kwargs)
    g = load_graph(path[model][size]['model'])

    if len(label) > 2 or class_name == 'relevancy':
        if model in ['albert', 'bert']:
            selected_class = MULTICLASS_BERT
        if model in ['xlnet']:
            selected_class = MULTICLASS_XLNET

    else:
        if model in ['albert', 'bert']:
            selected_class = BINARY_BERT
        if model in ['xlnet']:
            selected_class = BINARY_XLNET

    if model in ['albert', 'bert']:
        if model == 'bert':
            from .._transformer._bert import _extract_attention_weights_import
        if model == 'albert':
            from .._transformer._albert import _extract_attention_weights_import

        # Bug fixes: the tokenizer lookup previously omitted the `size` key
        # (every other access in this function is path[model][size][...]),
        # and `cls` / `sep` were passed below without ever being defined —
        # sentencepiece_tokenizer_bert returns (tokenizer, cls, sep) in this
        # version (see the sibling siamese loader).
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab'])

        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=None,
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,
            cls=cls,
            sep=sep,
            attns=_extract_attention_weights_import(bert_num_layers[size], g),
            class_name=class_name,
        )
    if model in ['xlnet']:
        from .._transformer._xlnet import _extract_attention_weights_import

        tokenizer = sentencepiece_tokenizer_xlnet(
            path[model][size]['tokenizer'])

        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name=class_name,
        )