Пример #1
def get_default_component_of_type(missing_component_type,language='en'):
    # Maps the name of a missing component type to a default component instance.
    # NOTE(review): the next five lines read like this function's docstring, but
    # no string delimiters are present in this excerpt — they appear to have been
    # lost during extraction.
    This function returns a default component for a missing component type.
    It is used to auto complete pipelines, which are missng required components.
    These represents defaults for many applications and should be set wisely.
    :param missing_component_type: String which is either just the component type or componenttype@spark_nlp_reference which stems from a models storageref and refers to some pretrained embeddings or model
    :return: a NLU component which is a either the default if there is no '@' in the @param missing_component_type or a default component for that particualar type

    logger.info('Getting default for missing_component_type=%s', missing_component_type)
    # Case 1: a bare type name (no '@'). Each line below returns a hard-coded default.
    if not '@' in missing_component_type:
        # get default models if there is no @ in the model name included
        if missing_component_type == 'document': return Util('document_assembler')
        if missing_component_type == 'sentence': return Util('sentence_detector')
        if missing_component_type == 'sentence_embeddings': return Embeddings('use')
        # Substring test, unlike the equality tests around it: any type name
        # containing 'token' gets the default tokenizer for `language`.
        if 'token' in missing_component_type: return nlu.components.tokenizer.Tokenizer("default_tokenizer", language=language)
        if missing_component_type == 'word_embeddings': return Embeddings(nlu_ref='glove')
        if missing_component_type == 'pos':   return Classifier(nlu_ref='pos')
        if missing_component_type == 'ner':   return Classifier(nlu_ref='ner')
        if missing_component_type == 'ner_converter':   return Util('ner_converter')
        if missing_component_type == 'chunk': return nlu.chunker.Chunker()
        if missing_component_type == 'ngram': return nlu.chunker.Chunker(nlu_ref='ngram')
        if missing_component_type == 'chunk_embeddings': return embeddings_chunker.EmbeddingsChunker()
        if missing_component_type == 'unlabeled_dependency': return UnlabledDepParser()
        if missing_component_type == 'labled_dependency': return LabledDepParser('dep')
        if missing_component_type == 'date': return nlu.Matcher('date')
        # NOTE(review): duplicate of the 'ner_converter' check a few lines up; that
        # earlier line already returns, so this one can never match.
        if missing_component_type == 'ner_converter': return Util('ner_converter')

        # NOTE(review): the code from here on is still indented inside the "no '@'"
        # branch, yet the comment below describes the '@' case. An `else:` line seems
        # to be missing from this excerpt — confirm against the full file. As written,
        # the split below would run on a string without '@' and the two-name unpack
        # would raise ValueError.
        # Languages whose embedding reference is first passed through
        # resolve_multi_lang_embed (see the 'embed' branch below).
        multi_lang =['ar']
        # if there is an @ in the name, we must get some specific pretrained model from the sparknlp reference that should follow after the @
        missing_component_type, sparknlp_reference = missing_component_type.split('@')
        if 'embed' in missing_component_type:
            if language in multi_lang : sparknlp_reference = resolve_multi_lang_embed(language,sparknlp_reference)
            # NOTE(review): this call is cut off in the excerpt (ends with a comma);
            # the remaining arguments and closing parenthesis are not visible.
            return construct_component_from_identifier(language=language, component_type='embed',
        if 'pos' in missing_component_type or 'ner' in missing_component_type:
            # NOTE(review): call is cut off here as well — remaining arguments not visible.
            return construct_component_from_identifier(language=language, component_type='classifier',
        if 'chunk_embeddings' in missing_component_type:
            return embeddings_chunker.EmbeddingsChunker()
        if 'unlabeled_dependency' in missing_component_type or 'dep.untyped' in missing_component_type:
            return UnlabledDepParser('dep.untyped')
        if 'labled_dependency' in missing_component_type or 'dep.typed' in missing_component_type:
            return LabledDepParser('dep.typed')
        # A 'date' type that carries a reference yields no component (None), unlike
        # the bare 'date' case above which returns a Matcher.
        if 'date' in missing_component_type:
            return None

        # Nothing matched: log and fall off the end, which implicitly returns None.
        # NOTE(review): logger.exception is called here with no visible enclosing
        # `except` block, so there may be no active exception to attach — confirm intent.
        logger.exception("Could not resolve default component type for missing type=%s", missing_component_type)
Пример #2
def construct_component_from_identifier(language, component_type='', dataset='', component_embeddings='', nlu_ref='',
    # NOTE(review): the signature above ends with a comma and its closing line is
    # not present in this excerpt. The body uses `nlp_ref`, which presumably was
    # declared on the missing line — confirm against the full file.
    # NOTE(review): the following lines read like the docstring, but the string
    # delimiters are not present in this excerpt.
    Creates a NLU component from a pretrained SparkNLP model reference or Class reference.
    Class references will return default pretrained models
    :param language: Language of the sparknlp model reference
    :param component_type: Class which will be used to instantiate the model
    :param dataset: Dataset that the model was trained on
    :param component_embeddings: Embedded that the models was traiend on (if any)
    :param nlu_ref: Full user request
    :param nlp_ref: Full Spark NLP reference
    :return: Returns a NLU component which embelished the Spark NLP pretrained model and class for that model
    logger.info('Creating singular NLU component for type=%s sparknlp_ref=%s , dataset=%s, language=%s , nlu_ref=%s ',
                component_type, nlp_ref, dataset, language, nlu_ref)

        # NOTE(review): the block below is indented one level deeper than the lines
        # above and is closed by an `except:` further down, so a `try:` line appears
        # to be missing from this excerpt.
        # Dispatch: the first branch whose test matches any of the identifier strings
        # decides which component class is returned. Branch order matters because
        # most tests are substring or membership checks that can overlap.
        if any(
            x in NameSpace.seq2seq for x in [nlp_ref, nlu_ref, dataset, component_type, ]):
            return Seq2Seq(annotator_class=component_type, language=language, get_default=False, nlp_ref=nlp_ref,configs=dataset)

        # if any([component_type in NameSpace.word_embeddings,dataset in NameSpace.word_embeddings, nlu_ref in NameSpace.word_embeddings, nlp_ref in NameSpace.word_embeddings]):
        # Word embeddings: the candidates also include the '_'-separated pieces of
        # `dataset`; a candidate that is also listed as a classifier does not count.
        elif any(x in NameSpace.word_embeddings and not x in NameSpace.classifiers for x in
               [nlp_ref, nlu_ref, dataset, component_type, ] + dataset.split('_')):
            return Embeddings(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)

        # elif any([component_type in NameSpace.sentence_embeddings,dataset in NameSpace.sentence_embeddings, nlu_ref in NameSpace.sentence_embeddings, nlp_ref in NameSpace.sentence_embeddings]):
        # Sentence embeddings: same test against NameSpace.sentence_embeddings and
        # the same Embeddings constructor call as the word-embeddings branch.
        # This `if` starts a new chain; since the branches above all return, it is
        # only reached when none of them matched.
        if any(x in NameSpace.sentence_embeddings and not x in NameSpace.classifiers for x in
               [nlp_ref, nlu_ref, dataset, component_type, ] + dataset.split('_')):
            return Embeddings(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)

        elif any(
                x in NameSpace.classifiers for x in [nlp_ref, nlu_ref, dataset, component_type, ] + dataset.split('_')):
            return Classifier(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)

        # From here on the tests are plain substring checks on the four identifiers.
        elif any('spell' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            # NOTE(review): this call is cut off in the excerpt (ends with a comma);
            # the remaining arguments are not visible.
            return SpellChecker(annotator_class=component_type, language=language, get_default=True, nlp_ref=nlp_ref,

        # Labeled parser if any single identifier contains 'dep' without 'untyped'.
        elif any('dep' in x and not 'untyped' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return LabledDepParser()

        # 'dep.untyped' already contains 'untyped', so the first operand is redundant.
        elif any('dep.untyped' in x or 'untyped' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return UnlabledDepParser()

        elif any('lemma' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.lemmatizer.Lemmatizer(language=language, nlp_ref=nlp_ref)

        elif any('norm' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.normalizer.Normalizer(nlp_ref=nlp_ref, nlu_ref=nlu_ref)
        elif any('clean' in x or 'stopword' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.StopWordsCleaner(language=language, get_default=False, nlp_ref=nlp_ref)
        elif any('sentence_detector' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return NLUSentenceDetector(nlu_ref=nlu_ref, nlp_ref=nlp_ref, language=language)

        elif any('match' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return Matcher(nlu_ref=nlu_ref, nlp_ref=nlp_ref)

        elif any('tokenize' in x or 'segment_words' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.tokenizer.Tokenizer(nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language,get_default=False)

        elif any('stem' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return Stemmer()

        # supported in future version with auto embed generation
        # elif any('embed_chunk' in x for x in [nlp_ref, nlu_ref, dataset, component_type] ):
        #     return embeddings_chunker.EmbeddingsChunker()

        elif any('chunk' in x for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.chunker.Chunker()
        # Exact match on component_type only, unlike the substring tests above.
        elif component_type == 'ngram':
            return nlu.chunker.Chunker('ngram')

        # No branch matched: log and return None.
        # NOTE(review): logger.exception is used here on the normal (non-raising)
        # path, where no exception is being handled — confirm this is intended.
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
                         component_type, nlp_ref, nlu_ref)
        return None
    # Bare except: anything raised in the block above is logged and turned into None.
    except:  # if reference is not in namespace and not a component it will cause a unrecoverable crash
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
                         component_type, nlp_ref, nlu_ref)
        return None
Пример #3
def construct_component_from_identifier(language, component_type, dataset, component_embeddings, nlu_reference,
    # NOTE(review): the signature above ends with a comma and its closing line is
    # not present in this excerpt. The body uses `sparknlp_reference`, which
    # presumably was declared on the missing line — confirm against the full file.
    # NOTE(review): the following lines read like the docstring, but the string
    # delimiters are not present in this excerpt.
    Creates a NLU component from a pretrained SparkNLP model reference or Class reference.
    Class references will return default pretrained models
    :param language: Language of the sparknlp model reference
    :param component_type: Class which will be used to instantiate the model
    :param dataset: Dataset that the model was trained on
    :param component_embeddings: Embedded that the models was traiend on (if any)
    :param nlu_reference: Full user request
    :param sparknlp_reference: Full Spark NLP reference
    :return: Returns a NLU component which embelished the Spark NLP pretrained model and class for that model
    logger.info('Creating singular NLU component for type=%s sparknlp reference=%s , dataset=%s, language=%s ', component_type, sparknlp_reference, dataset, language)
    try : 
        # Dispatch on the identifiers; the first matching branch returns a component.
        # NOTE(review): many `return SomeClass(...,` lines below end with a comma and
        # have no continuation line in this excerpt, so their remaining arguments
        # are not visible. Only the SpellChecker call shows its continuation.
        if sparknlp_reference == 'yake':
            return Classifier('yake')
        # Embeddings: matched by substring tests across dataset, component_type,
        # sparknlp_reference and nlu_reference. The 'bert' test on component_type
        # also covers 'albert', since it is a substring check.
        elif 'bert' in dataset or component_type == 'embed' or 'albert' in component_type or 'bert' in component_type or 'xlnet' in component_type or 'use' in component_type or 'glove' in component_type or 'elmo' in component_type or 'tfhub_use' in sparknlp_reference\
                or 'bert' in sparknlp_reference or 'labse' in sparknlp_reference or component_type =='embed_sentence' or 'electra' in nlu_reference:
            # The component name is taken from dataset, sparknlp_reference or
            # component_type, depending on which of them carries the information.
            if component_type == 'embed' and dataset != '' :
                return Embeddings(component_name=dataset, language=language, get_default=False,
            elif component_type == 'embed' :  return Embeddings(component_name=sparknlp_reference) #default
            else : return Embeddings(component_name=component_type, language=language, get_default=False,
        # Classifiers requested generically ('classify') or via an 'e2e' reference.
        elif component_type == 'classify' or  'e2e' in sparknlp_reference:
            if component_type == 'classify' and dataset != '' :
                return Classifier(component_name=dataset, language=language, get_default=False,
            else : return Classifier(component_name=component_type, language=language, get_default=False,
        elif component_type == 'tokenize':
            return nlu.tokenizer.Tokenizer(component_name=component_type, language=language, get_default=False,
        elif component_type == 'pos':
            return Classifier(component_name=component_type, language=language, get_default=False,
        # Any reference containing 'ner_dl' is treated as NER regardless of component_type.
        elif component_type == 'ner' or 'ner_dl' in sparknlp_reference:
            return Classifier(component_name='ner', language=language, get_default=False,
        elif component_type == 'sentiment':
            return Classifier(component_name=component_type, language=language, get_default=False,
        elif component_type == 'emotion':
            return Classifier(component_name=component_type, language=language, get_default=False,
        elif component_type == 'spell':
            return SpellChecker(component_name=component_type, language=language, get_default=False,
                                sparknlp_reference=sparknlp_reference, dataset = dataset)
        # Dependency parsers: 'dep' yields the labeled parser unless dataset is
        # 'untyped'; 'dep.untyped' or dataset 'untyped' yields the unlabeled one.
        elif component_type == 'dep' and dataset!='untyped' :# There are no trainable dep parsers this gets only default dep
            return LabledDepParser(component_name='labeled_dependency_parser', language=language, get_default=True,
        elif component_type == 'dep.untyped' or  dataset =='untyped': # There are no trainable dep parsers this gets only default dep
            return UnlabledDepParser(component_name='unlabeled_dependency_parser', language=language, get_default=True,
        elif component_type == 'lemma':
            return nlu.lemmatizer.Lemmatizer(component_name=component_type, language=language, get_default=False,
        elif component_type == 'norm':
            return nlu.normalizer.Normalizer(component_name='normalizer', language=language, get_default=True,
        elif component_type == 'clean' or component_type == 'stopwords' :
            return nlu.StopWordsCleaner( language=language, get_default=False,
        elif component_type == 'sentence_detector':
            return NLUSentenceDetector(component_name=component_type, language=language, get_default=True,
        # The matcher's component name comes from `dataset`, not from component_type.
        elif component_type == 'match':
            return Matcher(component_name=dataset, language=language, get_default=True,
        # The components below are built with fixed or no arguments; language and
        # the references are not passed on.
        elif component_type == 'stem' or  component_type == 'stemm' or sparknlp_reference == 'stemmer' : 
            return Stemmer()
        elif component_type == 'chunk'  :return nlu.chunker.Chunker()
        elif component_type == 'ngram'  :return nlu.chunker.Chunker('ngram')
        elif component_type == 'embed_chunk': return embeddings_chunker.EmbeddingsChunker()
        elif component_type == 'regex' or sparknlp_reference =='regex_matcher' : return nlu.Matcher(component_name='regex')
        elif component_type == 'text' or sparknlp_reference =='text_matcher'  : return nlu.Matcher(component_name='text')

        # No branch matched: log and return None.
        # NOTE(review): logger.exception is used here on the normal (non-raising)
        # path, where no exception is being handled — confirm this is intended.
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and sparknl reference=%s and nlu reference=%s', component_type, sparknlp_reference, nlu_reference)
        return None  
    # Bare except: anything raised inside the try is logged and turned into None.
    except : # if reference is not in namespace and not a component it will cause a unrecoverable crash
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and sparknl reference=%s and nlu reference=%s', component_type, sparknlp_reference, nlu_reference)
        return None