Пример #1
0
def construct_component_from_identifier(language,
                                        component_type='',
                                        dataset='',
                                        component_embeddings='',
                                        nlu_ref='',
                                        nlp_ref=''):
    '''
    Creates a NLU component from a pretrained SparkNLP model reference or Class reference.
    Class references will return default pretrained models
    :param language: Language of the sparknlp model reference
    :param component_type: Class which will be used to instantiate the model
    :param dataset: Dataset that the model was trained on
    :param component_embeddings: Embedded that the models was traiend on (if any)
    :param nlu_ref: Full user request
    :param nlp_ref: Full Spark NLP reference
    :return: Returns a NLU component which embelished the Spark NLP pretrained model and class for that model
    '''
    logger.info(
        'Creating singular NLU component for type=%s sparknlp_ref=%s , dataset=%s, language=%s , nlu_ref=%s ',
        component_type, nlp_ref, dataset, language, nlu_ref)
    try:

        # if any([component_type in NameSpace.word_embeddings,dataset in NameSpace.word_embeddings, nlu_ref in NameSpace.word_embeddings, nlp_ref in NameSpace.word_embeddings]):
        if any(x in NameSpace.word_embeddings
               and not x in NameSpace.classifiers for x in [
                   nlp_ref,
                   nlu_ref,
                   dataset,
                   component_type,
               ] + dataset.split('_')):
            return Embeddings(get_default=False,
                              nlp_ref=nlp_ref,
                              nlu_ref=nlu_ref,
                              language=language)

        # elif any([component_type in NameSpace.sentence_embeddings,dataset in NameSpace.sentence_embeddings, nlu_ref in NameSpace.sentence_embeddings, nlp_ref in NameSpace.sentence_embeddings]):
        if any(x in NameSpace.sentence_embeddings
               and not x in NameSpace.classifiers for x in [
                   nlp_ref,
                   nlu_ref,
                   dataset,
                   component_type,
               ] + dataset.split('_')):
            return Embeddings(get_default=False,
                              nlp_ref=nlp_ref,
                              nlu_ref=nlu_ref,
                              language=language)

        elif any(x in NameSpace.classifiers for x in [
                nlp_ref,
                nlu_ref,
                dataset,
                component_type,
        ] + dataset.split('_')):
            return Classifier(get_default=False,
                              nlp_ref=nlp_ref,
                              nlu_ref=nlu_ref,
                              language=language)

        elif any('spell' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return SpellChecker(annotator_class=component_type,
                                language=language,
                                get_default=True,
                                nlp_ref=nlp_ref,
                                dataset=dataset)

        elif any('dep' in x and not 'untyped' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return LabledDepParser()

        elif any('dep.untyped' in x or 'untyped' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return UnlabledDepParser()

        elif any('lemma' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.lemmatizer.Lemmatizer(language=language,
                                             nlp_ref=nlp_ref)

        elif any('norm' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.normalizer.Normalizer()
        elif any('clean' in x or 'stopword' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.StopWordsCleaner(language=language,
                                        get_default=False,
                                        nlp_ref=nlp_ref)
        elif any('sentence_detector' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return NLUSentenceDetector(nlu_ref=nlu_ref,
                                       nlp_ref=nlp_ref,
                                       language=language)

        elif any('match' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return Matcher(nlu_ref=nlu_ref, nlp_ref=nlp_ref)

        elif any('tokenize' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.tokenizer.Tokenizer()

        elif any('stem' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return Stemmer()

        # supported in future version with auto embed generation
        # elif any('embed_chunk' in x for x in [nlp_ref, nlu_ref, dataset, component_type] ):
        #     return embeddings_chunker.EmbeddingsChunker()

        elif any('chunk' in x
                 for x in [nlp_ref, nlu_ref, dataset, component_type]):
            return nlu.chunker.Chunker()
        elif component_type == 'ngram':
            return nlu.chunker.Chunker('ngram')

        logger.exception(
            'EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
            component_type, nlp_ref, nlu_ref)
        return None
    except:  # if reference is not in namespace and not a component it will cause a unrecoverable crash
        logger.exception(
            'EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
            component_type, nlp_ref, nlu_ref)
        return None
Пример #2
0
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref,path=None):
    '''
    # creates a list of components from a Spark NLP Pipeline reference
    # 1. download pipeline
    # 2. unpack pipeline to annotators and create list of nlu components
    # 3. return list of nlu components
    :param nlu_ref:
    :param language: language of the pipeline
    :param nlp_ref: Reference to a spark nlp petrained pipeline
    :param path: Load pipe from HDD
    :return: Each element of the SaprkNLP pipeline wrapped as a NLU componed inside of a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    if 'language' in nlp_ref: language = 'xx'  # special edge case for lang detectors
    if path == None :
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else :
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []

    # for component in pipe.light_model.pipeline_model.stages:
    for component in iterable_stages:

        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        if isinstance(component, NerConverter):
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, PerceptronModel):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, (ClassifierDl,ClassifierDLModel)):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='use'))
        elif isinstance(component, BertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert'))
        elif isinstance(component, AlbertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='albert'))
        elif isinstance(component, XlnetEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='xlnet'))
        elif isinstance(component, WordEmbeddingsModel):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='glove'))
        elif isinstance(component, ElmoEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='elmo'))
        elif isinstance(component, BertSentenceEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert_sentence'))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlu_ref='use'))
        elif isinstance(component, TokenizerModel) and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif isinstance(component, TokenizerModel) and parsed == 'regex' :
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif isinstance(component, DocumentAssembler):
            constructed_components.append(nlu.Util(model=component))
        elif isinstance(component, SentenceDetectorDLModel):
            constructed_components.append(NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component))
        elif isinstance(component, (SentenceDetectorDLModel, SentenceDetector)):
            constructed_components.append(NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component))
        elif isinstance(component, RegexMatcherModel) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component, annotator_class='regex'))
        elif isinstance(component, TextMatcherModel):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='text'))
        elif isinstance(component, DateMatcher):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='date'))
        elif isinstance(component, ContextSpellCheckerModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context'))
        elif isinstance(component, SymmetricDeleteModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='symmetric'))
        elif isinstance(component, NorvigSweetingModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig'))
        elif isinstance(component, LemmatizerModel):
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif isinstance(component, NormalizerModel):
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif isinstance(component, Stemmer):
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif isinstance(component, (NerDLModel, NerCrfModel)):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='ner'))
        elif isinstance(component, LanguageDetectorDL):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='language_detector'))

        elif isinstance(component, DependencyParserModel):
            constructed_components.append(UnlabledDepParser(model=component))
        elif isinstance(component, TypedDependencyParserModel):
            constructed_components.append(LabledDepParser(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, (SentimentDetectorModel,SentimentDLModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl'))
        elif isinstance(component, (SentimentDetectorModel,ViveknSentimentModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn'))
        elif isinstance(component, Chunker):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, NGram):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, ChunkEmbeddings):
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif isinstance(component, StopWordsCleaner):
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        elif isinstance(component, (TextMatcherModel, RegexMatcherModel, DateMatcher,MultiDateMatcher)) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif isinstance(component,(T5Transformer)):
            constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component))
        elif isinstance(component,(MarianTransformer)):
            constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component))
        else:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion,")
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))

        logger.info(f"Extracted into NLU Component type : {parsed}", )
        if None in constructed_components:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,")
            return None
    return constructed_components
Пример #3
0
def get_default_component_of_type(missing_component_type):
    '''
    This function returns a default component for a missing component type.
    It is used to auto complete pipelines, which are missng required components.
    These represents defaults for many applications and should be set wisely.
    :param missing_component_type: String which is either just the component type or componenttype@spark_nlp_reference which stems from a models storageref and refers to some pretrained embeddings or model
    :return: a NLU component which is a either the default if there is no '@' in the @param missing_component_type or a default component for that particualar type
    '''

    logger.info('Getting default for missing_component_type=%s',
                missing_component_type)
    if not '@' in missing_component_type:
        # get default models if there is no @ in the model name included
        if missing_component_type == 'document':
            return Util('document_assembler')
        if missing_component_type == 'sentence':
            return Util('sentence_detector')
        if missing_component_type == 'sentence_embeddings':
            return Embeddings('use')
        if 'token' in missing_component_type:
            return nlu.components.tokenizer.Tokenizer("default_tokenizer")
        if missing_component_type == 'word_embeddings':
            return Embeddings(nlu_ref='glove')
        if missing_component_type == 'pos': return Classifier(nlu_ref='pos')
        if missing_component_type == 'ner': return Classifier(nlu_ref='ner')
        if missing_component_type == 'ner_converter':
            return Util('ner_converter')
        if missing_component_type == 'chunk': return nlu.chunker.Chunker()
        if missing_component_type == 'ngram':
            return nlu.chunker.Chunker(nlu_ref='ngram')
        if missing_component_type == 'chunk_embeddings':
            return embeddings_chunker.EmbeddingsChunker()
        if missing_component_type == 'unlabeled_dependency':
            return UnlabledDepParser()
        if missing_component_type == 'labled_dependency':
            return LabledDepParser('dep')
        if missing_component_type == 'date': return nlu.Matcher('date')
        if missing_component_type == 'ner_converter':
            return Util('ner_converter')

    else:
        # if there is an @ in the name, we must get some specific pretrained model from the sparknlp reference that should follow after the @
        missing_component_type, sparknlp_reference = missing_component_type.split(
            '@')
        if 'embed' in missing_component_type:
            return construct_component_from_identifier(
                language='en',
                component_type='embed',
                nlp_ref=sparknlp_reference)
        if 'pos' in missing_component_type or 'ner' in missing_component_type:
            return construct_component_from_identifier(
                language='en',
                component_type='classifier',
                nlp_ref=sparknlp_reference)
        if 'chunk_embeddings' in missing_component_type:
            return embeddings_chunker.EmbeddingsChunker()
        if 'unlabeled_dependency' in missing_component_type or 'dep.untyped' in missing_component_type:
            return UnlabledDepParser('dep.untyped')
        if 'labled_dependency' in missing_component_type or 'dep.typed' in missing_component_type:
            return LabledDepParser('dep.typed')
        if 'date' in missing_component_type:
            return None

        logger.exception(
            "Could not resolve default component type for missing type=%s",
            missing_component_type)