def construct_component_from_identifier(language, component_type='', dataset='', component_embeddings='', nlu_ref='',
                                        nlp_ref=''):
    '''
    Creates a NLU component from a pretrained Spark NLP model reference or a class reference.
    Class references will return default pretrained models.

    Resolution is a first-match-wins cascade: NameSpace membership (embeddings, then classifiers) is checked
    first, followed by substring keyword checks against the references.

    :param language: Language of the Spark NLP model reference
    :param component_type: Class which will be used to instantiate the model
    :param dataset: Dataset that the model was trained on
    :param component_embeddings: Embeddings that the model was trained on (if any). Not read by this function.
    :param nlu_ref: Full user request
    :param nlp_ref: Full Spark NLP reference
    :return: A NLU component wrapping the Spark NLP pretrained model and class for that model,
             or None if the references could not be resolved or construction failed
    '''
    logger.info(
        'Creating singular NLU component for type=%s sparknlp_ref=%s , dataset=%s, language=%s , nlu_ref=%s ',
        component_type, nlp_ref, dataset, language, nlu_ref)
    try:
        # Everything that may identify the component. Built inside the try so that a non-string
        # argument (e.g. dataset=None) is logged and turned into a None result like any other failure.
        identifiers = [nlp_ref, nlu_ref, dataset, component_type]
        # NameSpace lookups additionally consider the individual '_'-separated parts of the dataset name.
        namespace_candidates = identifiers + dataset.split('_')

        def is_embedding_of(embedding_names):
            # A name listed as embedding AND classifier is treated as a classifier, hence the exclusion.
            return any(x in embedding_names and x not in NameSpace.classifiers for x in namespace_candidates)

        def mentions(*keywords):
            # True if any keyword occurs as a substring of any identifier.
            return any(keyword in x for x in identifiers for keyword in keywords)

        if is_embedding_of(NameSpace.word_embeddings) or is_embedding_of(NameSpace.sentence_embeddings):
            return Embeddings(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)
        if any(x in NameSpace.classifiers for x in namespace_candidates):
            return Classifier(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)
        if mentions('spell'):
            return SpellChecker(annotator_class=component_type, language=language, get_default=True,
                                nlp_ref=nlp_ref, dataset=dataset)
        # Typed parser wins as soon as ONE identifier mentions 'dep' without 'untyped'.
        if any('dep' in x and 'untyped' not in x for x in identifiers):
            return LabledDepParser()
        if mentions('untyped'):
            return UnlabledDepParser()
        if mentions('lemma'):
            return nlu.lemmatizer.Lemmatizer(language=language, nlp_ref=nlp_ref)
        if mentions('norm'):
            return nlu.normalizer.Normalizer()
        if mentions('clean', 'stopword'):
            return nlu.StopWordsCleaner(language=language, get_default=False, nlp_ref=nlp_ref)
        if mentions('sentence_detector'):
            return NLUSentenceDetector(nlu_ref=nlu_ref, nlp_ref=nlp_ref, language=language)
        if mentions('match'):
            return Matcher(nlu_ref=nlu_ref, nlp_ref=nlp_ref)
        if mentions('tokenize'):
            return nlu.tokenizer.Tokenizer()
        if mentions('stem'):
            return Stemmer()
        # 'embed_chunk' (EmbeddingsChunker) is planned for a future version with auto embed generation.
        if mentions('chunk'):
            return nlu.chunker.Chunker()
        if component_type == 'ngram':
            return nlu.chunker.Chunker('ngram')

        # No exception is active here, so use error() instead of exception() to avoid a bogus traceback.
        logger.error(
            'EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
            component_type, nlp_ref, nlu_ref)
        return None
    except Exception:
        # If the reference is not in the namespace and not a component, construction fails; log the
        # traceback and signal the failure with None. KeyboardInterrupt/SystemExit are not swallowed.
        logger.exception(
            'EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
            component_type, nlp_ref, nlu_ref)
        return None
def _convert_pipeline_stage(component, parsed):
    '''
    Wraps a single Spark NLP pipeline stage into the matching NLU component. First matching rule wins.

    :param component: Stage taken from a Spark NLP pipeline model
    :param parsed: Lower-cased text before the first '_' of str(component)
    :return: The NLU component wrapping the stage, or None if no rule matches
    '''
    if isinstance(component, NerConverter):
        return Util(annotator_class='ner_converter', model=component)
    # Name based resolution takes precedence over the class based checks below.
    if parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
        return nlu.Embeddings(model=component)
    if parsed in NameSpace.classifiers:
        return nlu.Classifier(model=component)

    # Classifiers
    if isinstance(component, MultiClassifierDLModel):
        return nlu.Classifier(model=component, nlp_ref='multiclassifierdl')
    # NOTE(review): PerceptronModel is registered with nlp_ref='classifierdl' - confirm this is intended.
    if isinstance(component, PerceptronModel):
        return nlu.Classifier(nlp_ref='classifierdl', model=component)
    if isinstance(component, (ClassifierDl, ClassifierDLModel)):
        return nlu.Classifier(nlp_ref='classifierdl', model=component)

    # Embeddings
    if isinstance(component, UniversalSentenceEncoder):
        return nlu.Embeddings(model=component, nlp_ref='use')
    if isinstance(component, BertEmbeddings):
        return nlu.Embeddings(model=component, nlp_ref='bert')
    if isinstance(component, AlbertEmbeddings):
        return nlu.Embeddings(model=component, nlp_ref='albert')
    if isinstance(component, XlnetEmbeddings):
        return nlu.Embeddings(model=component, nlp_ref='xlnet')
    if isinstance(component, WordEmbeddingsModel):
        return nlu.Embeddings(model=component, nlp_ref='glove')
    if isinstance(component, ElmoEmbeddings):
        return nlu.Embeddings(model=component, nlp_ref='elmo')
    if isinstance(component, BertSentenceEmbeddings):
        return nlu.Embeddings(model=component, nlp_ref='bert_sentence')

    # Tokenization, document assembling and sentence detection
    if isinstance(component, TokenizerModel):
        if parsed == 'regex':
            return nlu.Tokenizer(model=component, annotator_class='regex_tokenizer')
        return nlu.Tokenizer(model=component)
    if isinstance(component, DocumentAssembler):
        return nlu.Util(model=component)
    if isinstance(component, SentenceDetectorDLModel):
        return NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component)
    if isinstance(component, SentenceDetector):
        return NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component)

    # Matchers
    if isinstance(component, RegexMatcherModel) or parsed == 'match':
        return nlu.Matcher(model=component, annotator_class='regex')
    if isinstance(component, TextMatcherModel):
        return nlu.Matcher(model=component, annotator_class='text')
    if isinstance(component, DateMatcher):
        return nlu.Matcher(model=component, annotator_class='date')

    # Spell checkers
    if isinstance(component, ContextSpellCheckerModel):
        return nlu.SpellChecker(model=component, annotator_class='context')
    if isinstance(component, SymmetricDeleteModel):
        return nlu.SpellChecker(model=component, annotator_class='symmetric')
    if isinstance(component, NorvigSweetingModel):
        return nlu.SpellChecker(model=component, annotator_class='norvig')

    # Text normalization
    if isinstance(component, LemmatizerModel):
        return nlu.lemmatizer.Lemmatizer(model=component)
    if isinstance(component, NormalizerModel):
        return nlu.normalizer.Normalizer(model=component)
    if isinstance(component, Stemmer):
        return nlu.stemmer.Stemmer(model=component)

    # NER, language detection, dependency parsing and sentiment
    if isinstance(component, (NerDLModel, NerCrfModel)):
        return nlu.Classifier(model=component, annotator_class='ner')
    if isinstance(component, LanguageDetectorDL):
        return nlu.Classifier(model=component, annotator_class='language_detector')
    if isinstance(component, DependencyParserModel):
        return UnlabledDepParser(model=component)
    if isinstance(component, TypedDependencyParserModel):
        return LabledDepParser(model=component)
    if isinstance(component, (SentimentDetectorModel, SentimentDLModel)):
        return nlu.Classifier(model=component, nlp_ref='sentimentdl')
    if isinstance(component, ViveknSentimentModel):
        return nlu.Classifier(model=component, nlp_ref='vivekn')

    # Chunking, stop words and remaining matchers
    if isinstance(component, (Chunker, NGram)):
        return nlu.chunker.Chunker(model=component)
    if isinstance(component, ChunkEmbeddings):
        return embeddings_chunker.EmbeddingsChunker(model=component)
    if isinstance(component, StopWordsCleaner):
        return nlu.StopWordsCleaner(model=component)
    # Text/Regex/Date matchers were already handled above, only MultiDateMatcher can get here.
    if isinstance(component, MultiDateMatcher):
        return nlu.Matcher(model=component)

    # Seq2Seq
    if isinstance(component, T5Transformer):
        return nlu.Seq2Seq(annotator_class='t5', model=component)
    if isinstance(component, MarianTransformer):
        return nlu.Seq2Seq(annotator_class='marian', model=component)
    return None


def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None):
    '''
    Creates a list of NLU components from a Spark NLP pipeline reference.
    1. Download the pretrained pipeline, or load it from disk if a path is given
    2. Unpack the pipeline into its stages and wrap each stage as NLU component
    3. Return the list of NLU components

    Stages whose type cannot be inferred are wrapped as Normalizer and an error is logged.

    :param language: Language of the pipeline. Forced to 'xx' if nlp_ref contains 'language' (language detectors).
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param nlu_ref: NLU reference of the pipeline. Not read by this function.
    :param path: If not None, the pipeline is loaded from this location instead of being downloaded
    :return: Each stage of the Spark NLP pipeline wrapped as a NLU component inside of a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    if 'language' in nlp_ref:
        language = 'xx'  # special edge case for lang detectors
    if path is None:
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages

    constructed_components = []
    for component in iterable_stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)

        nlu_component = _convert_pipeline_stage(component, parsed)
        if nlu_component is None:
            # No exception is active here, so error() is used instead of exception().
            logger.error(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s and model %s during pipeline conversion,",
                language, nlp_ref, component)
            logger.info("USING DEFAULT ANNOTATOR TYPE Normalizer to fix issue")
            nlu_component = nlu.normalizer.Normalizer(model=component)
        constructed_components.append(nlu_component)
        logger.info("Extracted into NLU Component type : %s", parsed)

    # Defensive check kept for callers that test the result against None.
    if None in constructed_components:
        logger.error(
            "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
            language, nlp_ref)
        return None
    return constructed_components
def get_default_component_of_type(missing_component_type):
    '''
    Returns a default component for a missing component type.
    It is used to auto complete pipelines which are missing required components.
    These represent defaults for many applications and should be set wisely.

    :param missing_component_type: String which is either just the component type or
           componenttype@spark_nlp_reference, where the part after the first '@' refers to some
           pretrained embeddings or model
    :return: The default NLU component for the type if there is no '@' in missing_component_type, otherwise
             a component for that type built from the Spark NLP reference. None if the type cannot be
             resolved, and None for 'date' types that carry a reference.
    '''
    logger.info('Getting default for missing_component_type=%s', missing_component_type)
    # partition() splits on the first '@' only, so references containing further '@' do not raise.
    component_type, has_reference, sparknlp_reference = missing_component_type.partition('@')

    if not has_reference:
        # No '@' in the name: hand out the default model of the requested type.
        if 'token' in component_type:
            return nlu.components.tokenizer.Tokenizer("default_tokenizer")
        # Factories are lazy so only the requested component gets constructed.
        default_factories = {
            'document': lambda: Util('document_assembler'),
            'sentence': lambda: Util('sentence_detector'),
            'sentence_embeddings': lambda: Embeddings('use'),
            'word_embeddings': lambda: Embeddings(nlu_ref='glove'),
            'pos': lambda: Classifier(nlu_ref='pos'),
            'ner': lambda: Classifier(nlu_ref='ner'),
            'ner_converter': lambda: Util('ner_converter'),
            'chunk': lambda: nlu.chunker.Chunker(),
            'ngram': lambda: nlu.chunker.Chunker(nlu_ref='ngram'),
            'chunk_embeddings': lambda: embeddings_chunker.EmbeddingsChunker(),
            'unlabeled_dependency': lambda: UnlabledDepParser(),
            'labled_dependency': lambda: LabledDepParser('dep'),
            'date': lambda: nlu.Matcher('date'),
        }
        factory = default_factories.get(component_type)
        if factory is not None:
            return factory()
    else:
        # There is an '@' in the name: a specific pretrained model must be fetched via the Spark NLP
        # reference that follows the '@'.
        if 'embed' in component_type:
            return construct_component_from_identifier(
                language='en', component_type='embed', nlp_ref=sparknlp_reference)
        if 'pos' in component_type or 'ner' in component_type:
            return construct_component_from_identifier(
                language='en', component_type='classifier', nlp_ref=sparknlp_reference)
        # NOTE(review): currently unreachable, 'chunk_embeddings' already matches the 'embed' check above.
        # Confirm whether chunk embeddings with a reference should resolve to the EmbeddingsChunker instead.
        if 'chunk_embeddings' in component_type:
            return embeddings_chunker.EmbeddingsChunker()
        if 'unlabeled_dependency' in component_type or 'dep.untyped' in component_type:
            return UnlabledDepParser('dep.untyped')
        if 'labled_dependency' in component_type or 'dep.typed' in component_type:
            return LabledDepParser('dep.typed')
        if 'date' in component_type:
            return None

    # No exception is active here, so error() is used instead of exception().
    logger.error(
        "Could not resolve default component type for missing type=%s", component_type)
    return None