Example No. 1
def construct_trainable_component_from_identifier(nlu_ref, nlp_ref):
    '''
    This method returns a Spark NLP annotator Approach class wrapped as an NLU component
    :param nlu_ref: NLU reference to the trainable model
    :param nlp_ref: NLP reference to the trainable model
    :return: the trainable model wrapped as an NLU component
    '''

    logger.info(
        f'Creating trainable NLU component for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}'
    )
    try:
        if nlu_ref in [
                'train.deep_sentence_detector', 'train.sentence_detector'
        ]:
            # no label column, but still trainable?
            return nlu.NLUSentenceDetector(
                annotator_class='deep_sentence_detector', trainable=True)
        # The following spell checker and dependency parser references are
        # recognized but not implemented yet; they fall through and return None.
        if nlu_ref in ['train.context_spell', 'train.spell']:
            pass
        if nlu_ref in ['train.symmetric_spell']:
            pass
        if nlu_ref in ['train.norvig_spell']:
            pass
        if nlu_ref in ['train.unlabeled_dependency_parser']:
            pass
        if nlu_ref in ['train.labeled_dependency_parser']:
            pass
        if nlu_ref in ['train.classifier_dl', 'train.classifier']:
            return nlu.Classifier(annotator_class='classifier_dl',
                                  trainable=True)
        if nlu_ref in ['train.ner', 'train.named_entity_recognizer_dl']:
            return nlu.Classifier(annotator_class='ner', trainable=True)
        if nlu_ref in ['train.sentiment_dl', 'train.sentiment']:
            return nlu.Classifier(annotator_class='sentiment_dl',
                                  trainable=True)
        if nlu_ref in ['train.vivekn_sentiment']:
            pass
        if nlu_ref in ['train.pos']:
            return nlu.Classifier(annotator_class='pos', trainable=True)
        if nlu_ref in ['train.multi_classifier']:
            return nlu.Classifier(annotator_class='multi_classifier',
                                  trainable=True)
        if nlu_ref in ['train.word_seg', 'train.word_segmenter']:
            return nlu.Tokenizer(annotator_class='word_segmenter',
                                 trainable=True)

    except Exception:  # if the reference is not in the namespace and not a component, it would cause an unrecoverable crash
        logger.exception(
            f'EXCEPTION: Could not create trainable NLU component for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}'
        )
        return None
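
A minimal usage sketch for this variant (the reference below is illustrative; nlp_ref is only logged by this function, so an empty string suffices; assumes nlu is imported and Spark NLP is initialized):

# Hypothetical caller: build a trainable NER component from its NLU reference.
trainable_ner = construct_trainable_component_from_identifier(
    nlu_ref='train.ner', nlp_ref='')
if trainable_ner is None:
    print('Unknown or unimplemented trainable reference; see the log for details.')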
Example No. 2
def construct_component_from_pipe_identifier(language, sparknlp_reference):
    '''
    Creates a list of NLU components from a Spark NLP pipeline reference:
    1. download the pipeline
    2. unpack the pipeline into annotators and create a list of NLU components
    3. return the list of NLU components
    :param language: language of the pipeline
    :param sparknlp_reference: reference to a Spark NLP pretrained pipeline
    :return: each element of the Spark NLP pipeline wrapped as an NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in sparknlp_reference: language = 'xx'  # special edge case for language detectors
    pipe = PretrainedPipeline(sparknlp_reference, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed component key: %s", parsed)
        
        if 'NerConverter' in component.name: constructed_components.append(Util(component_name='ner_converter', model=component))
        elif parsed == 'match': constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document': constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence': constructed_components.append(nlu.Util(component_name='sentence_detector', model=component))  # todo differentiate normal and deep detector
        elif parsed == 'regex': constructed_components.append(nlu.Matcher(component_name='regex', model=component))
        elif parsed == 'text': constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'spell': constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer': constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer': constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer': constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif parsed == 'pos' or parsed == 'language': constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'word': constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel': constructed_components.append(nlu.Classifier(component_name='ner', model=component))
        elif parsed == 'dependency': constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed': constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'multi': constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'sentimentdlmodel': constructed_components.append(nlu.Classifier(model=component))
        elif parsed in ['universal', 'bert', 'albert', 'elmo', 'xlnet', 'glove', 'electra', 'covidbert', 'small_bert', '']: constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'vivekn': constructed_components.append(nlu.Classifier(component_name='vivekn', model=component))
        elif parsed == 'chunker': constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram': constructed_components.append(nlu.chunker.Chunker(model=component))
        elif '2e2' in parsed: constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'embeddings_chunk': constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords': constructed_components.append(nlu.StopWordsCleaner(model=component))
        
        logger.info("Extracted into NLU Component type : %s", parsed)
        if None in constructed_components:
            logger.exception("EXCEPTION: Could not infer component type for lang=%s and sparknlp_reference=%s during pipeline conversion.", language, sparknlp_reference)
            return None
    return constructed_components
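
The dispatch key parsed is the lowercased prefix of each stage's string representation, which is why both 'ner' and 'nerdlmodel' appear as cases above. A small sketch of that parse (the stage strings are illustrative; real Spark uids end in random suffixes):

# How the dispatch key is derived from a pipeline stage's string form:
for stage_str in ['sentence_detector_dl_a1b2', 'NerDLModel_c3d4', 'glove_100d_e5f6']:
    print(stage_str.split('_')[0].lower())  # -> sentence, nerdlmodel, glove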
Example No. 3
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref,path=None):
    '''
    Creates a list of NLU components from a Spark NLP pipeline reference:
    1. download the pipeline (or load it from disk if path is set)
    2. unpack the pipeline into annotators and create a list of NLU components
    3. return the list of NLU components
    :param nlu_ref: NLU reference to the pipeline
    :param language: language of the pipeline
    :param nlp_ref: reference to a Spark NLP pretrained pipeline
    :param path: if set, load the pipeline from this local path instead of downloading it
    :return: each element of the Spark NLP pipeline wrapped as an NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    from pyspark.ml import PipelineModel  # required for PipelineModel.load below
    if 'language' in nlp_ref: language = 'xx'  # special edge case for language detectors
    if path is None:
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []

    for component in iterable_stages:

        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed component key: %s", parsed)
        if isinstance(component, NerConverter):
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, PerceptronModel):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, (ClassifierDl, ClassifierDLModel)):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='use'))
        elif isinstance(component, BertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert'))
        elif isinstance(component, AlbertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='albert'))
        elif isinstance(component, XlnetEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='xlnet'))
        elif isinstance(component, WordEmbeddingsModel):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='glove'))
        elif isinstance(component, ElmoEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='elmo'))
        elif isinstance(component, BertSentenceEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert_sentence'))
        elif isinstance(component, TokenizerModel) and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif isinstance(component, TokenizerModel) and parsed == 'regex' :
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif isinstance(component, DocumentAssembler):
            constructed_components.append(nlu.Util(model=component))
        elif isinstance(component, SentenceDetectorDLModel):
            constructed_components.append(NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component))
        elif isinstance(component, SentenceDetector):  # the DL model is caught above, so only the pragmatic detector reaches this branch
            constructed_components.append(NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component))
        elif isinstance(component, RegexMatcherModel) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component, annotator_class='regex'))
        elif isinstance(component, TextMatcherModel):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='text'))
        elif isinstance(component, DateMatcher):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='date'))
        elif isinstance(component, ContextSpellCheckerModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context'))
        elif isinstance(component, SymmetricDeleteModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='symmetric'))
        elif isinstance(component, NorvigSweetingModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig'))
        elif isinstance(component, LemmatizerModel):
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif isinstance(component, NormalizerModel):
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif isinstance(component, Stemmer):
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif isinstance(component, (NerDLModel, NerCrfModel)):
            component.setIncludeConfidence(True)  # pipelines don't always extract confidences, so enable confidence extraction for every pipeline here
            constructed_components.append(nlu.Classifier(model=component, annotator_class='ner'))
        elif isinstance(component, LanguageDetectorDL):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='language_detector'))

        elif isinstance(component, DependencyParserModel):
            constructed_components.append(UnlabledDepParser(model=component))
        elif isinstance(component, TypedDependencyParserModel):
            constructed_components.append(LabledDepParser(model=component))
        elif isinstance(component, (SentimentDetectorModel, SentimentDLModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl'))
        elif isinstance(component, ViveknSentimentModel):  # SentimentDetectorModel is caught above, so only Vivekn reaches this branch
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn'))
        elif isinstance(component, Chunker):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, NGram):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, ChunkEmbeddings):
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif isinstance(component, StopWordsCleaner):
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        elif isinstance(component, MultiDateMatcher):  # the other matcher types and parsed == 'match' are caught above
            constructed_components.append(nlu.Matcher(model=component))
        elif isinstance(component, T5Transformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component))
        elif isinstance(component, MarianTransformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component))
        else:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion.")
            logger.info("USING DEFAULT ANNOTATOR TYPE Normalizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))

        logger.info(f"Extracted into NLU Component type : {parsed}", )
        if None in constructed_components:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,")
            return None
    return constructed_components
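
A minimal usage sketch covering both load paths (the pipeline reference and the local path are illustrative; assumes an active Spark session with Spark NLP):

# Download a pretrained pipeline by reference and convert its stages:
components = construct_component_from_pipe_identifier(
    language='en', nlp_ref='explain_document_dl', nlu_ref='en.explain.dl')

# Or convert a PipelineModel previously saved to disk (hypothetical path):
local_components = construct_component_from_pipe_identifier(
    language='en', nlp_ref='', nlu_ref='', path='/tmp/my_saved_pipeline')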
Example No. 4
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref):
    '''
    Creates a list of NLU components from a Spark NLP pipeline reference:
    1. download the pipeline
    2. unpack the pipeline into annotators and create a list of NLU components
    3. return the list of NLU components
    :param language: language of the pipeline
    :param nlp_ref: reference to a Spark NLP pretrained pipeline
    :param nlu_ref: NLU reference to the pipeline
    :return: each element of the Spark NLP pipeline wrapped as an NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in nlp_ref: language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(nlp_ref, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed component key: %s", parsed)
        c_name = component.__class__.__name__
        if c_name == 'NerConverter':
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif c_name == 'TokenizerModel' and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif c_name == 'TokenizerModel':
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            constructed_components.append(nlu.Util(annotator_class='sentence_detector', model=component))
        elif parsed == 'regex':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'date':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'text':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif c_name == 'PerceptronModel':
            constructed_components.append(nlu.Classifier(annotator_class='classifierdl', model=component))
        elif c_name == 'ClassifierDLModel':
            constructed_components.append(nlu.Classifier(annotator_class='language_detector', model=component))

        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            constructed_components.append(nlu.UnlabledDepParser(model=component))
        elif parsed == 'multi':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))

        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'embeddings_chunk':
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        else:
            logger.exception(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
                language, nlp_ref)
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))

        logger.info("Extracted into NLU Component type : %s", parsed)
        if None in constructed_components:
            logger.exception(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
                language, nlp_ref)
            return None
    return constructed_components
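
A minimal caller sketch showing the return contract: a list of NLU components on success, None if any stage could not be wrapped (the reference is illustrative; assumes Spark NLP is initialized):

components = construct_component_from_pipe_identifier('en', 'explain_document_ml', 'en.explain')
if components is None:
    raise ValueError('Pipeline conversion failed; check the log for the offending stage')
print(f'Converted {len(components)} Spark NLP stages into NLU components')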