예제 #1
0
    def __init__(self,
                 annotator_class='lemmatizer',
                 language='en',
                 component_type='lemmatizer',
                 get_default=False,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 is_licensed=False,
                 loaded_from_pretrained_pipe=False):
        """Wrap an existing lemmatizer model or load one from Spark NLP.

        If *model* is given it is used as-is; otherwise a SparkNLPLemmatizer
        is fetched -- the default model when *get_default* is True, else the
        pretrained model named by *nlp_ref* for *language*.
        """
        if model is not None:
            self.model = model
        elif 'lemma' in annotator_class:
            from nlu import SparkNLPLemmatizer
            if get_default:
                self.model = SparkNLPLemmatizer.get_default_model()
            else:
                self.model = SparkNLPLemmatizer.get_pretrained_model(
                    nlp_ref, language)
        # NOTE(review): if annotator_class lacks 'lemma' and no model is
        # passed, self.model is never set -- confirm callers guarantee this.
        SparkNLUComponent.__init__(
            self,
            annotator_class,
            component_type,
            loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #2
0
File: chunker.py  Project: JohnSnowLabs/nlu
    def __init__(self,
                 annotator_class='default_chunker',
                 language='en',
                 component_type='chunker',
                 get_default=True,
                 nlp_ref='',
                 nlu_ref='',
                 model=None,
                 lang='en',
                 loaded_from_pretrained_pipe=False,
                 is_licensed=False):
        """Wrap an existing chunker model or load one by annotator class.

        Supports 'default_chunker', 'ngram' and 'contextual_parser'.
        """
        if model is not None:
            self.model = model
        else:
            # None of these chunkers has pretrained variants, so the default
            # model is loaded regardless of get_default (the original code
            # had identical branches for both cases).
            if annotator_class == 'default_chunker':
                from nlu import DefaultChunker
                self.model = DefaultChunker.get_default_model()
            if annotator_class == 'ngram':
                from nlu import NGram
                self.model = NGram.get_default_model()
            if annotator_class == 'contextual_parser':
                from nlu.components.chunkers.contextual_parser.contextual_parser import ContextualParser
                self.model = ContextualParser.get_default_model()

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, lang,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #3
0
    def __init__(self,
                 annotator_class='relation_extractor',
                 lang='en',
                 component_type='relation_extractor',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 trainable=False,
                 is_licensed=False,
                 loaded_from_pretrained_pipe=False):
        """Wrap or load a clinical relation-extraction model.

        The concrete annotator class is inferred from *nlp_ref*: 're_'
        selects the statistical extractor, 'redl' the DL variant.
        """
        if 're_' in nlp_ref: annotator_class = 'relation_extractor'
        if 'redl' in nlp_ref: annotator_class = 'relation_extractor_dl'

        if model is not None:
            self.model = model
        elif annotator_class == 'relation_extractor':
            from nlu.components.relation_extractors.relation_extractor.relation_extractor import RelationExtraction
            if trainable:
                self.model = RelationExtraction.get_default_trainable_model()
            else:
                self.model = RelationExtraction.get_pretrained_model(
                    nlp_ref, lang, 'clinical/models')
        elif annotator_class == 'relation_extractor_dl':
            from nlu.components.relation_extractors.relation_extractor_dl.relation_extractor_dl import RelationExtractionDL
            # No trainable variant is exposed for the DL extractor yet.
            self.model = RelationExtractionDL.get_pretrained_model(
                nlp_ref, lang, 'clinical/models')

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, lang,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #4
0
    def __init__(self, annotator_class='default_tokenizer', language='en', component_type='tokenizer', get_default=True,
                 nlp_ref='', nlu_ref='', lang='en', model=None, is_licensed=False, loaded_from_pretrained_pipe=False):
        """Wrap or load a tokenizer.

        Switches to a word segmenter when the reference asks for word
        segmentation or the language needs a pretrained segmenter.
        """
        if 'segment_words' in nlu_ref:
            annotator_class = 'word_segmenter'
        elif 'token' in annotator_class and language in nlu.AllComponentsInfo().all_right_to_left_langs_with_pretrained_tokenizer:
            annotator_class = 'word_segmenter'

        if model is not None:
            self.model = model
        elif annotator_class == 'default_tokenizer':
            from nlu import DefaultTokenizer
            # There are no pretrained tokenizers, only the default one, so
            # get_default is irrelevant here.
            self.model = DefaultTokenizer.get_default_model()
        elif annotator_class == 'word_segmenter':
            from nlu import WordSegmenter
            if get_default and language == '':
                self.model = WordSegmenter.get_default_model()
            elif get_default and language != '':
                self.model = WordSegmenter.get_default_model_for_lang(language)
            else:
                self.model = WordSegmenter.get_pretrained_model(nlp_ref, language)

        # BUG FIX: loaded_from_pretrained_pipe was hard-coded to True in the
        # base call, silently ignoring the parameter; forward the real value.
        SparkNLUComponent.__init__(self, annotator_class, component_type, nlu_ref=nlu_ref, nlp_ref=nlp_ref,
                                   loaded_from_pretrained_pipe=loaded_from_pretrained_pipe, lang=lang)
예제 #5
0
 def __init__(self, annotator_class='chunk_embedder', language='en', component_type='embeddings_chunk', get_default=True, nlp_ref='', model=None, nlu_ref='', lang='en', loaded_from_pretrained_pipe=False):
     """Wrap an existing chunk-embedding model or load the default one.

     There is no pretrained chunk embedder, so the default model is used
     regardless of get_default (the original branches were identical).
     """
     if model is not None:
         self.model = model
     elif annotator_class == 'chunk_embedder':
         from nlu import ChunkEmbedder
         self.model = ChunkEmbedder.get_default_model()
     SparkNLUComponent.__init__(self, annotator_class, component_type, nlu_ref, nlp_ref)
예제 #6
0
File: matcher.py  Project: JohnSnowLabs/nlu
    def __init__(self,
                 annotator_class='date_matcher',
                 language='en',
                 component_type='matcher',
                 get_default=False,
                 nlp_ref='',
                 model=None,
                 nlu_ref='',
                 dataset='',
                 is_licensed=False,
                 loaded_from_pretrained_pipe=False):
        """Wrap or load a matcher (date, regex, text, or context parser).

        The matcher flavour is inferred from nlp_ref/nlu_ref.
        """
        if 'date' in nlp_ref or 'date' in nlu_ref:
            annotator_class = 'date_matcher'
        elif 'regex' in nlp_ref or 'regex' in nlu_ref:
            annotator_class = 'regex_matcher'
        elif 'context' in nlu_ref:
            annotator_class = 'context_parser'
        elif 'text' in nlp_ref or 'text' in nlu_ref:
            annotator_class = 'text_matcher'
        elif '_matcher' not in annotator_class:
            annotator_class = annotator_class + '_matcher'
        if model is not None:
            self.model = model
        elif 'context' in annotator_class:
            from nlu.components.matchers.context_parser.context_parser import ContextParser
            # Context parser is a licensed component; only a default model
            # exists (both original branches loaded it).
            is_licensed = True
            self.model = ContextParser.get_default_model()
        elif 'text' in annotator_class:
            from nlu import TextMatcher
            if get_default or nlp_ref == 'text_matcher':
                self.model = TextMatcher.get_default_model()
            else:
                self.model = TextMatcher.get_pretrained_model(
                    nlp_ref, language)
        elif 'date' in annotator_class:
            # Only a default date matcher exists; the unused shadowed
            # `from nlu import DateMatcher` import was removed.
            from nlu.components.matchers.date_matcher.date_matcher import DateMatcher as DateM
            self.model = DateM.get_default_model()
        elif 'regex' in annotator_class:
            from nlu import RegexMatcher
            if get_default:
                self.model = RegexMatcher.get_default_model()
            else:
                # NOTE(review): this passes nlu_ref, unlike the text branch
                # which passes nlp_ref -- confirm intended.
                self.model = RegexMatcher.get_pretrained_model(
                    nlu_ref, language)

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, language,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #7
0
    def __init__(self, annotator_class='deidentifier', lang='en', component_type='deidentifier', get_default=False, model=None, nlp_ref='', nlu_ref='', trainable=False, is_licensed=True, loaded_from_pretrained_pipe=False):
        """Wrap or load a de-identification model (licensed component).

        annotator_class is always forced to 'deidentifier', so the original
        inner check on it was dead code and has been removed.
        """
        annotator_class = 'deidentifier'
        if model is not None:
            self.model = model
        else:
            from nlu.components.deidentifiers.deidentifier.deidentifier import Deidentifier
            if get_default:
                self.model = Deidentifier.get_default_model()
            else:
                self.model = Deidentifier.get_pretrained_model(nlp_ref, lang)

        SparkNLUComponent.__init__(self, annotator_class, component_type, nlu_ref, nlp_ref, lang, loaded_from_pretrained_pipe, is_licensed)
예제 #8
0
    def __init__(self,
                 annotator_class='document_assembler',
                 component_type='util',
                 model=None,
                 loaded_from_pretrained_pipe=False,
                 nlu_ref='',
                 nlp_ref='',
                 lang='en',
                 is_licensed=False):
        """Wrap or load a utility component (assemblers, detectors,
        converters, mergers, doc/chunk transformers).

        Every utility here has only a default model.
        """
        if annotator_class == 'ner_converter':
            annotator_class = 'ner_to_chunk_converter'
        if model is not None:
            self.model = model
        elif annotator_class == 'document_assembler':
            from nlu import SparkNlpDocumentAssembler
            self.model = SparkNlpDocumentAssembler.get_default_model()
        elif annotator_class == 'deep_sentence_detector':
            from nlu import SentenceDetectorDeep
            self.model = SentenceDetectorDeep.get_default_model()
        elif annotator_class == 'sentence_detector':
            from nlu import SparkNLPSentenceDetector
            self.model = SparkNLPSentenceDetector.get_default_model()
        elif annotator_class == 'ner_to_chunk_converter':
            from nlu import NerToChunkConverter
            self.model = NerToChunkConverter.get_default_model()
        elif annotator_class == 'sentence_embeddings':
            from nlu import SparkNLPSentenceEmbeddings
            self.model = SparkNLPSentenceEmbeddings.get_default_model()
        elif annotator_class == 'feature_assembler':
            from nlu.components.utils.feature_assembler.feature_assembler import SparkNLPFeatureAssembler
            self.model = SparkNLPFeatureAssembler.get_default_model()
        elif annotator_class == 'ner_to_chunk_converter_licensed':
            from nlu.components.utils.ner_to_chunk_converter_licensed.ner_to_chunk_converter_licensed import NerToChunkConverterLicensed
            self.model = NerToChunkConverterLicensed.get_default_model()
        elif annotator_class == 'chunk_merger':
            from nlu.components.utils.chunk_merger.chunk_merger import ChunkMerger
            self.model = ChunkMerger.get_default_model()
        elif annotator_class == 'doc2chunk':
            from nlu.components.utils.doc2chunk.doc_2_chunk import Doc_2_Chunk
            self.model = Doc_2_Chunk.get_default_model()
        elif annotator_class == 'chunk_2_doc':
            from nlu.components.utils.chunk_2_doc.doc_2_chunk import Chunk_2_Doc
            self.model = Chunk_2_Doc.get_default_model()

        # NOTE(review): nlu_ref then lang are passed positionally here,
        # while sibling components pass nlu_ref, nlp_ref, lang -- confirm
        # this matches SparkNLUComponent.__init__'s signature.
        SparkNLUComponent.__init__(
            self,
            annotator_class,
            component_type,
            nlu_ref,
            lang,
            loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #9
0
    def __init__(self,
                 annotator_class='context_spell',
                 language='en',
                 component_type='spell_checker',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 dataset='',
                 nlu_ref='',
                 is_licensed=False,
                 loaded_from_pretrained_pipe=True):
        """Wrap or load a spell checker (context, Norvig, or symmetric).

        The concrete checker is inferred from nlu_ref/nlp_ref; licensed
        context models are fetched from the clinical bucket.
        """
        if 'context' in nlu_ref: annotator_class = 'context_spell'
        elif 'norvig' in nlu_ref: annotator_class = 'norvig_spell'
        elif 'spellcheck_dl' in nlp_ref: annotator_class = 'context_spell'
        elif 'spell.med' in nlu_ref: annotator_class = 'context_spell'
        elif 'spell.clinical' in nlu_ref: annotator_class = 'context_spell'
        elif '.med' in nlu_ref: annotator_class = 'context_spell'

        if model is not None:
            self.model = model
        elif 'context' in annotator_class:
            from nlu import ContextSpellChecker
            if is_licensed:
                self.model = ContextSpellChecker.get_pretrained_model(
                    nlp_ref, language, bucket='clinical/models')
            elif get_default:
                self.model = ContextSpellChecker.get_default_model()
            else:
                self.model = ContextSpellChecker.get_pretrained_model(
                    nlp_ref, language)
        elif 'norvig' in annotator_class:
            from nlu import NorvigSpellChecker
            if get_default:
                self.model = NorvigSpellChecker.get_default_model()
            else:
                self.model = NorvigSpellChecker.get_pretrained_model(
                    nlp_ref, language)
        elif 'symmetric' in annotator_class:
            from nlu import SymmetricSpellChecker
            if get_default:
                self.model = SymmetricSpellChecker.get_default_model()
            else:
                self.model = SymmetricSpellChecker.get_pretrained_model(
                    nlp_ref, language)

        SparkNLUComponent.__init__(
            self,
            annotator_class,
            component_type,
            loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #10
0
    def __init__(self,
                 annotator_class='sentiment_dl',
                 component_type='classifier',
                 model=None):
        """Wrap a sentiment classifier and align its columns with NLU defaults."""
        self.model = model
        SparkNLUComponent.__init__(self, annotator_class, component_type)
        # Mirror the NLU component info onto the wrapped model: a single
        # column is passed as a scalar, multiple columns as a list.
        # NOTE(review): assumes model is not None at this point -- confirm.
        in_cols = self.info.spark_input_column_names
        model.setInputCols(in_cols[0] if len(in_cols) == 1 else in_cols)
        out_cols = self.info.spark_output_column_names
        model.setOutputCol(out_cols[0] if len(out_cols) == 1 else out_cols)
예제 #11
0
 def __init__(self,
              annotator_class='stemmer',
              component_type='stemmer',
              model=None,
              nlu_ref='',
              nlp_ref='',
              loaded_from_pretrained_pipe=False):
     """Wrap an existing stemmer model or load the Spark NLP default."""
     if model is not None:
         self.model = model
     elif annotator_class == 'stemmer':
         from nlu import SparkNLPStemmer
         self.model = SparkNLPStemmer.get_default_model()
     SparkNLUComponent.__init__(
         self,
         annotator_class,
         component_type,
         loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #12
0
    def __init__(self,
                 annotator_class='normalizer',
                 language='en',
                 component_type='normalizer',
                 get_default=True,
                 nlp_ref='',
                 nlu_ref='',
                 model=None,
                 is_licensed=False,
                 loaded_from_pretrained_pipe=False):
        """Wrap or load a token, document, or drug normalizer.

        When no model is passed, nlu_ref decides which normalizer flavour
        to load (the reclassification intentionally happens only then).
        """
        if model is not None:
            self.model = model
        else:
            if 'norm_document' in nlu_ref:
                annotator_class = 'document_normalizer'
            elif 'drug' in nlu_ref:
                annotator_class = 'drug_normalizer'
            elif 'norm' in nlu_ref:
                annotator_class = 'normalizer'

            if annotator_class == 'normalizer':
                from nlu import SparkNLPNormalizer
                if get_default:
                    self.model = SparkNLPNormalizer.get_default_model()
                else:
                    # There is no pretrained API for Normalizer in Spark NLP yet.
                    self.model = SparkNLPNormalizer.get_pretrained_model(
                        nlp_ref, language)
            elif annotator_class == 'document_normalizer':
                from nlu import SparkNLPDocumentNormalizer
                if get_default:
                    self.model = SparkNLPDocumentNormalizer.get_default_model()
                else:
                    self.model = SparkNLPDocumentNormalizer.get_pretrained_model(
                        nlp_ref, language)
            elif annotator_class == 'drug_normalizer':
                from nlu.components.normalizers.drug_normalizer.drug_normalizer import DrugNorm
                is_licensed = True
                # NOTE(review): when get_default is False, self.model is
                # never set here -- confirm callers never hit that path.
                if get_default: self.model = DrugNorm.get_default_model()

        SparkNLUComponent.__init__(
            self,
            annotator_class,
            component_type,
            loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #13
0
    def __init__(self, annotator_class='t5', language='en', component_type='seq2seq', get_default=True, model=None, nlp_ref='', nlu_ref='', dataset='', configs='', is_licensed=False, loaded_from_pretrained_pipe=False):
        """Wrap or load a seq2seq model: T5 or the Marian translator.

        Any 'translate_to' reference maps to Marian; licensed T5 models are
        fetched from the clinical bucket.
        """
        if 't5' in nlu_ref or 't5' in nlp_ref:
            annotator_class = 't5'
        elif 'marian' in nlu_ref or 'marian' in nlp_ref:
            annotator_class = 'marian'
        elif 'translate_to' in nlu_ref or 'translate_to' in nlp_ref or 'translate_to' in annotator_class:
            annotator_class = 'marian'

        if model is not None:
            self.model = model
        elif 't5' in annotator_class:
            from nlu import T5
            if is_licensed:
                self.model = T5.get_pretrained_model(nlp_ref, language, bucket='clinical/models')
            elif get_default:
                self.model = T5.get_default_model()
            elif configs != '':
                self.model = T5.get_preconfigured_model(nlp_ref, language, configs)
            else:
                self.model = T5.get_pretrained_model(nlp_ref, language)
        elif 'marian' in annotator_class:
            from nlu import Marian
            if get_default:
                self.model = Marian.get_default_model()
            else:
                self.model = Marian.get_pretrained_model(nlp_ref, language)
        SparkNLUComponent.__init__(self, annotator_class, component_type, loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #14
0
    def __init__(self,
                 annotator_class='sentence_entity_resolver',
                 language='en',
                 component_type='resolution',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 trainable=False,
                 is_licensed=True,
                 loaded_from_pretrained_pipe=False):
        """Wrap or load a sentence- or chunk-level entity resolver.

        nlu_ref decides the level: 'resolve_chunk' selects the chunk
        resolver, any other 'resolve' reference the sentence resolver.
        """
        if 'resolve' in nlu_ref and 'resolve_chunk' not in nlu_ref:
            annotator_class = 'sentence_entity_resolver'
        if 'resolve_chunk' in nlu_ref:
            annotator_class = 'chunk_entity_resolver'

        if model is not None:
            self.model = model
        elif annotator_class == 'sentence_entity_resolver':
            from nlu.components.resolutions.sentence_entity_resolver.sentence_resolver import SentenceResolver
            if trainable:
                self.model = SentenceResolver.get_default_trainable_model()
            elif get_default:
                self.model = SentenceResolver.get_default_model()
            else:
                self.model = SentenceResolver.get_pretrained_model(
                    nlp_ref, language)
        elif annotator_class == 'chunk_entity_resolver':
            from nlu.components.resolutions.chunk_entity_resolver.chunk_resolver import ChunkResolver
            if trainable:
                self.model = ChunkResolver.get_default_trainable_model()
            elif get_default:
                self.model = ChunkResolver.get_default_model()
            else:
                self.model = ChunkResolver.get_pretrained_model(
                    nlp_ref, language)

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, language,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #15
0
 def __init__(self,
              annotator_class='sentence_detector',
              language='en',
              component_type='sentence_detector',
              get_default=True,
              model=None,
              nlp_ref='',
              nlu_ref='',
              trainable=False,
              is_licensed=False,
              lang='en',
              loaded_from_pretrained_pipe=False):
     """Wrap or load a sentence detector.

     The deep detector is the default; a 'pragmatic' nlu_ref (or a
     non-default annotator_class) selects the pragmatic detector.
     """
     if annotator_class == 'sentence_detector' and 'pragmatic' not in nlu_ref:
         annotator_class = 'deep_sentence_detector'  # default
     else:
         annotator_class = 'pragmatic_sentence_detector'
     if model is not None:
         self.model = model
     elif annotator_class == 'deep_sentence_detector' or 'ner_dl' in nlp_ref:
         from nlu import SentenceDetectorDeep
         if trainable:
             self.model = SentenceDetectorDeep.get_trainable_model()
         elif get_default:
             self.model = SentenceDetectorDeep.get_default_model()
         else:
             self.model = SentenceDetectorDeep.get_pretrained_model(
                 nlp_ref, language)
     elif annotator_class == 'pragmatic_sentence_detector':
         from nlu import PragmaticSentenceDetector
         if get_default:
             self.model = PragmaticSentenceDetector.get_default_model()
     # NOTE(review): the base call passes nlu_ref then lang positionally,
     # while sibling components also pass nlp_ref in between -- confirm
     # against SparkNLUComponent.__init__'s signature.
     SparkNLUComponent.__init__(
         self,
         annotator_class,
         component_type,
         nlu_ref,
         lang,
         loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #16
0
    def __init__(self,
                 annotator_class='assertion_dl',
                 lang='en',
                 component_type='assertion',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 trainable=False,
                 is_licensed=False,
                 loaded_from_pretrained_pipe=False):
        """Wrap or load an assertion-status model (DL or logistic regression)."""
        # 'jsl' references always map to the DL assertion model.
        if 'jsl' in nlu_ref: annotator_class = 'assertion_dl'

        if model is not None:
            self.model = model
        elif annotator_class == 'assertion_dl':
            from nlu.components.assertions.assertion_dl.assertion_dl import AssertionDL
            if trainable:
                self.model = AssertionDL.get_default_trainable_model()
            elif get_default:
                self.model = AssertionDL.get_default_model()
            else:
                self.model = AssertionDL.get_pretrained_model(nlp_ref, lang)
        elif annotator_class == 'assertion_log_reg':
            from nlu.components.assertions.assertion_log_reg.assertion_log_reg import AssertionLogReg
            if trainable:
                self.model = AssertionLogReg.get_default_trainable_model()
            elif get_default:
                self.model = AssertionLogReg.get_default_model()
            else:
                self.model = AssertionLogReg.get_pretrained_model(
                    nlp_ref, lang)

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, lang,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #17
0
    def __init__(self,
                 annotator_class='unlabeled_dependency_parser',
                 language='en',
                 component_type='dependency_untyped',
                 get_default=True,
                 nlp_ref='',
                 nlu_ref='',
                 model=None,
                 loaded_from_pretrained_pipe=False,
                 is_licensed=False):
        """Wrap or load an unlabeled (untyped) dependency parser."""
        # NOTE(review): the 'dep.untyped' test is redundant -- any string
        # containing it also contains 'dep'; kept for fidelity.
        if model is not None:
            self.model = model
        elif ('dep' in annotator_class or 'dep.untyped' in annotator_class
              or annotator_class == 'unlabeled_dependency_parser'):
            from nlu.components.dependency_untypeds.unlabeled_dependency_parser.unlabeled_dependency_parser import UnlabeledDependencyParser
            if get_default:
                self.model = UnlabeledDependencyParser.get_default_model()
            else:
                self.model = UnlabeledDependencyParser.get_pretrained_model(
                    nlp_ref, language)

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, language,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #18
0
    def __init__(self,
                 annotator_class='stopwordcleaner',
                 lang='en',
                 component_type='stopwordscleaner',
                 get_default=False,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 loaded_from_pretrained_pipe=False,
                 is_licensed=False):
        """Wrap an existing stop-word cleaner or load one from Spark NLP."""
        if model is not None:
            self.model = model
        elif 'stop' in annotator_class:
            from nlu import NLUStopWordcleaner
            if get_default:
                self.model = NLUStopWordcleaner.get_default_model()
            else:
                self.model = NLUStopWordcleaner.get_pretrained_model(
                    nlp_ref, lang)
        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, lang,
                                   loaded_from_pretrained_pipe, is_licensed)
예제 #19
0
    def __init__(self,
                 annotator_class='glove',
                 lang='en',
                 component_type='embedding',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 is_licensed=False,
                 resolution_ref='',
                 loaded_from_pretrained_pipe=False,
                 do_ref_checks=True):
        if do_ref_checks:
            if 'use' in nlu_ref and 'bert' not in nlu_ref or 'tfhub_use' in nlp_ref and 'bert' not in nlp_ref:
                annotator_class = 'use'
                # first check for sentence then token embeddings.
            elif 'longformer' in nlu_ref:
                annotator_class = 'longformer'
            elif 'doc2vec' in nlu_ref:
                annotator_class = 'doc2vec'

            elif 'sent' in nlu_ref and 'xlm_roberta' in nlu_ref:
                annotator_class = 'sentence_xlm'
            elif 'xlm' in nlu_ref or 'xlm' in nlp_ref:
                annotator_class = 'xlm'
            elif 'roberta' in nlu_ref or 'roberta' in nlp_ref:
                annotator_class = 'roberta'
            elif 'distil' in nlu_ref or 'distil' in nlp_ref:
                annotator_class = 'distil_bert'

            elif 'bert' in nlp_ref and 'albert' not in nlp_ref and 'sent' in nlp_ref:
                annotator_class = 'sentence_bert'
            elif 'bert' in nlu_ref and 'albert' not in nlu_ref and 'sent' in nlu_ref:
                annotator_class = 'sentence_bert'
            elif 'bert' in nlu_ref and 'use' in nlu_ref:
                annotator_class = 'sentence_bert'

            elif 'elmo' in nlp_ref:
                annotator_class = 'elmo'
            elif 'elmo' in nlu_ref:
                annotator_class = 'elmo'

            elif 'electra' in nlp_ref and 'sent' in nlp_ref:
                annotator_class = 'sentence_bert'
            elif 'electra' in nlu_ref and 'sent' in nlu_ref:
                annotator_class = 'sentence_bert'

            elif 'bert' in nlu_ref and 'albert' not in nlu_ref:
                annotator_class = 'bert'
            elif 'bert' in nlp_ref and 'albert' not in nlp_ref:
                annotator_class = 'bert'

            elif 'electra' in nlu_ref or 'electra' in nlp_ref:
                annotator_class = 'bert'
            elif 'labse' in nlu_ref or 'labse' in nlp_ref:
                annotator_class = 'sentence_bert'

            elif 'tfhub' in nlu_ref or 'tfhub' in nlp_ref:
                annotator_class = 'use'
            elif 'glove' in nlu_ref or 'glove' in nlp_ref:
                annotator_class = 'glove'
            elif 'cc_300d' in nlu_ref or 'cc_300d' in nlp_ref:
                annotator_class = 'glove'

            elif 'albert' in nlu_ref or 'albert' in nlp_ref:
                annotator_class = 'albert'
            elif 'xlnet' in nlu_ref or 'xlnet' in nlp_ref:
                annotator_class = 'xlnet'

                # Default component models for nlu actions that dont specify a particular model
            elif 'embed_sentence' in nlu_ref:
                annotator_class = 'glove'
            elif 'embed' in nlu_ref:
                annotator_class = 'glove'

        if model != None: self.model = model
        else:

            # Check if this lang has embeddings, if NOT set to multi lang xx!
            multi_lang_embeds = ['th']
            if lang in multi_lang_embeds: lang = 'xx'
            if 'sentence_xlm' == annotator_class:
                from nlu import Sentence_XLM
                if get_default: self.model = Sentence_XLM.get_default_model()
                else:
                    self.model = Sentence_XLM.get_pretrained_model(
                        nlp_ref, lang)
            elif 'doc2vec' == annotator_class:
                from nlu import Doc2Vec
                if get_default: self.model = Doc2Vec.get_default_model()
                else: self.model = Doc2Vec.get_pretrained_model(nlp_ref, lang)
            elif 'longformer' == annotator_class:
                from nlu import Longformer
                if get_default: self.model = Longformer.get_default_model()
                else:
                    self.model = Longformer.get_pretrained_model(nlp_ref, lang)
            elif 'xlm' == annotator_class:
                from nlu import XLM
                if get_default: self.model = XLM.get_default_model()
                else: self.model = XLM.get_pretrained_model(nlp_ref, lang)
            elif 'roberta' == annotator_class:
                from nlu import Roberta
                if get_default: self.model = Roberta.get_default_model()
                else: self.model = Roberta.get_pretrained_model(nlp_ref, lang)
            elif 'distil_bert' == annotator_class:
                from nlu import DistilBert
                if get_default: self.model = DistilBert.get_default_model()
                else:
                    self.model = DistilBert.get_pretrained_model(nlp_ref, lang)
            elif 'albert' == annotator_class:
                from nlu import SparkNLPAlbert
                if get_default: self.model = SparkNLPAlbert.get_default_model()
                else:
                    self.model = SparkNLPAlbert.get_pretrained_model(
                        nlp_ref, lang)
            elif 'bert' in annotator_class and 'sent' in annotator_class:
                from nlu import BertSentence
                if get_default: self.model = BertSentence.get_default_model()
                elif is_licensed:
                    self.model = BertSentence.get_pretrained_model(
                        nlp_ref, lang, 'clinical/models')
                else:
                    self.model = BertSentence.get_pretrained_model(
                        nlp_ref, lang)
            elif 'electra' in annotator_class and 'sent' in annotator_class:
                from nlu import BertSentence
                if get_default: self.model = BertSentence.get_default_model()
                elif is_licensed:
                    self.model = BertSentence.get_pretrained_model(
                        nlp_ref, lang, 'clinical/models')
                else:
                    self.model = BertSentence.get_pretrained_model(
                        nlp_ref, lang)
            elif 'bert' in annotator_class:
                from nlu import SparkNLPBert
                if get_default: self.model = SparkNLPBert.get_default_model()
                elif is_licensed:
                    self.model = SparkNLPBert.get_pretrained_model(
                        nlp_ref, lang, 'clinical/models')
                else:
                    self.model = SparkNLPBert.get_pretrained_model(
                        nlp_ref, lang)
            elif 'elmo' in annotator_class:
                from nlu import SparkNLPElmo
                if get_default: self.model = SparkNLPElmo.get_default_model()
                else:
                    self.model = SparkNLPElmo.get_pretrained_model(
                        nlp_ref, lang)
            elif 'xlnet' in annotator_class:
                from nlu import SparkNLPXlnet
                if get_default: self.model = SparkNLPXlnet.get_default_model()
                else:
                    self.model = SparkNLPXlnet.get_pretrained_model(
                        nlp_ref, lang)
            elif 'use' in annotator_class:
                from nlu import SparkNLPUse
                if get_default: self.model = SparkNLPUse.get_default_model()
                else:
                    self.model = SparkNLPUse.get_pretrained_model(
                        nlp_ref, lang)
            elif 'glove' in annotator_class:
                from nlu import Glove
                if annotator_class == 'glove' and get_default == True:
                    self.model = Glove.get_default_model()
                else:
                    if get_default: self.model = Glove.get_default_model()
                    elif is_licensed:
                        self.model = Glove.get_pretrained_model(
                            nlp_ref, lang, 'clinical/models')
                    else:
                        if nlp_ref == 'glove_840B_300' or nlp_ref == 'glove_6B_300':
                            # if lang=='en' and nlp_ref=='glove_6B_300': #special case
                            lang = 'xx'  # For these particular Glove embeddings, anyreference to them is actually the reference to the multilingual onces
                            self.model = Glove.get_pretrained_model(
                                nlp_ref, lang)
                        else:
                            self.model = Glove.get_pretrained_model(
                                nlp_ref, lang)

        SparkNLUComponent.__init__(
            self,
            annotator_class,
            component_type,
            nlu_ref,
            nlp_ref,
            lang,
            loaded_from_pretrained_pipe=loaded_from_pretrained_pipe)
예제 #20
0
    def __init__(self,
                 annotator_class='sentiment_dl',
                 language='en',
                 component_type='classifier',
                 get_default=True,
                 model=None,
                 nlp_ref='',
                 nlu_ref='',
                 trainable=False,
                 is_licensed=False,
                 do_ref_checks=True,
                 loaded_from_pretrained_pipe=False):
        """Resolve and load the concrete classifier model for this component.

        When ``do_ref_checks`` is True, ``annotator_class`` is first
        re-resolved by substring checks against ``nlu_ref``/``nlp_ref``
        (order matters: more specific tokens such as ``token_distilroberta``
        are tested before ``token_roberta``). Then either the supplied
        ``model`` is wrapped directly, or a default / pretrained / trainable
        model is fetched for the resolved annotator class.

        :param annotator_class: initial classifier family; may be overridden
            by the reference checks below
        :param language: language code used when fetching pretrained models
        :param component_type: NLU component category (kept as 'classifier')
        :param get_default: load the default model when no specific
            ``nlp_ref`` should be resolved
        :param model: pre-constructed model to wrap; skips resolution if set
        :param nlp_ref: Spark NLP model reference string
        :param nlu_ref: NLU-level model reference string
        :param trainable: load an untrained, trainable model instead of a
            pretrained one (where supported)
        :param is_licensed: fetch from the licensed 'clinical/models' bucket
        :param do_ref_checks: enable annotator_class re-resolution from refs
        :param loaded_from_pretrained_pipe: forwarded to SparkNLUComponent
        """
        if do_ref_checks:
            if 'e2e' in nlu_ref or 'toxic' in nlu_ref:
                annotator_class = 'multi_classifier'
            elif 'e2e' in nlp_ref or 'toxic' in nlp_ref:
                annotator_class = 'multi_classifier'
            elif 'distilbert_sequence' in nlp_ref or 'distilbert_sequence' in nlu_ref:
                annotator_class = 'seq_distilbert'
            elif 'bert_sequence' in nlp_ref or 'bert_sequence' in nlu_ref:
                annotator_class = 'seq_bert'
            elif 'token_bert' in nlp_ref or 'token_bert' in nlu_ref:
                annotator_class = 'token_bert'
            elif 'token_distilbert' in nlp_ref or 'token_distilbert' in nlu_ref:
                annotator_class = 'token_distilbert'
            elif 'token_distilroberta' in nlp_ref or 'token_distilroberta' in nlu_ref:
                annotator_class = 'token_roberta'
            elif 'token_xlm_roberta' in nlp_ref or 'token_xlm_roberta' in nlu_ref:
                annotator_class = 'token_xlm_roberta'
            elif 'token_roberta' in nlp_ref or 'token_roberta' in nlu_ref:
                annotator_class = 'token_roberta'
            elif 'token_albert' in nlp_ref or 'token_albert' in nlu_ref:
                annotator_class = 'token_albert'
            elif 'token_xlnet' in nlp_ref or 'token_xlnet' in nlu_ref:
                annotator_class = 'token_xlnet'
            elif 'token_longformer' in nlp_ref or 'token_longformer' in nlu_ref:
                annotator_class = 'token_longformer'
            elif 'multiclassifierdl' in nlp_ref:
                annotator_class = 'multi_classifier'
            elif 'classifierdl' in nlp_ref:
                annotator_class = 'classifier_dl'
            elif 'yake' in nlu_ref:
                annotator_class = 'yake'
            elif 'yake' in nlp_ref:
                annotator_class = 'yake'
            elif 'sentimentdl' in nlp_ref:
                annotator_class = 'sentiment_dl'

            # BUGFIX: first operand used to test nlp_ref twice, so a
            # 'vivekn' nlu_ref alone never resolved to vivekn_sentiment.
            elif 'vivekn' in nlu_ref or 'vivekn' in nlp_ref:
                annotator_class = 'vivekn_sentiment'

            elif 'wiki_' in nlu_ref or 'wiki_' in nlp_ref:
                annotator_class = 'language_detector'
            elif 'pos' in nlu_ref and 'ner' not in nlu_ref:
                annotator_class = 'pos'
            elif 'pos' in nlp_ref and 'ner' not in nlp_ref:
                annotator_class = 'pos'
            elif 'icd' in nlu_ref and 'med_ner' not in nlu_ref:
                annotator_class = 'classifier_dl'
            elif 'med_ner' in nlu_ref:
                annotator_class = 'ner_healthcare'
            elif 'generic_classifier' in nlu_ref:
                annotator_class = 'generic_classifier'
            elif 'ner' in nlu_ref and 'generic' not in nlu_ref:
                annotator_class = 'ner'
            elif 'ner' in nlp_ref and 'generic' not in nlp_ref:
                annotator_class = 'ner'

        if model is not None:
            # A concrete model was supplied; wrap it and, for NER models,
            # make sure prediction confidences are emitted.
            self.model = model
            from sparknlp.annotator import NerDLModel, NerCrfModel
            if isinstance(self.model, (NerDLModel, NerCrfModel)):
                self.model.setIncludeConfidence(True)
            elif is_licensed:
                from sparknlp_jsl.annotator import MedicalNerModel
                if isinstance(self.model, MedicalNerModel):
                    self.model.setIncludeConfidence(True)
        else:
            # No model supplied: fetch default/pretrained/trainable model
            # for the resolved annotator class. Licensed models come from
            # the 'clinical/models' bucket.
            if 'seq_distilbert' == annotator_class:
                from nlu import SeqDilstilBertClassifier
                if get_default:
                    self.model = SeqDilstilBertClassifier.get_default_model()
                elif is_licensed:
                    self.model = SeqDilstilBertClassifier.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = SeqDilstilBertClassifier.get_pretrained_model(
                        nlp_ref, language)
            elif 'seq_bert' == annotator_class:
                from nlu import SeqBertClassifier
                if get_default:
                    self.model = SeqBertClassifier.get_default_model()
                elif is_licensed:
                    self.model = SeqBertClassifier.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = SeqBertClassifier.get_pretrained_model(
                        nlp_ref, language)
            elif 'sentiment' in annotator_class and 'vivekn' not in annotator_class:
                from nlu import SentimentDl
                if trainable:
                    self.model = SentimentDl.get_default_trainable_model()
                elif is_licensed:
                    self.model = SentimentDl.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')
                elif get_default:
                    self.model = SentimentDl.get_default_model()
                else:
                    self.model = SentimentDl.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_distilbert' == annotator_class:
                from nlu import TokenDistilBert
                if get_default:
                    self.model = TokenDistilBert.get_default_model()
                elif is_licensed:
                    self.model = TokenDistilBert.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenDistilBert.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_bert' == annotator_class:
                from nlu import TokenBert
                if get_default:
                    self.model = TokenBert.get_default_model()
                elif is_licensed:
                    self.model = TokenBert.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenBert.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_xlm_roberta' == annotator_class:
                from nlu import TokenXlmRoBerta
                if get_default:
                    self.model = TokenXlmRoBerta.get_default_model()
                elif is_licensed:
                    self.model = TokenXlmRoBerta.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenXlmRoBerta.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_roberta' == annotator_class:
                from nlu import TokenRoBerta
                if get_default:
                    self.model = TokenRoBerta.get_default_model()
                elif is_licensed:
                    self.model = TokenRoBerta.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenRoBerta.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_albert' == annotator_class:
                from nlu import TokenAlbert
                if get_default:
                    self.model = TokenAlbert.get_default_model()
                elif is_licensed:
                    self.model = TokenAlbert.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenAlbert.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_longformer' == annotator_class:
                from nlu import TokenLongFormer
                if get_default:
                    self.model = TokenLongFormer.get_default_model()
                elif is_licensed:
                    self.model = TokenLongFormer.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenLongFormer.get_pretrained_model(
                        nlp_ref, language)
            elif 'token_xlnet' == annotator_class:
                from nlu import TokenXlnet
                if get_default:
                    self.model = TokenXlnet.get_default_model()
                elif is_licensed:
                    self.model = TokenXlnet.get_pretrained_model(
                        nlp_ref, language, 'clinical/models')
                else:
                    self.model = TokenXlnet.get_pretrained_model(
                        nlp_ref, language)
            elif 'generic_classifier' in annotator_class:
                from nlu.components.classifiers.generic_classifier.generic_classifier import GenericClassifier
                if trainable:
                    self.model = GenericClassifier.get_default_trainable_model(
                    )
                else:
                    self.model = GenericClassifier.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')
            elif 'vivekn' in annotator_class:
                from nlu import ViveknSentiment
                if get_default:
                    self.model = ViveknSentiment.get_default_model()
                else:
                    self.model = ViveknSentiment.get_pretrained_model(
                        nlp_ref, language)
            elif 'ner' in annotator_class and 'ner_healthcare' not in annotator_class:
                from nlu import NERDL
                if trainable:
                    self.model = NERDL.get_default_trainable_model()
                elif is_licensed:
                    self.model = NERDL.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')
                elif get_default:
                    self.model = NERDL.get_default_model()
                else:
                    self.model = NERDL.get_pretrained_model(nlp_ref, language)
                # Guard like the classifier_dl branch: not every resolved
                # model is guaranteed to expose setIncludeConfidence.
                if hasattr(self, 'model') and hasattr(
                        self.model, 'setIncludeConfidence'):
                    self.model.setIncludeConfidence(True)
            elif 'ner.crf' in annotator_class:
                from nlu import NERDLCRF
                if get_default:
                    self.model = NERDLCRF.get_default_model()
                else:
                    self.model = NERDLCRF.get_pretrained_model(
                        nlp_ref, language)
                if hasattr(self, 'model') and hasattr(
                        self.model, 'setIncludeConfidence'):
                    self.model.setIncludeConfidence(True)
            elif ('classifier_dl' in annotator_class or annotator_class
                  == 'toxic') and not 'multi' in annotator_class:
                from nlu import ClassifierDl
                if trainable:
                    self.model = ClassifierDl.get_trainable_model()
                elif is_licensed:
                    self.model = ClassifierDl.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')
                elif get_default:
                    self.model = ClassifierDl.get_default_model()
                else:
                    self.model = ClassifierDl.get_pretrained_model(
                        nlp_ref, language)
                if hasattr(self.model, 'setIncludeConfidence'):
                    self.model.setIncludeConfidence(True)
            elif 'language_detector' in annotator_class:
                from nlu import LanguageDetector
                if get_default:
                    self.model = LanguageDetector.get_default_model()
                else:
                    self.model = LanguageDetector.get_pretrained_model(
                        nlp_ref, language)
            elif 'pos' in annotator_class:
                from nlu import PartOfSpeechJsl
                if trainable:
                    self.model = PartOfSpeechJsl.get_default_trainable_model()
                elif get_default:
                    self.model = PartOfSpeechJsl.get_default_model()
                elif is_licensed:
                    self.model = PartOfSpeechJsl.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')
                else:
                    self.model = PartOfSpeechJsl.get_pretrained_model(
                        nlp_ref, language)

            elif 'yake' in annotator_class:
                # Yake has no pretrained variants; always use the default.
                from nlu import Yake
                self.model = Yake.get_default_model()
            elif 'multi_classifier' in annotator_class:
                from nlu import MultiClassifier
                if trainable:
                    self.model = MultiClassifier.get_default_trainable_model()
                elif get_default:
                    self.model = MultiClassifier.get_default_model()
                else:
                    self.model = MultiClassifier.get_pretrained_model(
                        nlp_ref, language)
            elif 'ner_healthcare' in annotator_class:
                from nlu.components.classifiers.ner_healthcare.ner_dl_healthcare import NERDLHealthcare
                if trainable:
                    self.model = NERDLHealthcare.get_default_trainable_model()
                else:
                    self.model = NERDLHealthcare.get_pretrained_model(
                        nlp_ref, language, bucket='clinical/models')

        SparkNLUComponent.__init__(self, annotator_class, component_type,
                                   nlu_ref, nlp_ref, language,
                                   loaded_from_pretrained_pipe, is_licensed)