Example #1
def parse_test(params):
    if params.test == "parser":
        print ("Running " + params.test)
        print ("==PARAMETERS==")
        print ("EMBEDDINGS: " + params.embeddings)
        print ("MODEL FILE: " + params.model_base)
        print ("DECODER: " + params.decoder)
        print ("OUTPUT: " + params.output_file)
        print ("CONFIG FILE: " + str(params.config))
        print ("==============\n")

        testset = Dataset(params.test_file)
        encodings = Encodings()
        encodings.load(params.model_base + ".encodings")
        encodings.update_wordlist(testset)
        print ("Updated word list: " + str(len(encodings.word_list)))
        config = ParserConfig(filename=params.config)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config, encodings, embeddings)
        parser.load(params.model_base + ".bestUAS")
        if params.decoder == 'mst':
            print ("!!!!!!!!!!!!!!!!!!!!!!!!!USING MST DECODER")
            from graph.decoders import MSTDecoder
            parser.decoder = MSTDecoder()
        f = open(params.output_file, "w")
        last_proc = 0
        index = 0
        for seq in testset.sequences:
            index += 1
            proc = index * 100 // len(testset.sequences)  # integer division so the 5% step check below works
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            iSeq = 0
            iRez = 0
            while iSeq < len(seq):
                while iSeq < len(seq) and seq[iSeq].is_compound_entry:
                    iSeq += 1
                if iSeq == len(seq):
                    break
                seq[iSeq].xpos = rez[iRez].xpos
                seq[iSeq].upos = rez[iRez].upos
                seq[iSeq].attrs = rez[iRez].attrs
                seq[iSeq].head = rez[iRez].head
                seq[iSeq].label = rez[iRez].label
                seq[iSeq].lemma = rez[iRez].lemma
                iSeq += 1
                iRez += 1

            for entry in seq:
                fields = [entry.index, entry.word, entry.lemma, entry.upos,
                          entry.xpos, entry.attrs, entry.head, entry.label,
                          entry.deps, entry.space_after]
                f.write("\t".join(str(field) for field in fields) + "\n")
            f.write("\n")

        f.close()
        sys.stdout.write("\n")
Example #2
def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # load resources shared by all components
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        if not os.path.isfile(
                os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write(
                "\n\tTokenizer model not found! (" +
                os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(
            os.path.join(params.models, "tokenizer.encodings"))
    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            # the compound word expander reuses the lemmatizer's encodings
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if lemmatize:
        if not os.path.isfile(os.path.join(params.models,
                                           "lemmatizer.bestACC")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if tag:
        if not os.path.isfile(os.path.join(params.models,
                                           "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))
    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(
            os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config,
                                           tokenizer_encodings,
                                           embeddings,
                                           runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with open(params.input_file, 'r') as file:
            lines = file.readlines()
        # analyze the use of spaces in the first part of the file to decide
        # whether lines should be joined with a space or directly
        test = ""
        useSpaces = " "
        cnt = 0
        while True:
            test = test + lines[cnt]
            if cnt + 1 >= len(lines) or cnt > 5:
                break
            cnt += 1
        if len(test) > 0 and float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""
        input_string = ""
        sequences = []
        for i, line in enumerate(lines):
            input_string = input_string + line.replace("\r", "").replace(
                "\n", "").strip() + useSpaces
            if line.strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file +
                         " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(
            os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config,
                                                           encodings,
                                                           embeddings,
                                                           runtime=True)
        compoundwordexpander_object.load(
            os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config,
                                    encodings,
                                    embeddings,
                                    runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        tagger_object_ATTRS.load(
            os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[
                    entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[
                    entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[
                    entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = new_sequences  # new_sequence entries are already deep copies
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(
            os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        lemmatizer_object.load(
            os.path.join(params.models, "lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
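
parse_run's tokenizer branch decides how to join input lines with a simple heuristic: sample the first few lines and, if fewer than roughly 2% of the sampled characters are spaces, assume a script written without spaces and join lines directly. A standalone sketch of that heuristic (the function name and defaults are ours):

def guess_line_joiner(lines, sample_size=6, space_ratio=0.02):
    # Mirrors the space-usage check in parse_run: join with " " for
    # space-delimited scripts, with "" otherwise (e.g. Chinese, Japanese).
    sample = "".join(lines[:sample_size])
    if len(sample) > 0 and float(sample.count(" ")) / len(sample) < space_ratio:
        return ""
    return " "

print(repr(guess_line_joiner(["我爱自然语言处理\n"])))  # ''
print(repr(guess_line_joiner(["I love natural language processing\n"])))  # ' '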
Example #3
def parse_train(params):
    if params.train == 'mt':
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "SRC TRAIN FILE: " + params.mt_train_src
        print "SRC DEV FILE: " + params.mt_dev_src
        print "SRC TEST FILE: " + str(params.mt_test_src)
        print "SRC EMBEDDINGS FILE: " + params.mt_source_embeddings
        print "DST TRAIN FILE: " + params.mt_train_dst
        print "DST DEV FILE: " + params.mt_dev_dst
        print "DST TEST FILE: " + str(params.mt_test_dst)
        print "DST EMBEDDINGS FILE: " + params.mt_destination_embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"
        trainset = MTDataset(params.mt_train_src, params.mt_train_dst)
        devset = MTDataset(params.mt_dev_src, params.mt_dev_dst)
        if params.mt_test_src and params.mt_test_dst:
            testset = MTDataset(params.mt_test_src, params.mt_test_dst)
        else:
            testset = None

        config = NMTConfig(params.config)
        sys.stdout.write("--SOURCE--\n")
        sys.stdout.flush()
        src_enc = Encodings()
        src_enc.compute(trainset.to_conll_dataset('src'),
                        devset.to_conll_dataset('src'),
                        word_cutoff=5)
        sys.stdout.write("--DESTINATION--\n")
        sys.stdout.flush()
        dst_enc = Encodings()
        dst_enc.compute(trainset.to_conll_dataset('dst'),
                        devset.to_conll_dataset('dst'),
                        word_cutoff=5)
        sys.stdout.write("Reading source embeddings\n")
        src_we = WordEmbeddings()
        src_we.read_from_file(params.mt_source_embeddings,
                              'label',
                              full_load=False)
        sys.stdout.write("Reading destination embeddings\n")
        dst_we = WordEmbeddings()
        dst_we.read_from_file(params.mt_destination_embeddings,
                              'label',
                              full_load=False)
        nmt = BRNNMT(src_we, dst_we, src_enc, dst_enc, config)
        trainer = MTTrainer(nmt,
                            src_enc,
                            dst_enc,
                            src_we,
                            dst_we,
                            params.itters,
                            trainset,
                            devset,
                            testset=testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    if params.train == "tagger":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = TaggerConfig(params.config)
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        tagger = BDRNNTagger(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = TaggerTrainer(tagger, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "parser":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = ParserConfig(params.config)
        if not config._valid:
            return
        # PARAM INJECTION
        if params.params is not None:
            parts = params.params.split(":")
            for param in parts:
                variable = param.split("=")[0]
                value = param[len(variable) + 1:]
                print("External param injection: " + variable + "=" + value)
                exec("config.__dict__[\"" + variable + "\"] = " + value)
                # END INJECTION
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = ParserTrainer(parser, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base, params.batch_size)

    elif params.train == "lemmatizer":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = LemmatizerConfig(params.config)
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)

        embeddings = None
        lemmatizer = FSTLemmatizer(config, encodings, embeddings)
        trainer = LemmatizerTrainer(lemmatizer, encodings, params.itters,
                                    trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "compound":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = CompoundWordConfig(params.config)
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)

        embeddings = None
        expander = CompoundWordExpander(config, encodings, embeddings)
        trainer = CompoundWordTrainer(expander, encodings, params.itters,
                                      trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "tokenizer":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "RAW TRAIN FILE: " + (params.raw_train_file if params.
                                    raw_train_file is not None else "n/a")
        print "DEV FILE: " + params.dev_file
        print "RAW DEV FILE: " + (params.raw_dev_file if params.raw_dev_file
                                  is not None else "n/a")
        print "TEST FILE: " + (params.test_file
                               if params.test_file is not None else "n/a")
        print "RAW TEST FILE: " + (params.raw_test_file if params.raw_test_file
                                   is not None else "n/a")
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"
        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(params.config)
        config.raw_test_file = params.raw_test_file
        config.base = params.output_base
        config.patience = params.itters
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(
            params.embeddings, None
        )  # setting wordlist to None triggers Word Embeddings to act as cache-only and load offsets for all words
        tokenizer = TieredTokenizer(config, encodings, embeddings)
        trainer = TokenizerTrainer(tokenizer,
                                   encodings,
                                   params.itters,
                                   trainset,
                                   devset,
                                   testset,
                                   raw_train_file=params.raw_train_file,
                                   raw_dev_file=params.raw_dev_file,
                                   raw_test_file=params.raw_test_file,
                                   gold_train_file=params.train_file,
                                   gold_dev_file=params.dev_file,
                                   gold_test_file=params.test_file)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)
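
The parser branch's PARAM INJECTION block accepts overrides as a colon-separated list of name=value pairs and evaluates each value as a Python expression before writing it into the config. A minimal sketch of the same mechanism (DummyConfig, inject_params, and the option names in the usage line are ours); it substitutes ast.literal_eval for the exec call above, which restricts values to Python literals but avoids executing arbitrary code:

import ast

class DummyConfig:
    pass

def inject_params(config, spec):
    # Apply "name=value:name=value" overrides to a config object.
    for param in spec.split(":"):
        name, _, raw_value = param.partition("=")
        print("External param injection: " + name + "=" + raw_value)
        setattr(config, name, ast.literal_eval(raw_value))

config = DummyConfig()
inject_params(config, "rnn_layers=2:dropout=0.33")
print(config.rnn_layers, config.dropout)  # 2 0.33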
Example #4
            print "--dev-file is mandatory"
            valid = False
        if not params.embeddings:
            print "--embeddings is mandatory"
            valid = False
        if not params.output_base:
            print "--store is mandatory"
            valid = False
    if valid:
        parse_train(params)

if params.server:
    from server.webserver import EmbeddedWebserver

    we = WordEmbeddings()
    we.read_from_file(params.embeddings, None, False)
    ews = EmbeddedWebserver(we,
                            port=params.port,
                            lemma=params.model_lemmatization,
                            tokenization=params.model_tokenization,
                            tagging=params.model_tagging,
                            parsing=params.model_parsing)

if params.test:
    valid = True
    if valid:
        parse_test(params)

if params.run:
    valid = True
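
Example #4 is a fragment of the command-line entry point: each mandatory option is checked in turn, failures are reported, and an aggregate valid flag gates the call into parse_train, parse_test, or parse_run. The same pattern as a self-contained helper (params is assumed to be an argparse-style namespace; the option names come from the messages above):

def validate_train_params(params):
    # Report every missing mandatory option, then return a single verdict.
    valid = True
    if not params.dev_file:
        print("--dev-file is mandatory")
        valid = False
    if not params.embeddings:
        print("--embeddings is mandatory")
        valid = False
    if not params.output_base:
        print("--store is mandatory")
        valid = False
    return valid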
Example #5
    def load(self,
             language_code,
             version="latest",
             tokenization=True,
             compound_word_expanding=False,
             tagging=True,
             lemmatization=True,
             parsing=True):
        """
        Loads the pipeline with all available models for the target language.

        @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param version: "latest" to get the latest version, or a specific version such as "1.0" or "2.1"
        """
        # Initialize a ModelStore object
        model_store_object = ModelStore(disk_path=self._model_repository)

        # Find a local model or download it if it does not exist, returning the local model folder path
        model_folder_path = model_store_object.find(lang_code=language_code,
                                                    version=version,
                                                    verbose=self._verbose)

        # Load metadata from the model
        self.metadata.read(os.path.join(model_folder_path, "metadata.json"))

        # Load embeddings
        embeddings = WordEmbeddings(verbose=False)
        if self._verbose:
            sys.stdout.write('\tLoading embeddings... \n')
        embeddings.read_from_file(os.path.join(
            self._embeddings_repository, self.metadata.embeddings_file_name),
                                  None,
                                  full_load=False)

        # 1. Load tokenizer
        if tokenization:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')):
                sys.stdout.write(
                    '\tTokenization is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tokenization model ...\n')
                tokenizer_encodings = Encodings(verbose=False)
                tokenizer_encodings.load(
                    os.path.join(model_folder_path, 'tokenizer.encodings'))
                config = TieredTokenizerConfig(
                    os.path.join(model_folder_path, 'tokenizer.conf'))
                self._tokenizer = TieredTokenizer(config,
                                                  tokenizer_encodings,
                                                  embeddings,
                                                  runtime=True)
                self._tokenizer.load(
                    os.path.join(model_folder_path, 'tokenizer'))

        # 2. Load compound word expander
        if compound_word_expanding:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'compound.bestAcc')):
                if self._verbose:  # suppress this warning because many languages do not have compound words
                    sys.stdout.write(
                        '\tCompound word expansion is not available on this model. \n'
                    )
            else:
                if self._verbose:
                    sys.stdout.write(
                        '\tLoading compound word expander model ...\n')
                compound_encodings = Encodings(verbose=False)
                compound_encodings.load(
                    os.path.join(model_folder_path, 'compound.encodings'))
                config = CompoundWordConfig(
                    os.path.join(model_folder_path, 'compound.conf'))
                self._compound_word_expander = CompoundWordExpander(
                    config, compound_encodings, embeddings, runtime=True)
                self._compound_word_expander.load(
                    os.path.join(model_folder_path, 'compound.bestAcc'))

        # 3. Load lemmatizer
        if lemmatization:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'lemmatizer.bestACC')):
                sys.stdout.write(
                    '\tLemmatizer is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading lemmatization model ...\n')
                lemmatizer_encodings = Encodings(verbose=False)
                lemmatizer_encodings.load(
                    os.path.join(model_folder_path, 'lemmatizer.encodings'))
                config = LemmatizerConfig(
                    os.path.join(model_folder_path, 'lemmatizer.conf'))
                self._lemmatizer = FSTLemmatizer(config,
                                                 lemmatizer_encodings,
                                                 embeddings,
                                                 runtime=True)
                self._lemmatizer.load(
                    os.path.join(model_folder_path, 'lemmatizer.bestACC'))

        # 4. Load taggers
        if tagging or lemmatization:  # we need tagging for lemmatization
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'tagger.bestUPOS')):
                sys.stdout.write(
                    '\tTagging is not available on this model. \n')
                if lemmatization:
                    sys.stdout.write(
                        '\t\tDisabling the lemmatization model due to missing tagger. \n'
                    )
                    self._lemmatizer = None
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tagger model ...\n')
                tagger_encodings = Encodings(verbose=False)
                tagger_encodings.load(
                    os.path.join(model_folder_path, 'tagger.encodings'))
                config = TaggerConfig(
                    os.path.join(model_folder_path, 'tagger.conf'))
                self._tagger = [None, None, None]
                self._tagger[0] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[0].load(
                    os.path.join(model_folder_path, 'tagger.bestUPOS'))
                self._tagger[1] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[1].load(
                    os.path.join(model_folder_path, 'tagger.bestXPOS'))
                self._tagger[2] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[2].load(
                    os.path.join(model_folder_path, 'tagger.bestATTRS'))

        # 5. Load parser
        if parsing:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'parser.bestUAS')):
                sys.stdout.write(
                    '\tParsing is not available on this model... \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading parser model ...\n')
                parser_encodings = Encodings(verbose=False)
                parser_encodings.load(
                    os.path.join(model_folder_path, 'parser.encodings'))
                config = ParserConfig(
                    os.path.join(model_folder_path, 'parser.conf'))
                self._parser = BDRNNParser(config,
                                           parser_encodings,
                                           embeddings,
                                           runtime=True)
                self._parser.load(
                    os.path.join(model_folder_path, 'parser.bestUAS'))

        self._loaded = True
        if self._verbose:
            sys.stdout.write('Model loading complete.\n\n')
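
Assuming the load method above belongs to a Cube-like pipeline object (its constructor is not shown in this excerpt), a typical call picks a language code and switches individual components on or off:

# Hedged usage sketch; the constructor and model repository layout
# are assumptions, not confirmed by the excerpt above.
cube = Cube()
cube.load("en",
          version="latest",
          tokenization=True,
          compound_word_expanding=False,  # off by default; many languages have none
          tagging=True,
          lemmatization=True,
          parsing=True)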
Example #6
    def load(self, lang_code, base_path=None):
        """
        Loads the pipeline with all available models for the target language
        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param base_path: Base path for models. Only required for custom-trained models. Otherwise, just leave this parameter untouched to use the default model location
        @return: True if loading was successful, False otherwise
        """
        sys.stdout.write('Loading models for ' + lang_code + "\n")
        if base_path is None:
            global BASE_PATH
            base_path = BASE_PATH

        self.embeddings = WordEmbeddings()
        self.embeddings.read_from_file(os.path.join(base_path, lang_code + "/wiki." + lang_code + ".vec"), None,
                                       full_load=False)
        if not os.path.isfile(os.path.join(base_path, lang_code + "/tokenizer-tok.bestAcc")):
            sys.stdout.write(
                "\tTokenization disabled. \n")
        else:
            self.tokenizer_enabled = True
            sys.stdout.write("\tTokenization enabled.\n")
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(base_path, lang_code + "/tokenizer.encodings"))
            from io_utils.config import TieredTokenizerConfig
            from generic_networks.tokenizers import TieredTokenizer
            config = TieredTokenizerConfig(os.path.join(base_path, lang_code + "/tokenizer.conf"))
            tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True)
            tokenizer_object.load(os.path.join(base_path, lang_code + "/tokenizer"))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/compound.bestAcc")):
            sys.stdout.write(
                "\tCompound disabled. \n")
        else:
            self.compound_enabled = True
            sys.stdout.write("\tCompound enabled.\n")
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(base_path, lang_code + "/compound.encodings"))
            from io_utils.config import CompoundWordConfig
            from generic_networks.token_expanders import CompoundWordExpander
            config = CompoundWordConfig(os.path.join(base_path, lang_code + "/compound.conf"))
            compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True)
            compound_object.load(os.path.join(base_path, lang_code + "/compound.bestAcc"))
            self.models[PipelineComponents.COMPOUND] = compound_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/lemmatizer.bestACC")):
            sys.stdout.write(
                "\tLemmatizer disabled. \n")
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write("\tLemmatizer enabled.\n")
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(base_path, lang_code + "/lemmatizer.encodings"))
            from io_utils.config import LemmatizerConfig
            from generic_networks.lemmatizers import FSTLemmatizer
            config = LemmatizerConfig(os.path.join(base_path, lang_code + "/lemmatizer.conf"))
            lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True)
            lemmatizer_object.load(os.path.join(base_path, lang_code + "/lemmatizer.bestACC"))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/tagger.bestUPOS")):
            sys.stdout.write(
                "\tTagger disabled. \n")
        else:
            self.tagger_enabled = True
            sys.stdout.write("\tTagger enabled.\n")
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(base_path, lang_code + "/tagger.encodings"))
            from io_utils.config import TaggerConfig
            from generic_networks.taggers import BDRNNTagger
            config = TaggerConfig(os.path.join(base_path, lang_code + "/tagger.conf"))

            tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_upos_object.load(os.path.join(base_path, lang_code + "/tagger.bestUPOS"))
            tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_xpos_object.load(os.path.join(base_path, lang_code + "/tagger.bestXPOS"))
            tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_attrs_object.load(os.path.join(base_path, lang_code + "/tagger.bestATTRS"))

            self.models[PipelineComponents.TAGGER] = [tagger_upos_object, tagger_xpos_object, tagger_attrs_object]

        if not os.path.isfile(os.path.join(base_path, lang_code + "/parser.bestUAS")):
            sys.stdout.write(
                "\tParser disabled. \n")
        else:
            self.parser_enabled = True
            sys.stdout.write("\tParser enabled.\n")
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(base_path, lang_code + "/parser.encodings"))
            from io_utils.config import ParserConfig
            from generic_networks.parsers import BDRNNParser
            config = ParserConfig(os.path.join(base_path, lang_code + "/parser.conf"))
            parser_object = BDRNNParser(config, parser_encodings, self.embeddings, runtime=True)
            parser_object.load(os.path.join(base_path, lang_code + "/parser.bestUAS"))
            self.models[PipelineComponents.PARSER] = parser_object
Example #7
class Cube:
    def __init__(self):
        """
        Create an empty Cube instance
        Before it can be used, you must call @method load with @param lang_code set to your target language
        """
        self.loaded = False
        self.tokenizer_enabled = False
        self.compound_enabled = False
        self.lemmatizer_enabled = False
        self.tagger_enabled = False
        self.parser_enabled = False
        self.models = {}
        self.embeddings = None

    def download_models(self, lang_code):
        """
        Downloads pre-trained models for the desired language. All existing models will be overwritten
        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @return: True if the download was successful, False otherwise
        """
        sys.stdout.write('TODO: Downloading models for ' + lang_code + "\n")

    def load(self, lang_code, base_path=None):
        """
        Loads the pipeline with all available models for the target language
        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param base_path: Base path for models. Only required for custom-trained models. Otherwise, just leave this parameter untouched to use the default model location
        @return: True if loading was successful, False otherwise
        """
        sys.stdout.write('Loading models for ' + lang_code + "\n")
        if base_path is None:
            global BASE_PATH
            base_path = BASE_PATH

        self.embeddings = WordEmbeddings()
        self.embeddings.read_from_file(os.path.join(base_path, lang_code + "/wiki." + lang_code + ".vec"), None,
                                       full_load=False)
        if not os.path.isfile(os.path.join(base_path, lang_code + "/tokenizer-tok.bestAcc")):
            sys.stdout.write(
                "\tTokenization disabled. \n")
        else:
            self.tokenizer_enabled = True
            sys.stdout.write("\tTokenization enabled.\n")
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(base_path, lang_code + "/tokenizer.encodings"))
            from io_utils.config import TieredTokenizerConfig
            from generic_networks.tokenizers import TieredTokenizer
            config = TieredTokenizerConfig(os.path.join(base_path, lang_code + "/tokenizer.conf"))
            tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True)
            tokenizer_object.load(os.path.join(base_path, lang_code + "/tokenizer"))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/compound.bestAcc")):
            sys.stdout.write(
                "\tCompound disabled. \n")
        else:
            self.compound_enabled = True
            sys.stdout.write("\tCompound enabled.\n")
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(base_path, lang_code + "/compound.encodings"))
            from io_utils.config import CompoundWordConfig
            from generic_networks.token_expanders import CompoundWordExpander
            config = CompoundWordConfig(os.path.join(base_path, lang_code + "/compound.conf"))
            compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True)
            compound_object.load(os.path.join(base_path, lang_code + "/compound.bestAcc"))
            self.models[PipelineComponents.COMPOUND] = compound_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/lemmatizer.bestACC")):
            sys.stdout.write(
                "\tLemmatizer disabled. \n")
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write("\tLemmatizer enabled.\n")
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(base_path, lang_code + "/lemmatizer.encodings"))
            from io_utils.config import LemmatizerConfig
            from generic_networks.lemmatizers import FSTLemmatizer
            config = LemmatizerConfig(os.path.join(base_path, lang_code + "/lemmatizer.conf"))
            lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True)
            lemmatizer_object.load(os.path.join(base_path, lang_code + "/lemmatizer.bestACC"))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/tagger.bestUPOS")):
            sys.stdout.write(
                "\tTagger disabled. \n")
        else:
            self.tagger_enabled = True
            sys.stdout.write("\tTagger enabled.\n")
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(base_path, lang_code + "/tagger.encodings"))
            from io_utils.config import TaggerConfig
            from generic_networks.taggers import BDRNNTagger
            config = TaggerConfig(os.path.join(base_path, lang_code + "/tagger.conf"))

            tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_upos_object.load(os.path.join(base_path, lang_code + "/tagger.bestUPOS"))
            tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_xpos_object.load(os.path.join(base_path, lang_code + "/tagger.bestXPOS"))
            tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_attrs_object.load(os.path.join(base_path, lang_code + "/tagger.bestATTRS"))

            self.models[PipelineComponents.TAGGER] = [tagger_upos_object, tagger_xpos_object, tagger_attrs_object]

        if not os.path.isfile(os.path.join(base_path, lang_code + "/parser.bestUAS")):
            sys.stdout.write(
                "\tParser disabled. \n")
        else:
            self.parser_enabled = True
            sys.stdout.write("\tParser enabled.\n")
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(base_path, lang_code + "/parser.encodings"))
            from io_utils.config import ParserConfig
            from generic_networks.parsers import BDRNNParser
            config = ParserConfig(os.path.join(base_path, lang_code + "/parser.conf"))
            parser_object = BDRNNParser(config, parser_encodings, self.embeddings, runtime=True)
            parser_object.load(os.path.join(base_path, lang_code + "/parser.bestUAS"))
            self.models[PipelineComponents.PARSER] = parser_object

    def process_text(self, text="", pipeline=None):
        """
        Runs the pipeline on the input text. If pipeline is set to None, Cube will run all available processing models
        @param text: the text to be processed, either raw text or a list of sentences, each composed of a list of CONLLEntry items
        @param pipeline: a list of PipelineComponents to be used for processing
        @return: a list of sentences, each composed of a list of CONLLEntry items
        """
        if pipeline is None:
            pipeline = [PipelineComponents.TOKENIZER, PipelineComponents.PARSER, PipelineComponents.TAGGER,
                        PipelineComponents.LEMMATIZER, PipelineComponents.COMPOUND]

        if PipelineComponents.TOKENIZER in pipeline and self.tokenizer_enabled:
            sys.stdout.write("\nTokenizing... \n\t")
            sys.stdout.flush()

            lines = text.replace("\r", "").split("\n")
            # analyze use of spaces in first part of the file
            test = "";
            useSpaces = " "
            cnt = 0
            while True:
                test = test + lines[cnt]
                if cnt + 1 >= len(lines) or cnt > 5:
                    break
                cnt += 1

            if len(test) > 0 and float(test.count(' ')) / float(len(test)) < 0.02:
                useSpaces = ""
            input_string = ""
            for i in range(len(lines)):
                input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces

            sequences = self.models[PipelineComponents.TOKENIZER].tokenize(input_string)

            sys.stdout.write("\n")
        else:
            sequences = text

        if PipelineComponents.COMPOUND in pipeline and self.compound_enabled:
            sequences = self.models[PipelineComponents.COMPOUND].expand_sequences(sequences)

        if PipelineComponents.PARSER in pipeline and self.parser_enabled:
            sequences = self.models[PipelineComponents.PARSER].parse_sequences(sequences)

        if PipelineComponents.TAGGER in pipeline and self.tagger_enabled:
            new_sequences = []
            for sequence in sequences:
                new_sequence = copy.deepcopy(sequence)
                predicted_tags_UPOS = self.models[PipelineComponents.TAGGER][0].tag(new_sequence)
                predicted_tags_XPOS = self.models[PipelineComponents.TAGGER][1].tag(new_sequence)
                predicted_tags_ATTRS = self.models[PipelineComponents.TAGGER][2].tag(new_sequence)
                for entryIndex in range(len(sequence)):
                    new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                    new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                    new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
                new_sequences.append(new_sequence)

            sequences = new_sequences

        if PipelineComponents.LEMMATIZER in pipeline and self.lemmatizer_enabled:
            sequences = self.models[PipelineComponents.LEMMATIZER].lemmatize_sequences(sequences)

        return sequences
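
A hedged end-to-end sketch for the Cube class above: load every model available for a language, run the default pipeline over raw text, and read the resulting CONLLEntry fields (this assumes models are already present under BASE_PATH):

cube = Cube()
cube.load("en")
sentences = cube.process_text("Cats drink milk. Dogs chase cats.")
for sentence in sentences:
    for entry in sentence:
        print(entry.index, entry.word, entry.upos, entry.head, entry.label)
    print()
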
class ModelStore(object):
    """
    Abstraction layer for working with models.

    Usage example:
        model_store = ModelStore()

        # Load models for lang_code ro.
        model_store.load('ro')

        # Get latest versions.
        model_store.get_latest_versions()
    """

    MODELS_PATH_LOCAL = 'models'
    MODELS_PATH_CLOUD = 'https://nlpcube.blob.core.windows.net/models'
    MODELS_PATH_CLOUD_ALL = os.path.join(MODELS_PATH_CLOUD,
                                         '?restype=container&comp=list')

    EMBEDDINGS_NAME = 'wiki.{}.vec'
    FACEBOOK_EMBEDDINGS_URL = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'
    FACEBOOK_EMBEDDINGS_LOCATION = 'corpus/'

    def __init__(self, disk_path=None, cloud_path=None):
        self.disk_path = disk_path or self.MODELS_PATH_LOCAL
        self.cloud_path = cloud_path or self.MODELS_PATH_CLOUD
        self.models = {}

    def load(self, lang_code, check_for_latest=True):
        """
        Contains logic for loading or downloading and loading models for the target language.

        Args:
            lang_code: Target language code.
                See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            check_for_latest: Whether or not to get the latest version.
        """
        version_to_download = self.version_to_download(lang_code,
                                                       check_for_latest)
        if version_to_download:
            self._download_models_version(lang_code, version_to_download)

        # Now we surely have the language models downloaded
        self._load(lang_code)

    def _load(self, lang_code):
        """
        Loads all locally available component models into the instance.
        """
        sys.stdout.write('Loading models for {}\n'.format(lang_code))
        path_for_language = os.path.join(self.disk_path, lang_code)

        # 1. Load word embeddings.
        self.embeddings = WordEmbeddings()
        word_embeddings_for_language = self.EMBEDDINGS_NAME.format(lang_code)
        self.embeddings.read_from_file(os.path.join(
            path_for_language, word_embeddings_for_language),
                                       None,
                                       full_load=False)

        # 2. Load tokenizer.
        if not os.path.isfile(
                os.path.join(path_for_language, 'tokenizer-tok.bestAcc')):
            sys.stdout.write('\tTokenization disabled. \n')
        else:
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(
                os.path.join(path_for_language, 'tokenizer.encodings'))
            config = TieredTokenizerConfig(
                os.path.join(path_for_language, 'tokenizer.conf'))
            tokenizer_object = TieredTokenizer(config,
                                               tokenizer_encodings,
                                               self.embeddings,
                                               runtime=True)
            tokenizer_object.load(os.path.join(path_for_language, 'tokenizer'))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        # 3. Load compound.
        if not os.path.isfile(
                os.path.join(path_for_language, 'compound.bestAcc')):
            sys.stdout.write('\tCompound disabled. \n')
        else:
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(
                os.path.join(path_for_language, 'compound.encodings'))
            config = CompoundWordConfig(
                os.path.join(path_for_language, 'compound.conf'))
            compound_object = CompoundWordExpander(config,
                                                   compound_encodings,
                                                   self.embeddings,
                                                   runtime=True)
            compound_object.load(
                os.path.join(path_for_language, 'compound.bestAcc'))
            self.models[PipelineComponents.COMPOUND] = compound_object

        # 4. Load lemmatizer.
        if not os.path.isfile(
                os.path.join(path_for_language, 'lemmatizer.bestACC')):
            sys.stdout.write('\tLemmatizer disabled. \n')
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(
                os.path.join(path_for_language, 'lemmatizer.encodings'))
            config = LemmatizerConfig(
                os.path.join(path_for_language, 'lemmatizer.conf'))
            lemmatizer_object = FSTLemmatizer(config,
                                              lemmatizer_encodings,
                                              self.embeddings,
                                              runtime=True)
            lemmatizer_object.load(
                os.path.join(path_for_language, 'lemmatizer.bestACC'))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        # 5. Load taggers.
        if not os.path.isfile(
                os.path.join(path_for_language, 'tagger.bestUPOS')):
            sys.stdout.write('\tTagger disabled. \n')
        else:
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(
                os.path.join(path_for_language, 'tagger.encodings'))
            config = TaggerConfig(
                os.path.join(path_for_language, 'tagger.conf'))

            tagger_upos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_upos_object.load(
                os.path.join(path_for_language, 'tagger.bestUPOS'))
            tagger_xpos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_xpos_object.load(
                os.path.join(path_for_language, 'tagger.bestXPOS'))
            tagger_attrs_object = BDRNNTagger(config,
                                              tagger_encodings,
                                              self.embeddings,
                                              runtime=True)
            tagger_attrs_object.load(
                os.path.join(path_for_language, 'tagger.bestATTRS'))

            self.models[PipelineComponents.TAGGER] = [
                tagger_upos_object, tagger_xpos_object, tagger_attrs_object
            ]

        # 6. Load parser.
        if not os.path.isfile(os.path.join(path_for_language,
                                           'parser.bestUAS')):
            sys.stdout.write('\tParser disabled. \n')
        else:
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(
                os.path.join(path_for_language, 'parser.encodings'))
            config = ParserConfig(
                os.path.join(path_for_language, 'parser.conf'))
            parser_object = BDRNNParser(config,
                                        parser_encodings,
                                        self.embeddings,
                                        runtime=True)
            parser_object.load(
                os.path.join(path_for_language, 'parser.bestUAS'))
            self.models[PipelineComponents.PARSER] = parser_object

    def get_latest_model_versions(self):
        """
        Returns a dictionary with (lang_code, latest_version) for each language code.
        """
        request = requests.get(self.MODELS_PATH_CLOUD_ALL)
        data = xmltodict.parse(request.content)

        # Make a list with all the archives in the container.
        item_names = [
            item['Name']
            for item in data['EnumerationResults']['Blobs']['Blob']
            if item['Name'].endswith('.zip')
        ]

        # Compute latest_versions.
        latest_versions = defaultdict(str)
        for item in item_names:
            language, version = item.replace('.zip', '').split('-')
            latest_versions[language] = max(latest_versions[language], version)

        return latest_versions
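
    # Hedged illustration of the container listing parsed above: the blob
    # store returns XML shaped like
    #   <EnumerationResults><Blobs><Blob><Name>en-1.0.zip</Name>...</Blob>...
    # (implied by the keys used in this method). Note that max() compares the
    # version strings lexicographically, so e.g. '1.9' would sort above '1.10'.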

    def _download_models_version(self, lang_code, version):
        """
        Downloads pre-trained models for the provided language.

        Args:
            lang_code: Target language code.
                See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            version: Version of the models.
        """
        sys.stdout.write('Downloading models for {} \n'.format(lang_code))

        model_name = '{}-{}'.format(lang_code, version)
        model_path_cloud = '{}/{}.zip'.format(self.cloud_path, model_name)
        model_path_local = os.path.join(self.disk_path,
                                        '{}.zip'.format(model_name))

        # Download and extract models for provided language.
        self._download_and_extract_lang_models(model_path_cloud,
                                               model_path_local)

        # Download Facebook embeddings.
        self._download_facebook_embeddings(lang_code)

    def _download_and_extract_lang_models(self, url, file_name, force=False):
        # Skip when the archive is already on disk, unless force re-downloads.
        if os.path.exists(file_name):
            if not force:
                return
            os.remove(file_name)

        # Download and extract zip archive.
        request = requests.get(url)
        zip_archive = ZipFile(io.BytesIO(request.content))
        zip_archive.extractall(self.disk_path)
        zip_archive.close()

    def _download_facebook_embeddings(self, lang_code):
        """
        Download Facebook embeddings for the provided lang_code.
        """
        name = self.EMBEDDINGS_NAME.format(lang_code)
        embeddings_url = self.FACEBOOK_EMBEDDINGS_URL + name
        embeddings_path = os.path.join(self.disk_path, lang_code, name)

        request = requests.get(embeddings_url)
        with fopen(embeddings_path, 'wb') as fd:
            fd.write(request.content)

    def version_to_download(self, lang_code, check_for_latest=True):
        """
        Returns the version of the language models that needs to be downloaded,
        or None if there's nothing to be done.
        """
        lang_models = os.path.join(self.disk_path, lang_code)
        lang_models_version = os.path.join(lang_models, 'VERSION')

        # Get current version (if any).
        current_version = None
        if os.path.isfile(lang_models_version):
            with fopen(lang_models_version) as fd:
                current_version = fd.read().strip('\n')

        # Get the latest version.
        latest_versions = self.get_latest_model_versions()
        latest_version = latest_versions.get(lang_code)

        if check_for_latest:
            if not latest_version:
                if not current_version:
                    raise ValueError(
                        'No remote version found for {}!'.format(lang_code))

                print('No remote version found for {}, using the local '
                      'version {}'.format(lang_code, current_version))
                return

            if current_version and current_version >= latest_version:
                return

            return latest_version

        if not current_version:
            return latest_version
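
A minimal usage sketch for the ModelStore above (hedged: it assumes network
access to the cloud container and that models for 'ro' exist remotely; the
attribute and method names are the ones defined in this example):

    store = ModelStore()
    store.load('ro')  # downloads the models if needed, then loads them
    print(store.get_latest_model_versions())  # e.g. {'ro': '1.0', ...}
    parser = store.models[PipelineComponents.PARSER]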
Example #10
    def _load(self, lang_code, version):
        """
        Loads all locally available component models into the instance.
        """
        # Refresh metadata
        self.metadata.read(
            os.path.join(self.disk_path, lang_code + "-" + str(version),
                         "metadata.json"))
        model_folder = os.path.join(self.disk_path,
                                    lang_code + "-" + str(version))
        embeddings_folder = os.path.join(self.disk_path, "embeddings")
        embeddings_file_path = os.path.join(embeddings_folder,
                                            self.metadata.embeddings_file_name)

        # 1. Load word embeddings
        self.embeddings = WordEmbeddings(verbose=False)
        sys.stdout.write('\tLoading embeddings... \n')
        self.embeddings.read_from_file(embeddings_file_path,
                                       None,
                                       full_load=False)

        # 2. Load tokenizer
        if not os.path.isfile(
                os.path.join(model_folder, 'tokenizer-tok.bestAcc')):
            sys.stdout.write('\tTokenization disabled. \n')
        else:
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(
                os.path.join(model_folder, 'tokenizer.encodings'))
            config = TieredTokenizerConfig(
                os.path.join(model_folder, 'tokenizer.conf'))
            tokenizer_object = TieredTokenizer(config,
                                               tokenizer_encodings,
                                               self.embeddings,
                                               runtime=True)
            tokenizer_object.load(os.path.join(model_folder, 'tokenizer'))
            self.model[PipelineComponents.TOKENIZER] = tokenizer_object

        # 3. Load compound
        if not os.path.isfile(os.path.join(model_folder, 'compound.bestAcc')):
            sys.stdout.write('\tCompound disabled. \n')
        else:
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(
                os.path.join(model_folder, 'compound.encodings'))
            config = CompoundWordConfig(
                os.path.join(model_folder, 'compound.conf'))
            compound_object = CompoundWordExpander(config,
                                                   compound_encodings,
                                                   self.embeddings,
                                                   runtime=True)
            compound_object.load(os.path.join(model_folder,
                                              'compound.bestAcc'))
            self.model[PipelineComponents.COMPOUND] = compound_object

        # 4. Load lemmatizer
        if not os.path.isfile(os.path.join(model_folder,
                                           'lemmatizer.bestACC')):
            sys.stdout.write('\tLemmatizer disabled. \n')
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(
                os.path.join(model_folder, 'lemmatizer.encodings'))
            config = LemmatizerConfig(
                os.path.join(model_folder, 'lemmatizer.conf'))
            lemmatizer_object = FSTLemmatizer(config,
                                              lemmatizer_encodings,
                                              self.embeddings,
                                              runtime=True)
            lemmatizer_object.load(
                os.path.join(model_folder, 'lemmatizer.bestACC'))
            self.model[PipelineComponents.LEMMATIZER] = lemmatizer_object

        # 5. Load taggers
        if not os.path.isfile(os.path.join(model_folder, 'tagger.bestUPOS')):
            sys.stdout.write('\tTagger disabled. \n')
        else:
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(
                os.path.join(model_folder, 'tagger.encodings'))
            config = TaggerConfig(os.path.join(model_folder, 'tagger.conf'))

            tagger_upos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_upos_object.load(
                os.path.join(model_folder, 'tagger.bestUPOS'))
            tagger_xpos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_xpos_object.load(
                os.path.join(model_folder, 'tagger.bestXPOS'))
            tagger_attrs_object = BDRNNTagger(config,
                                              tagger_encodings,
                                              self.embeddings,
                                              runtime=True)
            tagger_attrs_object.load(
                os.path.join(model_folder, 'tagger.bestATTRS'))

            self.model[PipelineComponents.TAGGER] = [
                tagger_upos_object, tagger_xpos_object, tagger_attrs_object
            ]

        # 6. Load parser
        if not os.path.isfile(os.path.join(model_folder, 'parser.bestUAS')):
            sys.stdout.write('\tParser disabled. \n')
        else:
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(
                os.path.join(model_folder, 'parser.encodings'))
            config = ParserConfig(os.path.join(model_folder, 'parser.conf'))
            parser_object = BDRNNParser(config,
                                        parser_encodings,
                                        self.embeddings,
                                        runtime=True)
            parser_object.load(os.path.join(model_folder, 'parser.bestUAS'))
            self.model[PipelineComponents.PARSER] = parser_object
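
A minimal driver sketch for the `_load` above (hedged: it assumes a store
object like the ModelStore variants in the neighboring examples, exposing
`disk_path`, `metadata`, and `model`, plus an already-downloaded en-1.1
model folder containing metadata.json):

    store = ModelStore()
    store._load('en', 1.1)  # reads models/en-1.1/metadata.json first
    if PipelineComponents.TAGGER in store.model:
        upos_tagger, xpos_tagger, attrs_tagger = store.model[
            PipelineComponents.TAGGER]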
Example #11
class ModelStore(object):
    """
    Abstraction layer for working with models.
    """

    MODELS_PATH_LOCAL = 'models'
    MODELS_PATH_CLOUD = 'https://nlpcube.blob.core.windows.net/models'
    MODELS_PATH_CLOUD_ALL = MODELS_PATH_CLOUD + '/?restype=container&comp=list'

    def __init__(self, disk_path=None, cloud_path=None):
        self.disk_path = disk_path or self.MODELS_PATH_LOCAL
        self.cloud_path = cloud_path or self.MODELS_PATH_CLOUD
        self.model = {}
        self.metadata = ModelMetadata()

    def _list_folders(self, lang_code=None):
        output = [
            os.path.basename(os.path.normpath(dI))
            for dI in os.listdir(self.disk_path)
            if os.path.isdir(os.path.join(self.disk_path, dI))
        ]
        if lang_code is not None:
            output = [dI for dI in output if lang_code in dI]
        return output

    def find(self, lang_code, version="latest", verbose=True):
        """
        Contains logic for loading or downloading and loading models for the target language.
        
        Description: 
        if version == "latest":
            it checks for the local latest version available, and it loads it
            if it does not find any local version, it downloads the latest one it finds online
        if version == "2.0": (or any other specific version, != "latest")
            it checks for it locally, if it finds it, it loads it
            if it is not found locally, it attempts to download it from the cloud and then loads it
        Args:
            lang_code: Target language code.
                See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            version: "latest" to get the latest version, or other specific version in like "1.0", "2.1", etc .
        """
        # check for the latest local version, according to the version parameter
        if version == "latest":
            local_models = self.list_local_models(lang_code)
            if len(local_models) > 0:
                local_versions = [x[1] for x in local_models]
                local_versions.sort()
                latest_version = local_versions[-1]
                if verbose:
                    print("Loading latest local model: " + lang_code + "-" +
                          str(latest_version))
                return os.path.join(self.disk_path,
                                    lang_code + "-" + str(latest_version))
            else:  # no local models found, check online
                version = self._version_to_download(lang_code, version=version)
                if version is not None:
                    print("Latest version found online: " + lang_code +
                          "-" + str(version))
                else:  # nothing was found online
                    raise Exception(
                        "No model version for language [" + lang_code +
                        "] was found in the online repository!")
                self._download_model(lang_code, version)
                return os.path.join(self.disk_path,
                                    lang_code + "-" + str(version))

        else:  # check for a specific local version, according to the version parameter
            version = float(version)
            model_folder = os.path.join(self.disk_path,
                                        lang_code + "-" + str(version))
            if os.path.isdir(model_folder):
                return model_folder
            else:  # version not found locally, try to download it from the cloud
                requested_version = version
                version = self._version_to_download(lang_code, version=version)
                if version is None:
                    raise Exception(
                        "Version [" + str(requested_version) +
                        "] for language [" + lang_code +
                        "] was not found in the online repository. Maybe try "
                        "using .find(version='latest') to auto-download the "
                        "latest model?")
                self._download_model(lang_code, str(version))
                return os.path.join(self.disk_path,
                                    lang_code + "-" + str(version))
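
    # Hedged usage note: find() only resolves (and, if needed, downloads) the
    # model folder; it does not load anything by itself, since the _load calls
    # were left commented out in the original. For example:
    #   path = ModelStore().find('en')  # e.g. models/en-1.1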

    def _load(self, lang_code, version):
        """
        Loads all locally available component models into the instance.
        """
        # Refresh metadata
        self.metadata.read(
            os.path.join(self.disk_path, lang_code + "-" + str(version),
                         "metadata.json"))
        model_folder = os.path.join(self.disk_path,
                                    lang_code + "-" + str(version))
        embeddings_folder = os.path.join(self.disk_path, "embeddings")
        embeddings_file_path = os.path.join(embeddings_folder,
                                            self.metadata.embeddings_file_name)

        # 1. Load word embeddings
        self.embeddings = WordEmbeddings(verbose=False)
        sys.stdout.write('\tLoading embeddings... \n')
        self.embeddings.read_from_file(embeddings_file_path,
                                       None,
                                       full_load=False)

        # 2. Load tokenizer
        if not os.path.isfile(
                os.path.join(model_folder, 'tokenizer-tok.bestAcc')):
            sys.stdout.write('\tTokenization disabled. \n')
        else:
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(
                os.path.join(model_folder, 'tokenizer.encodings'))
            config = TieredTokenizerConfig(
                os.path.join(model_folder, 'tokenizer.conf'))
            tokenizer_object = TieredTokenizer(config,
                                               tokenizer_encodings,
                                               self.embeddings,
                                               runtime=True)
            tokenizer_object.load(os.path.join(model_folder, 'tokenizer'))
            self.model[PipelineComponents.TOKENIZER] = tokenizer_object

        # 3. Load compound
        if not os.path.isfile(os.path.join(model_folder, 'compound.bestAcc')):
            sys.stdout.write('\tCompound disabled. \n')
        else:
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(
                os.path.join(model_folder, 'compound.encodings'))
            config = CompoundWordConfig(
                os.path.join(model_folder, 'compound.conf'))
            compound_object = CompoundWordExpander(config,
                                                   compound_encodings,
                                                   self.embeddings,
                                                   runtime=True)
            compound_object.load(os.path.join(model_folder,
                                              'compound.bestAcc'))
            self.model[PipelineComponents.COMPOUND] = compound_object

        # 4. Load lemmatizer
        if not os.path.isfile(os.path.join(model_folder,
                                           'lemmatizer.bestACC')):
            sys.stdout.write('\tLemmatizer disabled. \n')
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(
                os.path.join(model_folder, 'lemmatizer.encodings'))
            config = LemmatizerConfig(
                os.path.join(model_folder, 'lemmatizer.conf'))
            lemmatizer_object = FSTLemmatizer(config,
                                              lemmatizer_encodings,
                                              self.embeddings,
                                              runtime=True)
            lemmatizer_object.load(
                os.path.join(model_folder, 'lemmatizer.bestACC'))
            self.model[PipelineComponents.LEMMATIZER] = lemmatizer_object

        # 5. Load taggers
        if not os.path.isfile(os.path.join(model_folder, 'tagger.bestUPOS')):
            sys.stdout.write('\tTagger disabled. \n')
        else:
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(
                os.path.join(model_folder, 'tagger.encodings'))
            config = TaggerConfig(os.path.join(model_folder, 'tagger.conf'))

            tagger_upos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_upos_object.load(
                os.path.join(model_folder, 'tagger.bestUPOS'))
            tagger_xpos_object = BDRNNTagger(config,
                                             tagger_encodings,
                                             self.embeddings,
                                             runtime=True)
            tagger_xpos_object.load(
                os.path.join(model_folder, 'tagger.bestXPOS'))
            tagger_attrs_object = BDRNNTagger(config,
                                              tagger_encodings,
                                              self.embeddings,
                                              runtime=True)
            tagger_attrs_object.load(
                os.path.join(model_folder, 'tagger.bestATTRS'))

            self.model[PipelineComponents.TAGGER] = [
                tagger_upos_object, tagger_xpos_object, tagger_attrs_object
            ]

        # 6. Load parser
        if not os.path.isfile(os.path.join(model_folder, 'parser.bestUAS')):
            sys.stdout.write('\tParser disabled. \n')
        else:
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(
                os.path.join(model_folder, 'parser.encodings'))
            config = ParserConfig(os.path.join(model_folder, 'parser.conf'))
            parser_object = BDRNNParser(config,
                                        parser_encodings,
                                        self.embeddings,
                                        runtime=True)
            parser_object.load(os.path.join(model_folder, 'parser.bestUAS'))
            self.model[PipelineComponents.PARSER] = parser_object

    def _download_model(self, lang_code, version):
        """
        Downloads pre-trained models for the provided language.

        Args:
            lang_code: Target language code.
                See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            version: Version of the model.
        """

        model_name = '{}-{}'.format(lang_code, version)
        model_path_cloud = '{}/{}.zip'.format(self.cloud_path, model_name)
        model_path_local = os.path.join(self.disk_path,
                                        '{}.zip'.format(model_name))

        # Download and extract models for provided language.
        self._download_and_extract_lang_model(model_path_cloud,
                                              model_path_local)
        self.metadata.read(
            os.path.join(self.disk_path, lang_code + "-" + str(version),
                         "metadata.json"))

        # Download Facebook embeddings based on the metadata read from the model
        self._download_embeddings(self.metadata.embeddings_remote_link,
                                  self.metadata.embeddings_file_name)
        sys.stdout.write("\n")

    def _download_with_progress_bar(self, url, local_filename):
        r = requests.get(url, stream=True)
        total_size = int(r.headers['Content-Length'].strip())
        current_size = 0
        f = fopen(local_filename, 'wb')
        for buf in r.iter_content(4096 * 16):
            if buf:
                f.write(buf)
                current_size += len(buf)
                done = int(40 * current_size / total_size)
                sys.stdout.write(
                    "\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." %
                    ('=' * done, ' ' *
                     (40 - done), 100 * current_size / total_size,
                     current_size / 1024 / 1024, total_size / 1024 / 1024))
                sys.stdout.flush()
        f.close()
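
    # Hedged note on the bar arithmetic above: 'done' maps the byte ratio onto
    # a fixed 40-character bar, e.g. 30 MB of 120 MB gives done == 10 and a
    # line like "[==========                              ] 25.0%, ...".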

    def _download_and_extract_lang_model(self, url, file_name, force=False):
        # Skip when the archive is already on disk, unless force re-downloads.
        if os.path.exists(file_name):
            if not force:
                return
            os.remove(file_name)

        temp_folder = tempfile.mkdtemp()
        try:
            # Download and extract zip archive.
            zip_file_name = os.path.join(temp_folder, "tmp.zip")
            self._download_with_progress_bar(url, zip_file_name)
            sys.stdout.write(
                "\rDownload complete, decompressing files ...                                         "
            )
            sys.stdout.flush()

            # Avoid shadowing the 'zipfile' module, which _zipper below uses.
            zip_archive = ZipFile(zip_file_name, "r")
            zip_archive.extractall(self.disk_path)
            zip_archive.close()
            sys.stdout.write("\nModel downloaded successfully.")
            sys.stdout.flush()

        except Exception as e:
            print("Error encountered, cleaning up and exiting ...")
            rmtree(temp_folder, ignore_errors=True)
            raise e

        # delete temporary folder
        rmtree(temp_folder, ignore_errors=True)

    def _download_embeddings(self, embeddings_remote_link,
                             embeddings_file_name):
        """
        Download remote embeddings for the provided lang_code.
        Args:
            @param lang_code: Target language code.
                See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            @param version: Version of the model to read which embedding file to get.
        """

        embeddings_folder = os.path.join(self.disk_path, "embeddings")
        if not os.path.exists(embeddings_folder):
            os.makedirs(embeddings_folder)
        embeddings_file = os.path.join(embeddings_folder, embeddings_file_name)

        # Check locally for the file
        sys.stdout.write("\nChecking for associated vector embeddings file [" +
                         embeddings_file_name + "] ...\n")
        if os.path.isfile(embeddings_file):
            return

        # We don't have the correct embedding file, download it ...
        self._download_with_progress_bar(embeddings_remote_link,
                                         embeddings_file)
        sys.stdout.write(
            "\rEmbeddings downloaded successfully.                                                  "
        )

    def _version_to_download(self, lang_code, version="latest"):
        """
        Returns the version of the language models that need to be downloaded,
        or None if there's nothing to be done.
        """
        # list_online_models already filters by lang_code
        lang_models = self.list_online_models(lang_code)

        if len(lang_models) == 0:
            return None  # nothing found online

        if version == "latest":
            # Compute latest version.
            remote_versions = [x[1] for x in lang_models]
            remote_versions.sort()
            return remote_versions[-1]
        else:
            for model in lang_models:
                if model[1] == float(version):
                    return version
            return None  # this particular version was not found online

    def delete_model(self, lang_code, version):
        """ 
        Deletes a local model. Also checks for associated embeddings file and cleans it up as well only if not referenced by any other local model
        """
        model = lang_code + "-" + str(version)
        model_folder = os.path.join(self.disk_path, model)
        # check if model exists
        if not os.path.isdir(model_folder):
            print("Model " + model + " not found! Nothing to delete.")
            return

        # determine which embeddings file this model references
        model_metadata = ModelMetadata()
        model_metadata.read(os.path.join(model_folder, "metadata.json"))
        embeddings_file_to_delete = model_metadata.embeddings_file_name

        # delete the model folder
        try:
            rmtree(model_folder)
        except OSError as e:
            print("Error removing folder from local disk: %s - %s." %
                  (e.filename, e.strerror))

        # search the other local models for references to the same embeddings file
        found_in_other_models = False
        lang_models = self._list_folders()
        for lang_model in lang_models:
            other_metadata_path = os.path.join(self.disk_path, lang_model,
                                               "metadata.json")
            if not os.path.isfile(other_metadata_path):
                continue  # skip the embeddings folder and other non-model folders
            model_metadata.read(other_metadata_path)
            other_embeddings_file = model_metadata.embeddings_file_name
            if other_embeddings_file == embeddings_file_to_delete:
                found_in_other_models = True
                print("Embeddings file " + embeddings_file_to_delete +
                      " is still being used by model " + lang_model +
                      " so it will not be deleted.")
                break
        if not found_in_other_models:
            embeddings_path = os.path.join(self.disk_path, "embeddings",
                                           embeddings_file_to_delete)
            try:
                os.remove(embeddings_path)
                print("Removed embeddings file " + embeddings_file_to_delete +
                      ".")
            except OSError as e:  # if it failed, report it back to the user
                print("Error removing embeddings file: %s - %s." %
                      (e.filename, e.strerror))

        print("Model cleanup successful.")

    def list_local_models(self, lang_code=None):
        """
        Returns a list of tuples of the models found locally
        ex: [("en",1.0),("en",1.1),("es",1.0)...]
        """
        lang_models = self._list_folders()
        lang_models = [
            x for x in lang_models if "-" in x
        ]  # eliminate the embeddings and any other non-model folder
        if len(lang_models) > 0:
            local_models = [(x.split("-")[0], float(x.split("-")[1]))
                            for x in lang_models]
            if lang_code:
                local_models = [x for x in local_models if lang_code in x[0]]
            return local_models
        else:
            return []
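
    # Hedged illustration of the folder naming convention assumed above: a
    # local folder 'en-1.1' parses to the tuple ('en', 1.1), while the shared
    # 'embeddings' folder contains no '-' and is filtered out before parsing.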

    def list_online_models(self, lang_code):
        """
        Returns a list of tuples of the models found online
        ex: [("en",1.0),("en",1.1),("es",1.0)...]
        """
        request = requests.get(self.MODELS_PATH_CLOUD_ALL)
        data = xmltodict.parse(request.content)

        # Make a list with all the archives in the container.
        online_models = [
            item['Name']
            for item in data['EnumerationResults']['Blobs']['Blob']
            if item['Name'].endswith('.zip')
        ]
        online_models = [(x.replace(".zip", "").split("-")[0],
                          float(x.replace(".zip", "").split("-")[1]))
                         for x in online_models if "-" in x]
        if lang_code:
            online_models = [x for x in online_models if lang_code in x[0]]
        return online_models

    def _copy_file(self, input_folder, output_folder, file_name):
        src_file = os.path.join(input_folder, file_name)
        dst_file = os.path.join(output_folder, file_name)
        if not os.path.isfile(src_file):
            return False
        copyfile(src_file, dst_file)
        return True

    def _zipper(self, folder, zip_file):
        archive = zipfile.ZipFile(zip_file, 'w',
                                  compression=zipfile.ZIP_DEFLATED)
        root_len = len(os.path.abspath(folder))
        for root, dirs, files in os.walk(folder):
            archive_root = os.path.abspath(root)[root_len:]
            for f in files:
                fullpath = os.path.join(root, f)
                archive_name = os.path.join(archive_root, f)
                archive.write(fullpath, archive_name, zipfile.ZIP_DEFLATED)
        archive.close()
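
    # Hedged note: archive_root strips the absolute prefix of the folder being
    # zipped, so an entry like <tmp>/en-1.1/metadata.json is stored under the
    # relative name en-1.1/metadata.json (ZipFile drops the leading separator
    # from archive names).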

    def package_model(self,
                      input_folder,
                      output_folder_path,
                      metadata,
                      should_contain_tokenizer=True,
                      should_contain_compound_word_expander=False,
                      should_contain_lemmatizer=True,
                      should_contain_tagger=True,
                      should_contain_parser=True):
        """
            input_folder = "English-GWT"
            output_folder_path = "path_to_where_zip_files_will_be_placed"
        """

        # check input folder exists
        if not os.path.isdir(input_folder):
            raise Exception("Input folder not found")

        # create temporary folder locally
        temp_folder = tempfile.mkdtemp()
        try:
            # create local model sub-folder
            output_folder = os.path.join(
                temp_folder,
                metadata.language_code + "-" + str(metadata.model_version))
            print("\tWriting model to temp folder: " + output_folder)
            os.makedirs(output_folder)

            # write metadata to this folder
            metadata.save(os.path.join(output_folder, "metadata.json"))

            # copy tokenizer files
            if should_contain_tokenizer:
                tokenizer_is_valid = all([
                    self._copy_file(input_folder, output_folder, f)
                    for f in ("tokenizer.encodings", "tokenizer.conf",
                              "tokenizer-tok.bestAcc", "tokenizer-ss.bestAcc")
                ])
                if tokenizer_is_valid:
                    print("\tTokenizer model found.")
                else:
                    raise Exception(
                        "Tokenizer model not found (or incomplete).")

            # copy compound_word_expander files
            if should_contain_compound_word_expander:
                compound_is_valid = all([
                    self._copy_file(input_folder, output_folder, f)
                    for f in ("compound.bestAcc", "compound.conf",
                              "compound.encodings")
                ])
                if compound_is_valid:
                    print("\tCompound word expander model found.")
                else:
                    raise Exception(
                        "Compound word expander model not found (or incomplete)."
                    )

            # copy tagger files
            if should_contain_tagger:
                tagger_is_valid = all([
                    self._copy_file(input_folder, output_folder, f)
                    for f in ("tagger.bestUPOS", "tagger.bestXPOS",
                              "tagger.bestATTRS", "tagger.conf",
                              "tagger.encodings")
                ])
                if tagger_is_valid:
                    print("\tTagger model found.")
                else:
                    raise Exception("Tagger model not found (or incomplete).")

            # copy lemmatizer files
            if should_contain_lemmatizer:
                # patch: normalize the legacy 'bestACC' extension to 'bestAcc'
                if os.path.isfile(
                        os.path.join(input_folder, "lemmatizer.bestACC")):
                    os.rename(os.path.join(input_folder, "lemmatizer.bestACC"),
                              os.path.join(input_folder, "lemmatizer.bestAcc"))
                lemmatizer_is_valid = all([
                    self._copy_file(input_folder, output_folder, f)
                    for f in ("lemmatizer.bestAcc", "lemmatizer.conf",
                              "lemmatizer.encodings")
                ])
                if lemmatizer_is_valid:
                    print("\tLemmatizer model found.")
                else:
                    raise Exception(
                        "Lemmatizer model not found (or incomplete).")

            # copy parser files
            if should_contain_parser:
                parser_is_valid = all([
                    self._copy_file(input_folder, output_folder, f)
                    for f in ("parser.bestUAS", "parser.bestLAS",
                              "parser.conf", "parser.encodings")
                ])
                if parser_is_valid:
                    print("\tParser model found.")
                else:
                    raise Exception("Parser model not found (or incomplete).")

            # package into zip file
            print("\tCompressing model ...")

            model_file = os.path.join(
                output_folder_path, metadata.language_code + "-" +
                str(metadata.model_version) + ".zip")
            self._zipper(temp_folder, model_file)

        except Exception as e:
            print("Error encountered, cleaning up and exiting ...")
            rmtree(temp_folder, ignore_errors=True)
            raise e

        # delete temporary folder
        print("\tCleaning up ...")
        rmtree(temp_folder, ignore_errors=True)

        print("Model packaged successfully as: " + model_file)