def parse_test(params):
    if params.test == "parser":
        print("Running " + params.test)
        print("==PARAMETERS==")
        print("EMBEDDINGS: " + params.embeddings)
        print("MODEL FILE: " + params.model_base)
        print("DECODER: " + params.decoder)
        print("OUTPUT: " + params.output_file)
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        testset = Dataset(params.test_file)
        encodings = Encodings()
        encodings.load(params.model_base + ".encodings")
        encodings.update_wordlist(testset)
        print("Updated word list: " + str(len(encodings.word_list)))

        config = ParserConfig(filename=params.config)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config, encodings, embeddings)
        parser.load(params.model_base + ".bestUAS")
        if params.decoder == 'mst':
            print("Using the MST decoder")
            from graph.decoders import MSTDecoder
            parser.decoder = MSTDecoder()

        f = fopen(params.output_file, "w")
        last_proc = 0
        index = 0
        for seq in testset.sequences:
            index += 1
            proc = index * 100 // len(testset.sequences)  # integer percentage for the progress display
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            iSeq = 0
            iRez = 0
            while iSeq < len(seq):
                while seq[iSeq].is_compound_entry:
                    iSeq += 1
                seq[iSeq].xpos = rez[iRez].xpos
                seq[iSeq].upos = rez[iRez].upos
                seq[iSeq].attrs = rez[iRez].attrs
                seq[iSeq].head = rez[iRez].head
                seq[iSeq].label = rez[iRez].label
                seq[iSeq].lemma = rez[iRez].lemma
                iSeq += 1
                iRez += 1

            for entry in seq:
                f.write(str(entry.index) + "\t" + str(entry.word) + "\t" + str(entry.lemma) + "\t" +
                        str(entry.upos) + "\t" + str(entry.xpos) + "\t" + str(entry.attrs) + "\t" +
                        str(entry.head) + "\t" + str(entry.label) + "\t" + str(entry.deps) + "\t" +
                        str(entry.space_after) + "\n")
            f.write("\n")
        f.close()
        sys.stdout.write("\n")
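For reference, each row written above is a ten-column, tab-separated, CoNLL-U-style line. A minimal sketch of a formatting helper that produces the same layout; the name conllu_line is ours and is not part of the codebase:

def conllu_line(entry):
    # index, word, lemma, upos, xpos, attrs (feats), head, label, deps, space_after (misc)
    fields = [entry.index, entry.word, entry.lemma, entry.upos, entry.xpos,
              entry.attrs, entry.head, entry.label, entry.deps, entry.space_after]
    return "\t".join(str(f) for f in fields)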
def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # common elements load
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        if not os.path.isfile(os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write("\n\tTokenizer model not found! (" +
                             os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(os.path.join(params.models, "tokenizer.encodings"))
    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if lemmatize:
        if not os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if tag:
        if not os.path.isfile(os.path.join(params.models, "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))
    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with open(params.input_file, 'r') as file:
            lines = file.readlines()

        # analyze use of spaces in the first part of the file
        test = ""
        useSpaces = " "
        cnt = 0
        while True:
            test = test + lines[cnt]
            # print(lines[cnt])
            if cnt + 1 >= len(lines) or cnt > 5:
                break
            cnt += 1
        if len(test) > 0 and float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""
        # print (str(float(test.count(' '))/float(len(test))))

        i = -1
        input_string = ""
        sequences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config, encodings, embeddings, runtime=True)
        compoundwordexpander_object.load(os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config, encodings, embeddings, runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))

        tagger_object_UPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_ATTRS.load(os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = copy.deepcopy(new_sequences)
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config, encodings, embeddings, runtime=True)
        lemmatizer_object.load(os.path.join(params.models, "lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
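A minimal sketch of driving parse_run programmatically. The attribute names mirror the ones read above, but the CLI flags that normally populate them are defined elsewhere, so treat this namespace and the paths as assumptions:

from argparse import Namespace

params = Namespace(
    input_file="corpus/input.txt",              # hypothetical paths
    output_file="corpus/output.conllu",
    models="models/en-1.0",
    embeddings="embeddings/wiki.en.vec",
    run="tokenizer,tagger,parser,lemmatizer",   # comma-separated component list
)
parse_run(params)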
def parse_train(params):
    if params.train == 'mt':
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("SRC TRAIN FILE: " + params.mt_train_src)
        print("SRC DEV FILE: " + params.mt_dev_src)
        print("SRC TEST FILE: " + str(params.mt_test_src))
        print("SRC EMBEDDINGS FILE: " + params.mt_source_embeddings)
        print("DST TRAIN FILE: " + params.mt_train_dst)
        print("DST DEV FILE: " + params.mt_dev_dst)
        print("DST TEST FILE: " + str(params.mt_test_dst))
        print("DST EMBEDDINGS FILE: " + params.mt_destination_embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = MTDataset(params.mt_train_src, params.mt_train_dst)
        devset = MTDataset(params.mt_dev_src, params.mt_dev_dst)
        if params.mt_test_src and params.mt_test_dst:
            testset = MTDataset(params.mt_test_src, params.mt_test_dst)
        else:
            testset = None

        config = NMTConfig(params.config)
        sys.stdout.write("--SOURCE--\n")
        sys.stdout.flush()
        src_enc = Encodings()
        src_enc.compute(trainset.to_conll_dataset('src'), devset.to_conll_dataset('src'), word_cutoff=5)
        sys.stdout.write("--DESTINATION--\n")
        sys.stdout.flush()
        dst_enc = Encodings()
        dst_enc.compute(trainset.to_conll_dataset('dst'), devset.to_conll_dataset('dst'), word_cutoff=5)

        sys.stdout.write("Reading source embeddings\n")
        src_we = WordEmbeddings()
        src_we.read_from_file(params.mt_source_embeddings, 'label', full_load=False)
        sys.stdout.write("Reading destination embeddings\n")
        dst_we = WordEmbeddings()
        dst_we.read_from_file(params.mt_destination_embeddings, 'label', full_load=False)

        nmt = BRNNMT(src_we, dst_we, src_enc, dst_enc, config)
        trainer = MTTrainer(nmt, src_enc, dst_enc, src_we, dst_we, params.itters,
                            trainset, devset, testset=testset)
        trainer.start_training(params.output_base, batch_size=params.batch_size)

    if params.train == "tagger":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("DEV FILE: " + params.dev_file)
        if params.test_file is not None:
            print("TEST FILE: " + params.test_file)
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = TaggerConfig(params.config)
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if a testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)

        tagger = BDRNNTagger(config, encodings, embeddings, aux_softmax_weight=params.aux_softmax_weight)
        trainer = TaggerTrainer(tagger, encodings, params.itters, trainset, devset, testset)
        trainer.start_training(params.output_base, batch_size=params.batch_size)

    elif params.train == "parser":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("DEV FILE: " + params.dev_file)
        if params.test_file is not None:
            print("TEST FILE: " + params.test_file)
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = ParserConfig(params.config)
        if not config._valid:
            return

        # PARAM INJECTION
        if params.params is not None:
            parts = params.params.split(":")
            for param in parts:
                variable = param.split("=")[0]
                value = param[len(variable) + 1:]
                print("External param injection: " + variable + "=" + value)
                exec("config.__dict__[\"" + variable + "\"] = " + value)
        # END INJECTION

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if a testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)

        parser = BDRNNParser(config, encodings, embeddings, aux_softmax_weight=params.aux_softmax_weight)
        trainer = ParserTrainer(parser, encodings, params.itters, trainset, devset, testset)
        trainer.start_training(params.output_base, params.batch_size)

    elif params.train == "lemmatizer":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("DEV FILE: " + params.dev_file)
        if params.test_file is not None:
            print("TEST FILE: " + params.test_file)
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = LemmatizerConfig(params.config)

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if a testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = None

        lemmatizer = FSTLemmatizer(config, encodings, embeddings)
        trainer = LemmatizerTrainer(lemmatizer, encodings, params.itters, trainset, devset, testset)
        trainer.start_training(params.output_base, batch_size=params.batch_size)

    elif params.train == "compound":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("DEV FILE: " + params.dev_file)
        if params.test_file is not None:
            print("TEST FILE: " + params.test_file)
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = CompoundWordConfig(params.config)

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if a testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = None

        expander = CompoundWordExpander(config, encodings, embeddings)
        trainer = CompoundWordTrainer(expander, encodings, params.itters, trainset, devset, testset)
        trainer.start_training(params.output_base, batch_size=params.batch_size)

    elif params.train == "tokenizer":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("RAW TRAIN FILE: " + (params.raw_train_file if params.raw_train_file is not None else "n/a"))
        print("DEV FILE: " + params.dev_file)
        print("RAW DEV FILE: " + (params.raw_dev_file if params.raw_dev_file is not None else "n/a"))
        print("TEST FILE: " + (params.test_file if params.test_file is not None else "n/a"))
        print("RAW TEST FILE: " + (params.raw_test_file if params.raw_test_file is not None else "n/a"))
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None

        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(params.config)
        config.raw_test_file = params.raw_test_file
        config.base = params.output_base
        config.patience = params.itters
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if a testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        # setting the wordlist to None makes WordEmbeddings act as cache-only and load offsets for all words
        embeddings.read_from_file(params.embeddings, None)

        tokenizer = TieredTokenizer(config, encodings, embeddings)
        trainer = TokenizerTrainer(tokenizer, encodings, params.itters, trainset, devset, testset,
                                   raw_train_file=params.raw_train_file,
                                   raw_dev_file=params.raw_dev_file,
                                   raw_test_file=params.raw_test_file,
                                   gold_train_file=params.train_file,
                                   gold_dev_file=params.dev_file,
                                   gold_test_file=params.test_file)
        trainer.start_training(params.output_base, batch_size=params.batch_size)
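The parser branch above supports external parameter injection through params.params, a colon-separated list of key=value pairs that are exec'd into config.__dict__. A hedged illustration of the expected string format; the keys below are made up and not guaranteed ParserConfig fields:

# Hypothetical command-line value:
#   --params "aux_softmax_weight=0.2:layer_dropouts=[0.33,0.33]"
# The injection loop splits on ':' and '=', then exec()s each assignment, so the net effect is:
#   config.aux_softmax_weight = 0.2
#   config.layer_dropouts = [0.33, 0.33]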
        print("--dev-file is mandatory")
        valid = False
    if not params.embeddings:
        print("--embeddings is mandatory")
        valid = False
    if not params.output_base:
        print("--store is mandatory")
        valid = False
    if valid:
        parse_train(params)

if params.server:
    from server.webserver import EmbeddedWebserver
    we = WordEmbeddings()
    we.read_from_file(params.embeddings, None, False)
    ews = EmbeddedWebserver(we, port=params.port,
                            lemma=params.model_lemmatization,
                            tokenization=params.model_tokenization,
                            tagging=params.model_tagging,
                            parsing=params.model_parsing)

if params.test:
    valid = True
    if valid:
        parse_test(params)

if params.run:
    valid = True
def load(self, language_code, version="latest", tokenization=True, compound_word_expanding=False,
         tagging=True, lemmatization=True, parsing=True):
    """
    Loads the pipeline with all available models for the target language.
    @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
    @param version: "latest" to get the latest version, or another specific version such as "1.0", "2.1", etc.
    """
    # Initialize a ModelStore object
    model_store_object = ModelStore(disk_path=self._model_repository)

    # Find a local model or download it if it does not exist, returning the local model folder path
    model_folder_path = model_store_object.find(lang_code=language_code, version=version,
                                                verbose=self._verbose)

    # Load metadata from the model
    self.metadata.read(os.path.join(model_folder_path, "metadata.json"))

    # Load embeddings
    embeddings = WordEmbeddings(verbose=False)
    if self._verbose:
        sys.stdout.write('\tLoading embeddings... \n')
    embeddings.read_from_file(os.path.join(self._embeddings_repository,
                                           self.metadata.embeddings_file_name), None, full_load=False)

    # 1. Load tokenizer
    if tokenization:
        if not os.path.isfile(os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')):
            sys.stdout.write('\tTokenization is not available on this model. \n')
        else:
            if self._verbose:
                sys.stdout.write('\tLoading tokenization model ...\n')
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(model_folder_path, 'tokenizer.encodings'))
            config = TieredTokenizerConfig(os.path.join(model_folder_path, 'tokenizer.conf'))
            self._tokenizer = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
            self._tokenizer.load(os.path.join(model_folder_path, 'tokenizer'))

    # 2. Load compound word expander
    if compound_word_expanding:
        if not os.path.isfile(os.path.join(model_folder_path, 'compound.bestAcc')):
            if self._verbose:  # suppress the warning here because many languages do not have compound words
                sys.stdout.write('\tCompound word expansion is not available on this model. \n')
        else:
            if self._verbose:
                sys.stdout.write('\tLoading compound word expander model ...\n')
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(model_folder_path, 'compound.encodings'))
            config = CompoundWordConfig(os.path.join(model_folder_path, 'compound.conf'))
            self._compound_word_expander = CompoundWordExpander(config, compound_encodings,
                                                                embeddings, runtime=True)
            self._compound_word_expander.load(os.path.join(model_folder_path, 'compound.bestAcc'))

    # 3. Load lemmatizer
    if lemmatization:
        if not os.path.isfile(os.path.join(model_folder_path, 'lemmatizer.bestACC')):
            sys.stdout.write('\tLemmatizer is not available on this model. \n')
        else:
            if self._verbose:
                sys.stdout.write('\tLoading lemmatization model ...\n')
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(model_folder_path, 'lemmatizer.encodings'))
            config = LemmatizerConfig(os.path.join(model_folder_path, 'lemmatizer.conf'))
            self._lemmatizer = FSTLemmatizer(config, lemmatizer_encodings, embeddings, runtime=True)
            self._lemmatizer.load(os.path.join(model_folder_path, 'lemmatizer.bestACC'))

    # 4. Load taggers
    if tagging or lemmatization:  # we need tagging for lemmatization
        if not os.path.isfile(os.path.join(model_folder_path, 'tagger.bestUPOS')):
            sys.stdout.write('\tTagging is not available on this model. \n')
            if lemmatization:
                sys.stdout.write('\t\tDisabling the lemmatization model due to missing tagger. \n')
                self._lemmatizer = None
        else:
            if self._verbose:
                sys.stdout.write('\tLoading tagger model ...\n')
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(model_folder_path, 'tagger.encodings'))
            config = TaggerConfig(os.path.join(model_folder_path, 'tagger.conf'))
            self._tagger = [None, None, None]
            self._tagger[0] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
            self._tagger[0].load(os.path.join(model_folder_path, 'tagger.bestUPOS'))
            self._tagger[1] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
            self._tagger[1].load(os.path.join(model_folder_path, 'tagger.bestXPOS'))
            self._tagger[2] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
            self._tagger[2].load(os.path.join(model_folder_path, 'tagger.bestATTRS'))

    # 5. Load parser
    if parsing:
        if not os.path.isfile(os.path.join(model_folder_path, 'parser.bestUAS')):
            sys.stdout.write('\tParsing is not available on this model... \n')
        else:
            if self._verbose:
                sys.stdout.write('\tLoading parser model ...\n')
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(model_folder_path, 'parser.encodings'))
            config = ParserConfig(os.path.join(model_folder_path, 'parser.conf'))
            self._parser = BDRNNParser(config, parser_encodings, embeddings, runtime=True)
            self._parser.load(os.path.join(model_folder_path, 'parser.bestUAS'))

    self._loaded = True
    if self._verbose:
        sys.stdout.write('Model loading complete.\n\n')
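A minimal usage sketch for the load() method above, assuming it lives on the high-level Cube API object; the constructor and its verbose argument are assumptions, while the load() parameters come from the signature shown:

cube = Cube(verbose=True)                      # assumed constructor
cube.load("en", version="latest",              # finds a local model or downloads it on first use
          tokenization=True, tagging=True,
          lemmatization=True, parsing=True,
          compound_word_expanding=False)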
class Cube:
    def __init__(self):
        """
        Create an empty instance of Cube.
        Before it can be used, you must call @method load with @param lang_code set to your target language.
        """
        self.loaded = False
        self.tokenizer_enabled = False
        self.compound_enabled = False
        self.lemmatizer_enabled = False
        self.parser_enabled = False
        self.tagger_enabled = False
        self.models = {}
        self.embeddings = None

    def download_models(self, lang_code):
        """
        Downloads pre-trained models for the desired language. All existing models will be overwritten.
        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @return: True if the download was successful, False otherwise
        """
        sys.stdout.write('TODO: Downloading models for ' + lang_code + "\n")

    def load(self, lang_code, base_path=None):
        """
        Loads the pipeline with all available models for the target language.
        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param base_path: Base path for models. Only required for custom-trained models; otherwise
            leave this parameter untouched to use the default model location.
        @return: True if loading was successful, False otherwise
        """
        sys.stdout.write('Loading models for ' + lang_code + "\n")
        if base_path is None:
            global BASE_PATH
            base_path = BASE_PATH

        self.embeddings = WordEmbeddings()
        self.embeddings.read_from_file(os.path.join(base_path, lang_code + "/wiki." + lang_code + ".vec"),
                                       None, full_load=False)

        if not os.path.isfile(os.path.join(base_path, lang_code + "/tokenizer-tok.bestAcc")):
            sys.stdout.write("\tTokenization disabled. \n")
        else:
            self.tokenizer_enabled = True
            sys.stdout.write("\tTokenization enabled.\n")
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(base_path, lang_code + "/tokenizer.encodings"))
            from io_utils.config import TieredTokenizerConfig
            from generic_networks.tokenizers import TieredTokenizer
            config = TieredTokenizerConfig(os.path.join(base_path, lang_code + "/tokenizer.conf"))
            tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True)
            tokenizer_object.load(os.path.join(base_path, lang_code + "/tokenizer"))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/compound.bestAcc")):
            sys.stdout.write("\tCompound disabled. \n")
        else:
            self.compound_enabled = True
            sys.stdout.write("\tCompound enabled.\n")
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(base_path, lang_code + "/compound.encodings"))
            from io_utils.config import CompoundWordConfig
            from generic_networks.token_expanders import CompoundWordExpander
            config = CompoundWordConfig(os.path.join(base_path, lang_code + "/compound.conf"))
            compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True)
            compound_object.load(os.path.join(base_path, lang_code + "/compound.bestAcc"))
            self.models[PipelineComponents.COMPOUND] = compound_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/lemmatizer.bestACC")):
            sys.stdout.write("\tLemmatizer disabled. \n")
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write("\tLemmatizer enabled.\n")
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(base_path, lang_code + "/lemmatizer.encodings"))
            from io_utils.config import LemmatizerConfig
            from generic_networks.lemmatizers import FSTLemmatizer
            config = LemmatizerConfig(os.path.join(base_path, lang_code + "/lemmatizer.conf"))
            lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True)
            lemmatizer_object.load(os.path.join(base_path, lang_code + "/lemmatizer.bestACC"))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        if not os.path.isfile(os.path.join(base_path, lang_code + "/tagger.bestUPOS")):
            sys.stdout.write("\tTagger disabled. \n")
        else:
            self.tagger_enabled = True
            sys.stdout.write("\tTagger enabled.\n")
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(base_path, lang_code + "/tagger.encodings"))
            from io_utils.config import TaggerConfig
            from generic_networks.taggers import BDRNNTagger
            config = TaggerConfig(os.path.join(base_path, lang_code + "/tagger.conf"))
            tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_upos_object.load(os.path.join(base_path, lang_code + "/tagger.bestUPOS"))
            tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_xpos_object.load(os.path.join(base_path, lang_code + "/tagger.bestXPOS"))
            tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_attrs_object.load(os.path.join(base_path, lang_code + "/tagger.bestATTRS"))
            self.models[PipelineComponents.TAGGER] = [tagger_upos_object, tagger_xpos_object, tagger_attrs_object]

        if not os.path.isfile(os.path.join(base_path, lang_code + "/parser.bestUAS")):
            sys.stdout.write("\tParser disabled. \n")
        else:
            self.parser_enabled = True
            sys.stdout.write("\tParser enabled.\n")
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(base_path, lang_code + "/parser.encodings"))
            from io_utils.config import ParserConfig
            from generic_networks.parsers import BDRNNParser
            config = ParserConfig(os.path.join(base_path, lang_code + "/parser.conf"))
            parser_object = BDRNNParser(config, parser_encodings, self.embeddings, runtime=True)
            parser_object.load(os.path.join(base_path, lang_code + "/parser.bestUAS"))
            self.models[PipelineComponents.PARSER] = parser_object

    def process_text(self, text="", pipeline=None):
        """
        Runs the pipeline on the input text. If the pipeline is set to None, Cube will run all available processing models.
        @param text: the text to be processed. It can either be raw text or a list of sentences, each composed of a list of CONLLEntry
        @param pipeline: a list of PipelineComponents to be used for processing
        @return: A list of sentences, each composed of a list of CONLLEntry items
        """
        if pipeline is None:
            pipeline = [PipelineComponents.TOKENIZER, PipelineComponents.PARSER, PipelineComponents.TAGGER,
                        PipelineComponents.LEMMATIZER, PipelineComponents.COMPOUND]

        if PipelineComponents.TOKENIZER in pipeline and self.tokenizer_enabled:
            sys.stdout.write("\nTokenizing... \n\t")
            sys.stdout.flush()

            lines = text.replace("\r", "").split("\n")
            # analyze use of spaces in the first part of the text
            test = ""
            useSpaces = " "
            cnt = 0
            while True:
                test = test + lines[cnt]
                # print(lines[cnt])
                if cnt + 1 >= len(lines) or cnt > 5:
                    break
                cnt += 1
            if float(test.count(' ')) / float(len(test)) < 0.02:
                useSpaces = ""
            # print (str(float(test.count(' '))/float(len(test))))

            input_string = ""
            for i in range(len(lines)):
                input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces

            sequences = self.models[PipelineComponents.TOKENIZER].tokenize(input_string)
            sys.stdout.write("\n")
        else:
            sequences = text

        if PipelineComponents.COMPOUND in pipeline and self.compound_enabled:
            sequences = self.models[PipelineComponents.COMPOUND].expand_sequences(sequences)

        if PipelineComponents.PARSER in pipeline and self.parser_enabled:
            sequences = self.models[PipelineComponents.PARSER].parse_sequences(sequences)

        if PipelineComponents.TAGGER in pipeline and self.tagger_enabled:
            new_sequences = []
            for sequence in sequences:
                new_sequence = copy.deepcopy(sequence)
                predicted_tags_UPOS = self.models[PipelineComponents.TAGGER][0].tag(new_sequence)
                predicted_tags_XPOS = self.models[PipelineComponents.TAGGER][1].tag(new_sequence)
                predicted_tags_ATTRS = self.models[PipelineComponents.TAGGER][2].tag(new_sequence)
                for entryIndex in range(len(sequence)):
                    new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                    new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                    new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
                new_sequences.append(new_sequence)
            sequences = new_sequences

        if PipelineComponents.LEMMATIZER in pipeline and self.lemmatizer_enabled:
            sequences = self.models[PipelineComponents.LEMMATIZER].lemmatize_sequences(sequences)

        return sequences
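A short end-to-end sketch of the Cube class defined above; it assumes models are already available under BASE_PATH/en/, and the entry attributes printed are the ones the pipeline populates:

cube = Cube()
cube.load("en")
sentences = cube.process_text("This is a test. And this is another one.")
for sentence in sentences:
    for entry in sentence:
        print(entry.word, entry.upos, entry.lemma)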
class ModelStore(object):
    """
    Abstraction layer for working with models.

    Usage example:
        model_store = ModelStore()

        # Load models for lang_code ro.
        model_store.load('ro')

        # Get latest versions.
        model_store.get_latest_model_versions()
    """

    MODELS_PATH_LOCAL = 'models'
    MODELS_PATH_CLOUD = 'https://nlpcube.blob.core.windows.net/models'
    MODELS_PATH_CLOUD_ALL = os.path.join(MODELS_PATH_CLOUD, '?restype=container&comp=list')
    EMBEDDINGS_NAME = 'wiki.{}.vec'
    FACEBOOK_EMBEDDINGS_URL = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'
    FACEBOOK_EMBEDDINGS_LOCATION = 'corpus/'

    def __init__(self, disk_path=None, cloud_path=None):
        self.disk_path = disk_path or self.MODELS_PATH_LOCAL
        self.cloud_path = cloud_path or self.MODELS_PATH_CLOUD
        self.models = {}

    def load(self, lang_code, check_for_latest=True):
        """
        Contains the logic for loading, or downloading and then loading, models for the target language.

        Args:
            lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            check_for_latest: Whether or not to get the latest version.
        """
        version_to_download = self.version_to_download(lang_code, check_for_latest)
        if version_to_download:
            self._download_models_version(lang_code, version_to_download)

        # Now we surely have the language models downloaded
        self._load(lang_code)

    def _load(self, lang_code):
        """
        Load models on the class.
        """
        sys.stdout.write('Loading models for {}\n'.format(lang_code))
        path_for_language = os.path.join(self.disk_path, lang_code)

        # 1. Load word embeddings.
        self.embeddings = WordEmbeddings()
        word_embeddings_for_language = 'wiki.{}.vec'.format(lang_code)
        self.embeddings.read_from_file(os.path.join(path_for_language, word_embeddings_for_language),
                                       None, full_load=False)

        # 2. Load tokenizer.
        if not os.path.isfile(os.path.join(path_for_language, 'tokenizer-tok.bestAcc')):
            sys.stdout.write('\tTokenization disabled. \n')
        else:
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(path_for_language, 'tokenizer.encodings'))
            config = TieredTokenizerConfig(os.path.join(path_for_language, 'tokenizer.conf'))
            tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True)
            tokenizer_object.load(os.path.join(path_for_language, 'tokenizer'))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        # 3. Load compound.
        if not os.path.isfile(os.path.join(path_for_language, 'compound.bestAcc')):
            sys.stdout.write('\tCompound disabled. \n')
        else:
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(path_for_language, 'compound.encodings'))
            config = CompoundWordConfig(os.path.join(path_for_language, 'compound.conf'))
            compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True)
            compound_object.load(os.path.join(path_for_language, 'compound.bestAcc'))
            self.models[PipelineComponents.COMPOUND] = compound_object

        # 4. Load lemmatizer.
        if not os.path.isfile(os.path.join(path_for_language, 'lemmatizer.bestACC')):
            sys.stdout.write('\tLemmatizer disabled. \n')
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(path_for_language, 'lemmatizer.encodings'))
            config = LemmatizerConfig(os.path.join(path_for_language, 'lemmatizer.conf'))
            lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True)
            lemmatizer_object.load(os.path.join(path_for_language, 'lemmatizer.bestACC'))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        # 5. Load taggers.
        if not os.path.isfile(os.path.join(path_for_language, 'tagger.bestUPOS')):
            sys.stdout.write('\tTagger disabled. \n')
        else:
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(path_for_language, 'tagger.encodings'))
            config = TaggerConfig(os.path.join(path_for_language, 'tagger.conf'))
            tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_upos_object.load(os.path.join(path_for_language, 'tagger.bestUPOS'))
            tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_xpos_object.load(os.path.join(path_for_language, 'tagger.bestXPOS'))
            tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_attrs_object.load(os.path.join(path_for_language, 'tagger.bestATTRS'))
            self.models[PipelineComponents.TAGGER] = [tagger_upos_object, tagger_xpos_object, tagger_attrs_object]

        # 6. Load parser.
        if not os.path.isfile(os.path.join(path_for_language, 'parser.bestUAS')):
            sys.stdout.write('\tParser disabled. \n')
        else:
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(path_for_language, 'parser.encodings'))
            config = ParserConfig(os.path.join(path_for_language, 'parser.conf'))
            parser_object = BDRNNParser(config, parser_encodings, self.embeddings, runtime=True)
            parser_object.load(os.path.join(path_for_language, 'parser.bestUAS'))
            self.models[PipelineComponents.PARSER] = parser_object

    def get_latest_model_versions(self):
        """
        Returns a dictionary with (lang_code, latest_version) for each language code.
        """
        request = requests.get(self.MODELS_PATH_CLOUD_ALL)
        data = xmltodict.parse(request.content)

        # Make a list with all the archives in the container.
        item_names = [item['Name'] for item in data['EnumerationResults']['Blobs']['Blob']
                      if item['Name'].endswith('.zip')]

        # Compute latest versions.
        latest_versions = defaultdict(str)
        for item in item_names:
            language, version = item.replace('.zip', '').split('-')
            latest_versions[language] = max(latest_versions[language], version)
        return latest_versions

    def _download_models_version(self, lang_code, version):
        """
        Downloads pre-trained models for the provided language.

        Args:
            @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
            @param version: Version of the models.
        """
        sys.stdout.write('Downloading models for {} \n'.format(lang_code))
        model_name = '{}-{}'.format(lang_code, version)
        model_path_cloud = os.path.join(self.cloud_path, '{}.zip'.format(model_name))
        model_path_local = os.path.join(self.disk_path, '{}.zip'.format(model_name))

        # Download and extract models for the provided language.
        self._download_and_extract_lang_models(model_path_cloud, model_path_local)
        # Download Facebook embeddings.
        self._download_facebook_embeddings(lang_code)

    def _download_and_extract_lang_models(self, url, file_name, force=False):
        if os.path.exists(file_name):
            if force:
                os.remove(file_name)
            return

        # Download and extract zip archive.
        request = requests.get(url)
        request_content = request.content
        zipfile = ZipFile(io.BytesIO(request_content))
        zipfile.extractall(self.disk_path)
        zipfile.close()

    def _download_facebook_embeddings(self, lang_code):
        """
        Download Facebook embeddings for the provided lang_code.
        """
        name = self.EMBEDDINGS_NAME.format(lang_code)
        embeddings_url = self.FACEBOOK_EMBEDDINGS_URL + name
        embeddings_path = os.path.join(self.disk_path, lang_code, name)
        request = requests.get(embeddings_url)
        with fopen(embeddings_path, 'wb') as fd:
            fd.write(request.content)

    def version_to_download(self, lang_code, check_for_latest=True):
        """
        Returns the version of the language models that needs to be downloaded,
        or None if there is nothing to be done.
        """
        lang_models = os.path.join(self.disk_path, lang_code)
        lang_models_version = os.path.join(lang_models, 'VERSION')

        # Get current version (if any).
        current_version = None
        if os.path.exists(lang_models):
            with fopen(lang_models_version) as fd:
                current_version = fd.read().strip('\n')

        # Get the latest version.
        latest_versions = self.get_latest_model_versions()
        latest_version = latest_versions.get(lang_code)

        if check_for_latest:
            if not latest_version:
                if not current_version:
                    raise ValueError('No remote version found for {}!'.format(lang_code))
                print('No remote version found for {}, using the local '
                      'version {}'.format(lang_code, current_version))
                return
            if current_version and current_version >= latest_version:
                return
            return latest_version

        if not current_version:
            return latest_version
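A usage sketch matching the class docstring above; note that load() downloads the model archive and the Facebook embeddings on first use:

model_store = ModelStore()

# Download (if needed) and load the Romanian models.
model_store.load("ro")

# Inspect the latest versions available in the cloud container.
print(model_store.get_latest_model_versions())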
class ModelStore(object): """ Abstraction layer for working with models. """ MODELS_PATH_LOCAL = 'models' MODELS_PATH_CLOUD = 'https://nlpcube.blob.core.windows.net/models' MODELS_PATH_CLOUD_ALL = os.path.join(MODELS_PATH_CLOUD, '?restype=container&comp=list') def __init__(self, disk_path=None, cloud_path=None): self.disk_path = disk_path or self.MODELS_PATH_LOCAL self.cloud_path = cloud_path or self.MODELS_PATH_CLOUD self.model = {} self.metadata = ModelMetadata() def _list_folders(self, lang_code=None): output = [ os.path.basename(os.path.normpath(dI)) for dI in os.listdir(self.disk_path) if os.path.isdir(os.path.join(self.disk_path, dI)) ] if lang_code != None: output = [dI for dI in output if lang_code in dI] return output def find(self, lang_code, version="latest", verbose=True): """ Contains logic for loading or downloading and loading models for the target language. Description: if version == "latest": it checks for the local latest version available, and it loads it if it does not find any local version, it downloads the latest one it finds online if version == "2.0": (or any other specific version, != "latest") it checks for it locally, if it finds it, it loads it if it is not found locally, it attempts to download it from the cloud and then loads it Args: lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes version: "latest" to get the latest version, or other specific version in like "1.0", "2.1", etc . """ # check for the latest local version, according to version parameter if version == "latest": latest_version = None local_models = self.list_local_models(lang_code) if len(local_models) > 0: local_versions = [x[1] for x in local_models] local_versions.sort() latest_version = local_versions[-1] if verbose: print("Loading latest local model: " + lang_code + "-" + str(latest_version)) #self._load(lang_code,latest_version) return os.path.join(self.disk_path, lang_code + "-" + str(latest_version)) else: # no models found, check online according to version parameter if version == "latest": version = self._version_to_download(lang_code, version=version) if version != None: print("Latest version found online: " + lang_code + "-" + str(version)) else: # nothing was found online raise Exception( "No model version for language [" + lang_code + "] was found in the online repository!") self._download_model(lang_code, version) return os.path.join( self.disk_path, lang_code + "-" + str(version)) #self._load(lang_code,version) else: # check for a specific local version, according to version parameter version = float(version) if os.path.isdir(os.path.join(self.disk_path, lang_code, version)): return os.path.join( self.disk_path, lang_code + "-" + str(version)) #self._load(lang_code,version) else: # version not found, trying to download it from the cloud version = self._version_to_download(lang_code, version=version) if version == None: raise Exception( "Version [" + str(version) + "] for language [" + lang_code + "] was not found in the online repository. Maybe try using .find(version='latest') to auto-download the latest model?" ) self._download_model(lang_code, str(version)) return os.path.join( self.disk_path, lang_code + "-" + str(version)) #self._load(lang_code,version) def _load(self, lang_code, version): """ Load models on the class. 
""" # Refresh metadata self.metadata.read( os.path.join(self.disk_path, lang_code + "-" + str(version), "metadata.json")) model_folder = os.path.join(self.disk_path, lang_code + "-" + str(version)) embeddings_folder = os.path.join(self.disk_path, "embeddings") embeddings_file_path = os.path.join(embeddings_folder, self.metadata.embeddings_file_name) #sys.stdout.write('Loading model for {}-{}\n'.format(lang_code,version)) # 1. Load word embeddings self.embeddings = WordEmbeddings(verbose=False) sys.stdout.write('\tLoading embeddings... \n') self.embeddings.read_from_file(embeddings_file_path, None, full_load=False) # 2. Load tokenizer if not os.path.isfile( os.path.join(model_folder, 'tokenizer-tok.bestAcc')): sys.stdout.write('\tTokenization disabled. \n') else: self.tokenizer_enabled = True sys.stdout.write('\tTokenization enabled.\n') tokenizer_encodings = Encodings(verbose=False) tokenizer_encodings.load( os.path.join(model_folder, 'tokenizer.encodings')) config = TieredTokenizerConfig( os.path.join(model_folder, 'tokenizer.conf')) tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True) tokenizer_object.load(os.path.join(model_folder, 'tokenizer')) self.model[PipelineComponents.TOKENIZER] = tokenizer_object # 3. Load compound if not os.path.isfile(os.path.join(model_folder, 'compound.bestAcc')): sys.stdout.write('\tCompound disabled. \n') else: self.compound_enabled = True sys.stdout.write('\tCompound enabled.\n') compound_encodings = Encodings(verbose=False) compound_encodings.load( os.path.join(model_folder, 'compound.encodings')) config = CompoundWordConfig( os.path.join(model_folder, 'compound.conf')) compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True) compound_object.load(os.path.join(model_folder, 'compound.bestAcc')) self.model[PipelineComponents.COMPOUND] = compound_object # 4. Load lemmatizer if not os.path.isfile(os.path.join(model_folder, 'lemmatizer.bestACC')): sys.stdout.write('\tLemmatizer disabled. \n') else: self.lemmatizer_enabled = True sys.stdout.write('\tLemmatizer enabled.\n') lemmatizer_encodings = Encodings(verbose=False) lemmatizer_encodings.load( os.path.join(model_folder, 'lemmatizer.encodings')) config = LemmatizerConfig( os.path.join(model_folder, 'lemmatizer.conf')) lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True) lemmatizer_object.load( os.path.join(model_folder, 'lemmatizer.bestACC')) self.model[PipelineComponents.LEMMATIZER] = lemmatizer_object # 5. Load taggers if not os.path.isfile(os.path.join(model_folder, 'tagger.bestUPOS')): sys.stdout.write('\tTagger disabled. \n') else: self.tagger_enabled = True sys.stdout.write('\tTagger enabled.\n') tagger_encodings = Encodings(verbose=False) tagger_encodings.load( os.path.join(model_folder, 'tagger.encodings')) config = TaggerConfig(os.path.join(model_folder, 'tagger.conf')) tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True) tagger_upos_object.load( os.path.join(model_folder, 'tagger.bestUPOS')) tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True) tagger_xpos_object.load( os.path.join(model_folder, 'tagger.bestXPOS')) tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True) tagger_attrs_object.load( os.path.join(model_folder, 'tagger.bestATTRS')) self.model[PipelineComponents.TAGGER] = [ tagger_upos_object, tagger_xpos_object, tagger_attrs_object ] # 6. 
    def _download_model(self, lang_code, version):
        """
        Downloads the pre-trained model for the provided language.

        Args:
            @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes.
            @param version: Version of the model.
        """
        #sys.stdout.write('Downloading models for {} \n'.format(lang_code))
        model_name = '{}-{}'.format(lang_code, version)
        # Build the download URL by string formatting (not os.path.join) so it stays a valid URL on every platform.
        model_path_cloud = '{}/{}.zip'.format(self.cloud_path, model_name)
        model_path_local = os.path.join(self.disk_path, '{}.zip'.format(model_name))

        # Download and extract the model archive for the provided language.
        self._download_and_extract_lang_model(model_path_cloud, model_path_local)

        # Read the metadata shipped with the model ...
        self.metadata.read(os.path.join(self.disk_path, lang_code + "-" + str(version), "metadata.json"))
        # ... and download the Facebook embeddings file it references.
        self._download_embeddings(self.metadata.embeddings_remote_link, self.metadata.embeddings_file_name)
        sys.stdout.write("\n")

    def _download_with_progress_bar(self, url, local_filename):
        r = requests.get(url, stream=True)
        total_size = int(r.headers['Content-Length'].strip())
        current_size = 0
        f = fopen(local_filename, 'wb')
        for buf in r.iter_content(4096 * 16):
            if buf:
                f.write(buf)
                current_size += len(buf)
                done = int(40 * current_size / total_size)
                sys.stdout.write("\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." % (
                    '=' * done, ' ' * (40 - done),
                    100 * current_size / total_size,
                    current_size / 1024 / 1024, total_size / 1024 / 1024))
                sys.stdout.flush()
        f.close()

    def _download_and_extract_lang_model(self, url, file_name, force=False):
        if os.path.exists(file_name):
            if force:
                os.remove(file_name)
            else:
                # archive already present locally and no forced re-download requested
                return

        temp_folder = tempfile.mkdtemp()
        try:
            # Download the zip archive into a temporary folder and extract it.
            zip_file_name = os.path.join(temp_folder, "tmp.zip")
            self._download_with_progress_bar(url, zip_file_name)

            sys.stdout.write("\rDownload complete, decompressing files ...                      ")
            sys.stdout.flush()

            zip_archive = ZipFile(zip_file_name, "r")
            zip_archive.extractall(self.disk_path)
            zip_archive.close()

            sys.stdout.write("\nModel downloaded successfully.")
            sys.stdout.flush()
        except Exception as e:
            print("Error encountered, cleaning up and exiting ...")
            rmtree(temp_folder, ignore_errors=True)
            raise e

        # delete the temporary folder
        rmtree(temp_folder, ignore_errors=True)
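    # --- Note (illustrative, not part of the original file) ---
    # Model archives are expected to live in the cloud container as flat zip files
    # named <lang_code>-<version>.zip, e.g. (hypothetical example):
    #
    #   https://nlpcube.blob.core.windows.net/models/en-1.0.zip
    #
    # _download_and_extract_lang_model unpacks the archive directly into
    # self.disk_path, producing a local folder such as models/en-1.0/, which is
    # the layout that find() and _load() expect.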
""" embeddings_folder = os.path.join(self.disk_path, "embeddings") if not os.path.exists(embeddings_folder): os.makedirs(embeddings_folder) embeddings_file = os.path.join(embeddings_folder, embeddings_file_name) # Check locally for the file sys.stdout.write("\nChecking for associated vector embeddings file [" + embeddings_file_name + "] ...\n") if os.path.isfile(embeddings_file): return # We don't have the correct embedding file, download it ... self._download_with_progress_bar(embeddings_remote_link, embeddings_file) sys.stdout.write( "\rEmbeddings downloaded successfully. " ) def _version_to_download(self, lang_code, version="latest"): """ Returns the version of the language models that need to be downloaded, or None if there's nothing to be done. """ online_models = self.list_online_models(lang_code) # filter by lang code lang_models = [x for x in online_models if lang_code in x[0]] if len(lang_models) == 0: return None # nothing found online if version == "latest": # Compute latest version. remote_versions = [x[1] for x in lang_models] remote_versions.sort() return remote_versions[-1] else: for model in lang_models: if str(version) in model: return version return None # not found this particular version online def delete_model(self, lang_code, version): """ Deletes a local model. Also checks for associated embeddings file and cleans it up as well only if not referenced by any other local model """ model = lang_code + "-" + str(version) model_folder = os.path.join(self.disk_path, model) # check if model exists if not os.path.isdir(model_folder): print("Model " + model + " not found! Nothing to delete.") return # determine which embedding file we need to delete model_metadata.load(lang_code + "-" + str(version)) embeddings_file_to_delete = model_metadata.embeddings_file_name # delete the model import shutil try: shutil.rmtree(model_folder) except OSError as e: print("Error removing folder from local disk: %s - %s." % (e.filename, e.strerror)) # search in other models for referenced embeddings file found_in_other_models = False lang_models = self._list_folders() for lang_model in lang_models: model_metadata.load(lang_model) other_embeddings_file = model_metadata.embeddings_file_name if other_embeddings_file == embeddings_file_to_delete: found_in_other_models = True print("Embeddings file " + embeddings_file_to_delete + " is still being used by model " + lang_model + " so it will not be deleted.") break if not found_in_other_models: try: os.remove(embeddings_file_to_delete) print("Removed embeddings file " + embeddings_file_to_delete + ".") except OSError as e: ## if failed, report it back to the user ## print("Error removing embeddings file: %s - %s." % (e.filename, e.strerror)) print("Model cleanup successful.") def list_local_models(self, lang_code=None): """ Returns a list of tuples of the models found locally ex: [("en",1.0),("en",1.1),("es",1.0)...] """ lang_models = self._list_folders() lang_models = [ x for x in lang_models if "-" in x ] # eliminate the embeddings and any other non-model folder if len(lang_models) > 0: local_models = [(x.split("-")[0], float(x.split("-")[1])) for x in lang_models] if lang_code: local_models = [x for x in local_models if lang_code in x[0]] return local_models else: return [] def list_online_models(self, lang_code): """ Returns a list of tuples of the models found online ex: [("en",1.0),("en",1.1),("es",1.0)...] 
""" request = requests.get(self.MODELS_PATH_CLOUD_ALL) data = xmltodict.parse(request.content) # Make a list with all the archives in the container. online_models = [ item['Name'] for item in data['EnumerationResults']['Blobs']['Blob'] if item['Name'].endswith('.zip') ] online_models = [(x.replace(".zip", "").split("-")[0], float(x.replace(".zip", "").split("-")[1])) for x in online_models if "-" in x] if lang_code: online_models = [x for x in online_models if lang_code in x[0]] return online_models def _copy_file(self, input_folder, output_folder, file_name): src_file = os.path.join(input_folder, file_name) dst_file = os.path.join(output_folder, file_name) if not os.path.isfile(src_file): return False copyfile(src_file, dst_file) return True def _zipper(self, dir, zip_file): zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) root_len = len(os.path.abspath(dir)) for root, dirs, files in os.walk(dir): archive_root = os.path.abspath(root)[root_len:] for f in files: fullpath = os.path.join(root, f) archive_name = os.path.join(archive_root, f) zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED) zip.close() def package_model(self, input_folder, output_folder_path, metadata, should_contain_tokenizer=True, should_contain_compound_word_expander=False, should_contain_lemmatizer=True, should_contain_tagger=True, should_contain_parser=True): """ input_folder = "English-GWT" output_folder_path = "path_to_where_zip_files_will_be_placed" """ # check input folder exists if not os.path.isdir(input_folder): raise Exception("Input folder not found") # create temporary folder locally temp_folder = tempfile.mkdtemp() try: # create local model sub-folder output_folder = os.path.join( temp_folder, metadata.language_code + "-" + str(metadata.model_version)) print("\tWriting model to temp folder: " + output_folder) os.makedirs(output_folder) # write metadata to this folder metadata.save(os.path.join(output_folder, "metadata.json")) # copy tokenizer files if should_contain_tokenizer: tokenizer_is_valid = True if not self._copy_file(input_folder, output_folder, "tokenizer.encodings"): tokenizer_is_valid = False if not self._copy_file(input_folder, output_folder, "tokenizer.conf"): tokenizer_is_valid = False if not self._copy_file(input_folder, output_folder, "tokenizer-tok.bestAcc"): tokenizer_is_valid = False if not self._copy_file(input_folder, output_folder, "tokenizer-ss.bestAcc"): tokenizer_is_valid = False if tokenizer_is_valid: print("\tTokenizer model found.") else: raise Exception( "Tokenizer model not found (or incomplete).") # copy compound_word_expander files if should_contain_compound_word_expander: compound_word_expander = True if not self._copy_file(input_folder, output_folder, "compound.bestAcc"): compound_word_expander = False if not self._copy_file(input_folder, output_folder, "compound.conf"): compound_word_expander = False if not self._copy_file(input_folder, output_folder, "compound.encodings"): compound_word_expander = False if compound_word_expander: print("\tCompound word expander model found.") else: raise Exception( "Compound word expander model not found (or incomplete)." 
    def package_model(self, input_folder, output_folder_path, metadata,
                      should_contain_tokenizer=True,
                      should_contain_compound_word_expander=False,
                      should_contain_lemmatizer=True,
                      should_contain_tagger=True,
                      should_contain_parser=True):
        """
        Packages a trained model folder into a zip archive ready for upload.

        Example:
            input_folder = "English-GWT"
            output_folder_path = "path_to_where_zip_files_will_be_placed"
        """
        # check that the input folder exists
        if not os.path.isdir(input_folder):
            raise Exception("Input folder not found")

        # create a temporary folder locally
        temp_folder = tempfile.mkdtemp()
        try:
            # create the local model sub-folder
            output_folder = os.path.join(temp_folder, metadata.language_code + "-" + str(metadata.model_version))
            print("\tWriting model to temp folder: " + output_folder)
            os.makedirs(output_folder)

            # write metadata to this folder
            metadata.save(os.path.join(output_folder, "metadata.json"))

            # copy tokenizer files
            if should_contain_tokenizer:
                tokenizer_is_valid = True
                if not self._copy_file(input_folder, output_folder, "tokenizer.encodings"):
                    tokenizer_is_valid = False
                if not self._copy_file(input_folder, output_folder, "tokenizer.conf"):
                    tokenizer_is_valid = False
                if not self._copy_file(input_folder, output_folder, "tokenizer-tok.bestAcc"):
                    tokenizer_is_valid = False
                if not self._copy_file(input_folder, output_folder, "tokenizer-ss.bestAcc"):
                    tokenizer_is_valid = False
                if tokenizer_is_valid:
                    print("\tTokenizer model found.")
                else:
                    raise Exception("Tokenizer model not found (or incomplete).")

            # copy compound word expander files
            if should_contain_compound_word_expander:
                compound_word_expander = True
                if not self._copy_file(input_folder, output_folder, "compound.bestAcc"):
                    compound_word_expander = False
                if not self._copy_file(input_folder, output_folder, "compound.conf"):
                    compound_word_expander = False
                if not self._copy_file(input_folder, output_folder, "compound.encodings"):
                    compound_word_expander = False
                if compound_word_expander:
                    print("\tCompound word expander model found.")
                else:
                    raise Exception("Compound word expander model not found (or incomplete).")

            # copy tagger files
            if should_contain_tagger:
                tagger = True
                if not self._copy_file(input_folder, output_folder, "tagger.bestUPOS"):
                    tagger = False
                if not self._copy_file(input_folder, output_folder, "tagger.bestXPOS"):
                    tagger = False
                if not self._copy_file(input_folder, output_folder, "tagger.bestATTRS"):
                    tagger = False
                if not self._copy_file(input_folder, output_folder, "tagger.conf"):
                    tagger = False
                if not self._copy_file(input_folder, output_folder, "tagger.encodings"):
                    tagger = False
                if tagger:
                    print("\tTagger model found.")
                else:
                    raise Exception("Tagger model not found (or incomplete).")

            # copy lemmatizer files
            if should_contain_lemmatizer:
                lemmatizer = True
                # patch: older training runs saved the model as "lemmatizer.bestACC"
                if os.path.isfile(os.path.join(input_folder, "lemmatizer.bestACC")):
                    os.rename(os.path.join(input_folder, "lemmatizer.bestACC"),
                              os.path.join(input_folder, "lemmatizer.bestAcc"))
                if not self._copy_file(input_folder, output_folder, "lemmatizer.bestAcc"):
                    lemmatizer = False
                if not self._copy_file(input_folder, output_folder, "lemmatizer.conf"):
                    lemmatizer = False
                if not self._copy_file(input_folder, output_folder, "lemmatizer.encodings"):
                    lemmatizer = False
                if lemmatizer:
                    print("\tLemmatizer model found.")
                else:
                    raise Exception("Lemmatizer model not found (or incomplete).")

            # copy parser files
            if should_contain_parser:
                parser = True
                if not self._copy_file(input_folder, output_folder, "parser.bestUAS"):
                    parser = False
                if not self._copy_file(input_folder, output_folder, "parser.bestLAS"):
                    parser = False
                if not self._copy_file(input_folder, output_folder, "parser.conf"):
                    parser = False
                if not self._copy_file(input_folder, output_folder, "parser.encodings"):
                    parser = False
                if parser:
                    print("\tParser model found.")
                else:
                    raise Exception("Parser model not found (or incomplete).")

            # package everything into a zip file
            print("\tCompressing model ...")
            model_file = os.path.join(output_folder_path,
                                      metadata.language_code + "-" + str(metadata.model_version) + ".zip")
            self._zipper(temp_folder, model_file)
        except Exception as e:
            print("Error encountered, cleaning up and exiting ...")
            rmtree(temp_folder, ignore_errors=True)
            raise e

        # delete the temporary folder
        print("\tCleaning up ...")
        rmtree(temp_folder, ignore_errors=True)
        print("Model packaged successfully as: " + model_file)
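    # --- Usage sketch (illustrative only, not part of the original file) ---
    # Packaging a trained model for upload might look like this; the folder names
    # and metadata values below are hypothetical:
    #
    #   metadata = ModelMetadata()
    #   metadata.language_code = "en"
    #   metadata.model_version = 1.0
    #   metadata.embeddings_file_name = "wiki.en.vec"
    #   metadata.embeddings_remote_link = "https://example.com/wiki.en.vec"
    #
    #   store = ModelStore()
    #   store.package_model("English-GWT", "dist", metadata,
    #                       should_contain_compound_word_expander=True)
    #
    # The resulting dist/en-1.0.zip can then be uploaded to the models container.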