def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    # select pipeline components from the comma-separated "run" value
    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # common elements load
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    # all non-tokenizer components share a single Encodings object:
    # only the first enabled component's encodings file is loaded
    encodings = None
    if tokenize:
        if not os.path.isfile(os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write("\n\tTokenizer model not found! ("
                             + os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(os.path.join(params.models, "tokenizer.encodings"))

    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))

    if lemmatize:
        if not os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))

    if tag:
        if not os.path.isfile(os.path.join(params.models, "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))

    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with open(params.input_file, 'r') as file:
            lines = file.readlines()

        # analyze use of spaces in the first part of the file
        # (bounded loop so short or empty files cannot index past the end)
        test = ""
        useSpaces = " "
        cnt = 0
        while cnt < len(lines) and cnt < 7:
            test = test + lines[cnt]
            # print(lines[cnt])
            cnt += 1
        if len(test) > 0 and float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""
        # print (str(float(test.count(' '))/float(len(test))))

        i = -1
        input_string = ""
        sequences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config, encodings, embeddings, runtime=True)
        compoundwordexpander_object.load(os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config, encodings, embeddings, runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_ATTRS.load(os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = copy.deepcopy(new_sequences)
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config, encodings, embeddings, runtime=True)
        lemmatizer_object.load(os.path.join(params.models, "lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
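

# Usage sketch (illustrative only): parse_run() expects a params object exposing
# input_file, output_file, models, embeddings and run attributes. The helper below
# shows one way such an object could be built with argparse; the flag names are
# assumptions made for this example and are not necessarily the project's real CLI.
def _example_parse_run_cli(argv=None):
    """Minimal sketch of driving parse_run() from the command line (assumed flags)."""
    import argparse
    cli = argparse.ArgumentParser(description="Run the NLP-Cube runtime pipeline")
    cli.add_argument("--input-file", dest="input_file", required=True,
                     help="input file (raw text when the tokenizer is enabled)")
    cli.add_argument("--output-file", dest="output_file", required=True,
                     help="where the annotated dataset is written")
    cli.add_argument("--models", required=True, help="folder containing the trained model files")
    cli.add_argument("--embeddings", required=True, help="path to the word embeddings file")
    cli.add_argument("--run", default="tokenizer,tagger,parser",
                     help="comma-separated components: tokenizer,compound,lemmatizer,tagger,parser")
    parse_run(cli.parse_args(argv))
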
class Cube(object):
    def __init__(self, verbose=False):
        """Create an empty instance of Cube.

        Before it can be used, you must call @method load with @param language_code
        set to your target language.
        """
        self._loaded = False
        self._verbose = verbose
        self._tokenizer = None  # tokenizer object, default is None
        self._compound_word_expander = None  # compound word expander, default is None
        self._lemmatizer = None  # lemmatizer object, default is None
        self._parser = None  # parser object, default is None
        self._tagger = None  # tagger object, default is None
        self.embeddings = None  # ?? needed?
        self.metadata = ModelMetadata()
        self._model_repository = "models"
        self._embeddings_repository = os.path.join("models", "embeddings")
        # self.model_store = ModelStore()  # needed???

    def load(self,
             language_code,
             version="latest",
             tokenization=True,
             compound_word_expanding=False,
             tagging=True,
             lemmatization=True,
             parsing=True):
        """Load the pipeline with all available models for the target language.

        @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/
            for available languages and their codes.
        @param version: "latest" to get the latest version, or a specific version like "1.0", "2.1", etc.
        """
        # Initialize a ModelStore object
        model_store_object = ModelStore(disk_path=self._model_repository)

        # Find a local model or download it if it does not exist, returning the local model folder path
        model_folder_path = model_store_object.find(lang_code=language_code,
                                                    version=version,
                                                    verbose=self._verbose)

        # Load metadata from the model
        self.metadata.read(os.path.join(model_folder_path, "metadata.json"))

        # Load embeddings
        embeddings = WordEmbeddings(verbose=False)
        if self._verbose:
            sys.stdout.write('\tLoading embeddings... \n')
        embeddings.read_from_file(os.path.join(self._embeddings_repository,
                                               self.metadata.embeddings_file_name),
                                  None,
                                  full_load=False)

        # 1. Load tokenizer
        if tokenization:
            if not os.path.isfile(os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')):
                sys.stdout.write('\tTokenization is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tokenization model ...\n')
                tokenizer_encodings = Encodings(verbose=False)
                tokenizer_encodings.load(os.path.join(model_folder_path, 'tokenizer.encodings'))
                config = TieredTokenizerConfig(os.path.join(model_folder_path, 'tokenizer.conf'))
                self._tokenizer = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
                self._tokenizer.load(os.path.join(model_folder_path, 'tokenizer'))

        # 2. Load compound word expander
        if compound_word_expanding:
            if not os.path.isfile(os.path.join(model_folder_path, 'compound.bestAcc')):
                # suppress the warning unless verbose, because many languages do not have compound words
                if self._verbose:
                    sys.stdout.write('\tCompound word expansion is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading compound word expander model ...\n')
                compound_encodings = Encodings(verbose=False)
                compound_encodings.load(os.path.join(model_folder_path, 'compound.encodings'))
                config = CompoundWordConfig(os.path.join(model_folder_path, 'compound.conf'))
                self._compound_word_expander = CompoundWordExpander(config, compound_encodings,
                                                                    embeddings, runtime=True)
                self._compound_word_expander.load(os.path.join(model_folder_path, 'compound.bestAcc'))

        # 3. Load lemmatizer
        if lemmatization:
            if not os.path.isfile(os.path.join(model_folder_path, 'lemmatizer.bestACC')):
                sys.stdout.write('\tLemmatizer is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading lemmatization model ...\n')
                lemmatizer_encodings = Encodings(verbose=False)
                lemmatizer_encodings.load(os.path.join(model_folder_path, 'lemmatizer.encodings'))
                config = LemmatizerConfig(os.path.join(model_folder_path, 'lemmatizer.conf'))
                self._lemmatizer = FSTLemmatizer(config, lemmatizer_encodings, embeddings, runtime=True)
                self._lemmatizer.load(os.path.join(model_folder_path, 'lemmatizer.bestACC'))

        # 4. Load taggers
        if tagging or lemmatization:  # we need tagging for lemmatization
            if not os.path.isfile(os.path.join(model_folder_path, 'tagger.bestUPOS')):
                sys.stdout.write('\tTagging is not available on this model. \n')
                if lemmatization:
                    sys.stdout.write('\t\tDisabling the lemmatization model due to missing tagger. \n')
                    self._lemmatizer = None
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tagger model ...\n')
                tagger_encodings = Encodings(verbose=False)
                tagger_encodings.load(os.path.join(model_folder_path, 'tagger.encodings'))
                config = TaggerConfig(os.path.join(model_folder_path, 'tagger.conf'))
                # one specialized tagger per output column: UPOS, XPOS and ATTRS
                self._tagger = [None, None, None]
                self._tagger[0] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
                self._tagger[0].load(os.path.join(model_folder_path, 'tagger.bestUPOS'))
                self._tagger[1] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
                self._tagger[1].load(os.path.join(model_folder_path, 'tagger.bestXPOS'))
                self._tagger[2] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True)
                self._tagger[2].load(os.path.join(model_folder_path, 'tagger.bestATTRS'))

        # 5. Load parser
        if parsing:
            if not os.path.isfile(os.path.join(model_folder_path, 'parser.bestUAS')):
                sys.stdout.write('\tParsing is not available on this model... \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading parser model ...\n')
                parser_encodings = Encodings(verbose=False)
                parser_encodings.load(os.path.join(model_folder_path, 'parser.encodings'))
                config = ParserConfig(os.path.join(model_folder_path, 'parser.conf'))
                self._parser = BDRNNParser(config, parser_encodings, embeddings, runtime=True)
                self._parser.load(os.path.join(model_folder_path, 'parser.bestUAS'))

        self._loaded = True
        if self._verbose:
            sys.stdout.write('Model loading complete.\n\n')

    def __call__(self, text):
        if not self._loaded:
            raise Exception(
                "Cube object is initialized but no model is loaded (e.g. call cube.load('en'))")

        sequences = []
        if self._tokenizer:
            # split text by lines
            input_lines = text.split("\n")
            for input_line in input_lines:
                sequences += self._tokenizer.tokenize(input_line)

        if self._compound_word_expander:
            sequences = self._compound_word_expander.expand_sequences(sequences)

        if self._parser:
            sequences = self._parser.parse_sequences(sequences)

        if self._tagger or self._lemmatizer:
            import copy
            new_sequences = []
            for sequence in sequences:
                new_sequence = copy.deepcopy(sequence)
                # run the three specialized taggers and keep only the column each one was trained for
                predicted_tags_UPOS = self._tagger[0].tag(new_sequence)
                predicted_tags_XPOS = self._tagger[1].tag(new_sequence)
                predicted_tags_ATTRS = self._tagger[2].tag(new_sequence)
                for entryIndex in range(len(sequence)):
                    new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                    new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                    new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
                new_sequences.append(new_sequence)
            sequences = new_sequences

        if self._lemmatizer:
            sequences = self._lemmatizer.lemmatize_sequences(sequences)

        return sequences
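

# Usage sketch (illustrative): annotate raw text end-to-end with the Cube API
# defined above. The "en" language code is only an example; upos, xpos and attrs
# are the fields filled in by __call__ itself, so printing them is safe, while
# printing other entry attributes would assume details of the underlying data structures.
def _example_cube_usage():
    """Minimal sketch only; not part of the public API."""
    cube = Cube(verbose=True)
    cube.load("en")  # finds a local model or downloads it on first use
    sentences = cube("This is a simple example. It should come back tokenized, tagged and parsed.")
    for sentence in sentences:
        for entry in sentence:
            print(entry.upos, entry.xpos, entry.attrs)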