def eval(self, raw_text_file, gold_conllu_file):
    input_string = ""
    useSpaces = " "
    lines = []
    with fopen(raw_text_file, "r") as file:
        lines = file.readlines()

    # Analyze the use of spaces in the first part of the file to detect
    # scripts that are written without spaces between tokens.
    test = ""
    cnt = 0
    while True:
        test = test + lines[cnt]
        cnt += 1
        if cnt >= len(lines) or cnt > 5:
            break

    if float(test.count(' ')) / float(len(test)) < 0.02:
        useSpaces = ""

    # Join lines into blocks (separated by empty lines) and tokenize each block.
    i = -1
    input_string = ""
    sentences = []
    while i < len(lines) - 1:
        i += 1
        input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
        if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
            if input_string.strip() != "":
                sentences += self.tokenizer.tokenize(input_string)
            input_string = ""

    with fopen(self.tokenizer.config.base + "-temporary.conllu", 'w') as file:
        for sentence in sentences:
            for entry in sentence:
                line = str(entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + \
                       entry.xpos + "\t" + entry.attrs + "\t" + str(entry.head) + "\t" + entry.label + "\t" + \
                       entry.deps + "\t" + entry.space_after + "\n"
                file.write(line)
            file.write("\n")

    # run the CoNLL evaluation script on the temporary output
    metrics = conll_eval(self.tokenizer.config.base + "-temporary.conllu", gold_conllu_file)
    return metrics["Tokens"].f1 * 100., metrics["Sentences"].f1 * 100.

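# --- Hedged example (not part of the original module) ---
# eval() above writes one 10-column CoNLL-U row per token. A minimal sketch
# of that layout; "_ExampleEntry" is a stand-in for the tokenizer's entry
# objects, not the project's real class.
from collections import namedtuple

_ExampleEntry = namedtuple(
    "_ExampleEntry",
    "index word lemma upos xpos attrs head label deps space_after")

def _example_conllu_row(entry):
    # Columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
    return "\t".join([str(entry.index), entry.word, entry.lemma, entry.upos,
                      entry.xpos, entry.attrs, str(entry.head), entry.label,
                      entry.deps, entry.space_after])

# _example_conllu_row(_ExampleEntry(1, "Hello", "hello", "INTJ", "UH", "_", 0, "root", "_", "_"))
# -> "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_"
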
def parse_test(params):
    if params.test == "parser":
        print("Running " + params.test)
        print("==PARAMETERS==")
        print("EMBEDDINGS: " + params.embeddings)
        print("MODEL FILE: " + params.model_base)
        print("DECODER: " + params.decoder)
        print("OUTPUT: " + params.output_file)
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")

        testset = Dataset(params.test_file)
        encodings = Encodings()
        encodings.load(params.model_base + ".encodings")
        encodings.update_wordlist(testset)
        print("Updated word list: " + str(len(encodings.word_list)))
        config = ParserConfig(filename=params.config)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config, encodings, embeddings)
        parser.load(params.model_base + ".bestUAS")
        if params.decoder == 'mst':
            print("Using the MST decoder")
            from graph.decoders import MSTDecoder
            parser.decoder = MSTDecoder()

        f = fopen(params.output_file, "w")
        last_proc = 0
        index = 0
        for seq in testset.sequences:
            index += 1
            # integer percentage so the % 5 progress check also works under Python 3
            proc = int(index * 100 / len(testset.sequences))
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            # copy predictions back, skipping compound entries, which the
            # parser does not tag
            iSeq = 0
            iRez = 0
            while iSeq < len(seq):
                while seq[iSeq].is_compound_entry:
                    iSeq += 1
                seq[iSeq].xpos = rez[iRez].xpos
                seq[iSeq].upos = rez[iRez].upos
                seq[iSeq].attrs = rez[iRez].attrs
                seq[iSeq].head = rez[iRez].head
                seq[iSeq].label = rez[iRez].label
                seq[iSeq].lemma = rez[iRez].lemma
                iSeq += 1
                iRez += 1

            for entry in seq:
                f.write(str(entry.index) + "\t" + str(entry.word) + "\t" + str(entry.lemma) + "\t" +
                        str(entry.upos) + "\t" + str(entry.xpos) + "\t" + str(entry.attrs) + "\t" +
                        str(entry.head) + "\t" + str(entry.label) + "\t" + str(entry.deps) + "\t" +
                        str(entry.space_after) + "\n")
            f.write("\n")
        f.close()
        sys.stdout.write("\n")

def save(self, filename):
    f = fopen(filename, "w")
    f.write("LABELS " + str(len(self.label2int)) + "\n")
    for label in self.label2int:
        f.write(str(label) + "\t" + str(self.label2int[label]) + "\n")
    f.write("CHARACTERS " + str(len(self.char2int)) + "\n")
    for character in self.char2int:
        if sys.version_info[0] == 2:
            f.write(character.encode('utf-8') + "\t" + str(self.char2int[character]) + "\n")
        else:
            f.write(character + "\t" + str(self.char2int[character]) + "\n")
    f.write("WORDS " + str(len(self.word2int)) + "\n")
    for word in self.word2int:
        if sys.version_info[0] == 2:
            f.write(word.encode('utf-8') + "\t" + str(self.word2int[word]) + "\n")
        else:
            f.write(word + "\t" + str(self.word2int[word]) + "\n")
    f.write("UPOS " + str(len(self.upos2int)) + "\n")
    for label in self.upos2int:
        f.write(label + "\t" + str(self.upos2int[label]) + "\n")
    f.write("XPOS " + str(len(self.xpos2int)) + "\n")
    for label in self.xpos2int:
        f.write(label + "\t" + str(self.xpos2int[label]) + "\n")
    f.write("ATTRS " + str(len(self.attrs2int)) + "\n")
    for label in self.attrs2int:
        f.write(label + "\t" + str(self.attrs2int[label]) + "\n")
    f.close()

def write(self, filename):
    with fopen(filename, 'w') as file:
        for sequence in self.sequences:
            for entry in sequence:
                file.write(str(entry.index))
                file.write("\t")
                if isinstance(entry.word, str):
                    file.write(entry.word)
                else:
                    file.write(entry.word.encode('utf-8'))
                file.write("\t")
                if isinstance(entry.lemma, str):
                    file.write(entry.lemma)
                else:
                    file.write(entry.lemma.encode('utf-8'))
                file.write("\t")
                file.write(entry.upos)
                file.write("\t")
                file.write(entry.xpos)
                file.write("\t")
                file.write(entry.attrs)
                file.write("\t")
                file.write(str(entry.head))
                file.write("\t")
                file.write(entry.label)
                file.write("\t")
                file.write(entry.deps)
                file.write("\t")
                file.write(entry.space_after)
                file.write("\n")
            file.write("\n")

def version_to_download(self, lang_code, check_for_latest=True):
    """
    Returns the version of the language models that needs to be downloaded,
    or None if there is nothing to be done.
    """
    lang_models = os.path.join(self.disk_path, lang_code)
    lang_models_version = os.path.join(lang_models, 'VERSION')

    # Get the current local version (if any).
    current_version = None
    if os.path.exists(lang_models):
        with fopen(lang_models_version) as fd:
            current_version = fd.read().strip('\n')

    # Get the latest remote version.
    latest_versions = self.get_latest_model_versions()
    latest_version = latest_versions.get(lang_code)

    if check_for_latest:
        if not latest_version:
            if not current_version:
                raise ValueError(
                    'No remote version found for {}!'.format(lang_code))
            print('No remote version found for {}, using the local '
                  'version {}'.format(lang_code, current_version))
            return
        # note: versions are compared as plain strings
        if current_version and current_version >= latest_version:
            return
        return latest_version

    if not current_version:
        return latest_version

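# --- Hedged note (assumption, not in the original code) ---
# version_to_download() compares version strings lexicographically, so
# "1.9" >= "1.10" evaluates to True. A sketch of a numeric key a caller
# could compare with instead (assumes dotted-integer version strings;
# "_version_key" is a hypothetical helper):
def _version_key(version):
    # "1.10" -> (1, 10); tuple comparison then orders versions numerically
    return tuple(int(part) for part in version.split("."))

# _version_key("1.9") < _version_key("1.10")  # True, unlike "1.9" < "1.10"
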
def __init__(self, file=None):
    # default to an empty dataset so callers can assign sequences directly
    self.sequences = []
    if file is not None:
        sys.stdout.write("Reading " + file + "... ")
        sys.stdout.flush()
        with fopen(file, "r") as f:
            lines = f.readlines()
        self.sequences = self._make_sequences(lines)
        sys.stdout.write("found " + str(len(self.sequences)) + " sequences\n")

def _download_facebook_embeddings(self, lang_code):
    """
    Download the Facebook embeddings for the provided lang_code.
    """
    name = self.EMBEDDINGS_NAME.format(lang_code)
    embeddings_url = self.FACEBOOK_EMBEDDINGS_URL + name
    embeddings_path = os.path.join(self.disk_path, lang_code, name)

    request = requests.get(embeddings_url)
    with fopen(embeddings_path, 'wb') as fd:
        fd.write(request.content)

def read_from_file(self, word_embeddings_file, word_list, full_load=False):
    self.word2vec = {}
    self.num_embeddings = 0
    if word_list is None and not full_load:
        self.cache_only = True

    f = fopen(word_embeddings_file, "r")
    first_line = True
    while True:
        ofs = f.tell()
        line = f.readline()
        if line == '':
            break
        line = line.replace("\n", "").replace("\r", "")
        if first_line:
            # the first line holds the vocabulary size and vector dimension
            first_line = False
        else:
            self.num_embeddings += 1
            if self.verbose:
                if self.num_embeddings % 10000 == 0:
                    sys.stdout.write("  Scanned " + str(self.num_embeddings) +
                                     " word embeddings and added " +
                                     str(len(self.word2vec)) + "\n")
            parts = line.split(" ")
            if sys.version_info[0] == 2:
                word = parts[0].decode('utf-8')
            else:
                word = parts[0]
            if self.cache_only:
                # remember only the file offset; the vector is read on demand
                self.word2ofs[word] = ofs
            elif full_load or word in word_list:
                # len(parts) - 2 accounts for the leading word and the empty
                # string produced by the trailing space on each line
                embeddings = [float(0)] * (len(parts) - 2)
                for zz in range(len(parts) - 2):
                    embeddings[zz] = float(parts[zz + 1])
                self.word2vec[word] = embeddings
            self.word_embeddings_size = len(parts) - 2
    f.close()
    if self.cache_only:
        self.file_pointer = fopen(word_embeddings_file, "r")

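# --- Hedged example (toy helper, not part of the original module) ---
# read_from_file() expects a fastText-style text .vec file: a header line
# "num_words dim", then one word per line followed by space-separated floats
# and a trailing space (hence the len(parts) - 2 above). A tiny generator of
# such a file for testing:
def _write_toy_vec_file(path):
    with open(path, "w", encoding="utf-8") as f:
        f.write("2 3\n")                 # header: 2 words, 3 dimensions
        f.write("hello 0.1 0.2 0.3 \n")  # note the trailing space
        f.write("world 0.4 0.5 0.6 \n")
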
def save(self, filename):
    assert filename.endswith("metadata.json")
    obj = {}
    obj["language"] = self.language
    obj["language_code"] = self.language_code
    obj["model_version"] = self.model_version
    obj["embeddings_remote_link"] = self.embeddings_remote_link
    obj["embeddings_file_name"] = self.embeddings_file_name
    obj["token_delimiter"] = self.token_delimiter
    obj["model_build_date"] = self.model_build_date
    obj["model_build_source"] = self.model_build_source
    obj["notes"] = self.notes
    # use a context manager so the file handle is closed after dumping
    with fopen(filename, "w") as f:
        json.dump(obj, f, indent=4, sort_keys=True)

def read(self, filename):
    assert filename.endswith("metadata.json")
    with fopen(filename, "r") as f:
        data = json.load(f)
    if sys.version_info[0] == 2:
        items = data.iteritems()
    else:
        items = data.items()
    for key, value in items:
        if key == "model_version" and isinstance(value, str):
            # safety check to keep the version as a float
            self.__dict__[key] = float(value)
        else:
            self.__dict__[key] = value

def load_dict(self, path):
    # Load the lemma dictionary: tab-separated lines with the word in
    # column 0, the UPOS in column 1 and the lemma in column 4.
    with fopen(path, "r") as f:
        lines = f.readlines()
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 5:
                if sys.version_info[0] == 2:
                    word = unicode(parts[0], 'utf-8').lower().encode('utf-8')
                else:
                    word = parts[0].lower()
                upos = parts[1]
                key = word + '\t' + upos
                self.word2lemma[key] = parts[4]

def _download_with_progress_bar(self, url, local_filename):
    r = requests.get(url, stream=True)
    total_size = int(r.headers['Content-Length'].strip())
    current_size = 0
    f = fopen(local_filename, 'wb')
    # stream the body in 64 KB chunks so only one chunk is held in memory
    for buf in r.iter_content(4096 * 16):
        if buf:
            f.write(buf)
            current_size += len(buf)
            done = int(40 * current_size / total_size)  # 40-character bar
            sys.stdout.write(
                "\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." %
                ('=' * done, ' ' * (40 - done),
                 100 * current_size / total_size,
                 current_size / 1024 / 1024,
                 total_size / 1024 / 1024))
            sys.stdout.flush()
    f.close()

def eval(self, dataset, filename=None):
    total_bleu = 0.0
    last_proc = 0
    iSeq = 0
    if filename is not None:
        f = fopen(filename, "w", encoding="utf-8")
    for seq in dataset.sequences:
        proc = int((iSeq + 1) * 100 / len(dataset.sequences))
        if proc % 5 == 0 and proc != last_proc:
            last_proc = proc
            sys.stdout.write(" " + str(proc))
            sys.stdout.flush()
        iSeq += 1
        hyp = self.translator.translate(seq.src)
        ref = [entry.word for entry in seq.dst]
        hyp = list(hyp)
        ref = list(ref)
        if filename is not None:
            for entry in seq.src:
                f.write(entry.word + " ")
            f.write("\n")
            for entry in seq.dst:
                f.write(entry.word + " ")
            f.write("\n")
            # the file is opened in text mode with UTF-8 encoding, so write
            # the string directly rather than encoding to bytes
            for word in hyp:
                f.write(word + " ")
            f.write("\n\n")
        # BLEU with the default 4-gram weights is undefined for very short
        # sequences, so skip them
        if len(ref) >= 4 and len(hyp) >= 4:
            score = nltk.translate.bleu_score.sentence_bleu([ref], hyp)
            total_bleu += score
    if filename is not None:
        f.close()
    return total_bleu / len(dataset.sequences)

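# --- Minimal sketch of the NLTK call used in eval() above (toy data) ---
# sentence_bleu takes a list of reference token lists plus one hypothesis
# token list and returns a score in [0, 1].
def _example_sentence_bleu():
    from nltk.translate.bleu_score import sentence_bleu
    ref = ["the", "cat", "sat", "on", "the", "mat"]
    hyp = ["the", "cat", "sat", "on", "a", "mat"]
    return sentence_bleu([ref], hyp)
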
def save(self, filename):
    """Save configuration to file."""
    # sort the properties so that the output is deterministic
    sorted_dict = collections.OrderedDict(sorted(self.__dict__.items()))
    if sys.version_info[0] == 2:
        config = ConfigParser.ConfigParser()
    else:
        config = configparser.ConfigParser()
    config.add_section(self.__config__)  # write the section header
    if sys.version_info[0] == 2:
        items = sorted_dict.iteritems()
    else:
        items = sorted_dict.items()
    for k, v in items:
        if not k.startswith("_"):  # write only non-private properties
            if isinstance(v, float):
                # avoid confusion with an int on reload by appending ".0"
                str_v = str(v)
                if "e" not in str_v and "." not in str_v:
                    v = str_v + ".0"
            v = str(v)
            config.set(self.__config__, k, v)
    with fopen(filename, 'w') as cfgfile:
        config.write(cfgfile)

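# --- Hedged round-trip sketch (hypothetical helper, not in the module) ---
# ConfigParser stores every value as a string, so reading the file written
# by save() back requires explicit casts by the caller; the ".0" suffix
# keeps a float field recognizable as a float on reload.
def _example_read_config(path, section):
    import configparser
    parser = configparser.ConfigParser()
    parser.read(path)
    # values come back as strings, e.g. {"layers": "2", "dropout": "0.33"}
    return {k: v for k, v in parser.items(section)}
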
def load(self, filename):
    # Only char2int, labels, holistic words and label2int are read here.
    # word_list should be recomputed for every dataset (if deemed necessary).
    with fopen(filename, "r") as f:
        line = f.readline()
        num_labels = int(line.split(" ")[1])
        if self.verbose:
            print("Loading labels " + str(num_labels))
        self.labels = [""] * num_labels
        for _ in range(num_labels):
            line = f.readline()
            parts = line.split("\t")
            key = parts[0]
            value = int(parts[1])
            self.label2int[key] = value
            self.labels[value] = key

        line = f.readline()
        num_characters = int(line.split(" ")[1])
        self.characters = [""] * num_characters
        if self.verbose:
            print("Loading characters " + str(num_characters))
        for _ in range(num_characters):
            line = f.readline()
            parts = line.split("\t")
            if sys.version_info[0] == 2:
                key = parts[0].decode('utf-8')
            else:
                key = parts[0]
            value = int(parts[1])
            self.char2int[key] = value
            self.characters[value] = key

        line = f.readline()
        num_words = int(line.split(" ")[1])
        if self.verbose:
            print("Loading words " + str(num_words))
        for _ in range(num_words):
            line = f.readline()
            parts = line.split("\t")
            if sys.version_info[0] == 2:
                key = parts[0].decode('utf-8')
            else:
                key = parts[0]
            value = int(parts[1])
            self.word2int[key] = value

        # morphological attributes
        line = f.readline()
        num_labels = int(line.split(" ")[1])
        if self.verbose:
            print("Loading upos " + str(num_labels))
        self.upos_list = [""] * num_labels
        for _ in range(num_labels):
            line = f.readline()
            parts = line.split("\t")
            key = parts[0]
            value = int(parts[1])
            self.upos2int[key] = value
            self.upos_list[value] = key

        line = f.readline()
        num_labels = int(line.split(" ")[1])
        self.xpos_list = [""] * num_labels
        if self.verbose:
            print("Loading xpos " + str(num_labels))
        for _ in range(num_labels):
            line = f.readline()
            parts = line.split("\t")
            key = parts[0]
            value = int(parts[1])
            self.xpos2int[key] = value
            self.xpos_list[value] = key

        line = f.readline()
        num_labels = int(line.split(" ")[1])
        self.attrs_list = [""] * num_labels
        if self.verbose:
            print("Loading attrs " + str(num_labels))
        for _ in range(num_labels):
            line = f.readline()
            parts = line.split("\t")
            key = parts[0]
            value = int(parts[1])
            self.attrs2int[key] = value
            self.attrs_list[value] = key

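# For reference, the on-disk layout produced by the encodings save() earlier
# in this section and consumed by load() above:
#
#   LABELS <n>       then n lines of "<label>\t<id>"
#   CHARACTERS <n>   then n lines of "<char>\t<id>"
#   WORDS <n>        then n lines of "<word>\t<id>"
#   UPOS <n>         then n lines of "<tag>\t<id>"
#   XPOS <n>         then n lines of "<tag>\t<id>"
#   ATTRS <n>        then n lines of "<tag>\t<id>"
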
def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # load the elements shared by all components
    sys.stdout.write("\nLoading embeddings: " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        if not os.path.isfile(os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write("\n\tTokenizer model not found! (" +
                             os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(os.path.join(params.models, "tokenizer.encodings"))
    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if lemmatize:
        if not os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")) and \
                not os.path.isfile(os.path.join(params.models, "lemmatizer.bestAcc")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if tag:
        if not os.path.isfile(os.path.join(params.models, "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))
    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with fopen(params.input_file, 'r') as file:
            lines = file.readlines()

        # analyze the use of spaces in the first part of the file
        # (increment before the bounds check, as in the eval() routine above)
        test = ""
        useSpaces = " "
        cnt = 0
        while True:
            test = test + lines[cnt]
            cnt += 1
            if cnt >= len(lines) or cnt > 5:
                break
        if float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""

        i = -1
        input_string = ""
        sequences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""
        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config, encodings, embeddings, runtime=True)
        compoundwordexpander_object.load(os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config, encodings, embeddings, runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_ATTRS.load(os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            # each tagger predicts (upos, xpos, attrs); keep only the field
            # the specialized model was selected for
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = copy.deepcopy(new_sequences)
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config, encodings, embeddings, runtime=True)
        # accept either capitalization of the model file name
        if os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")):
            lemmatizer_object.load(os.path.join(params.models, "lemmatizer.bestACC"))
        else:
            lemmatizer_object.load(os.path.join(params.models, "lemmatizer.bestAcc"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)