import pyonmttok


def test_is_placeholder():
    assert not pyonmttok.is_placeholder("hello")
    assert pyonmttok.is_placeholder("⦅hello⦆")
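
# Note: finalize() below is a method excerpt from its enclosing class; it
# assumes the module imports "os" and "pyonmttok", a project "tokenizer"
# module providing vocabulary_iterator(), and a module-level "logger".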
def finalize(self):
    config = self._config
    if not self._source_counters and not self._target_counters:
        return

    tok_config = config["preprocess"][self._tok_step]

    if self._source_counters is self._target_counters:
        vocabularies = [("multi", self._source_counters)]
    else:
        vocabularies = []
        if self._source_counters:
            vocabularies.append(("source", self._source_counters))
        if self._target_counters:
            vocabularies.append(("target", self._target_counters))

    for side, counters in vocabularies:
        vocabulary = counters["tokens"]
        total_size = counters["total"]
        build_vocab_config = tok_config[side]["build_vocabulary"]
        name = build_vocab_config.get("name", "vocab" + str(self._tok_step))
        logger.info("Generating %s vocabulary '%s'", side, name)

        # The size option is mandatory and was already checked.
        size = build_vocab_config["size"]
        min_frequency = build_vocab_config.get("min-frequency", 0)
        added_size = 0

        # Merge a previously created vocabulary.
        vocab_to_merge = build_vocab_config.get("merge")
        if vocab_to_merge and os.path.isfile(vocab_to_merge):
            for w in tokenizer.vocabulary_iterator(vocab_to_merge):
                if w:
                    # Set the heaviest frequency on tokens from the vocabulary
                    # to merge so that pruning never drops them.
                    vocabulary[w] = float("inf")
                    added_size += 1

        # Add extra tokens from a list.
        vocab_to_add = build_vocab_config.get("add", [])
        for w in vocab_to_add:
            vocabulary[w] = float("inf")
            added_size += 1

        if added_size > size:
            raise RuntimeError(
                "The size of extra tokens from 'merge' and 'add' (%d) cannot "
                "be bigger than the required vocabulary size (%d)"
                % (added_size, size))

        # Add tokens introduced by operators, such as extra numbered
        # placeholders that might not all be present in the sampled data.
        new_tokens = self._tokens_to_add.new_tokens
        if side == "multi":
            tokens_to_add = set().union(*new_tokens.values())
        else:
            tokens_to_add = new_tokens[side]
        for ph in tokens_to_add:
            vocabulary[ph] = float("inf")

        # First add placeholders to the vocabulary.
        sorted_vocabulary = [
            item for item in vocabulary.items()
            if pyonmttok.is_placeholder(item[0])
        ]

        # Then add everything else in frequency order.
        sorted_vocabulary.extend(
            sorted(
                [
                    item for item in vocabulary.items()
                    if not pyonmttok.is_placeholder(item[0])
                ],
                key=lambda k_v: k_v[1],
                reverse=True,
            ))

        # Find out the real vocabulary size after pruning.
        real_size = self._prune(sorted_vocabulary, size, min_frequency)

        # Write to file and record the path in the tokenization configuration.
        if side == "multi":
            out_file = os.path.join(
                self._result_dir,
                "joint_vocab_%s-%d.%s_%s"
                % (name, real_size, config["source"], config["target"]))
            tok_config["source"]["vocabulary_path"] = out_file
            tok_config["target"]["vocabulary_path"] = out_file
        else:
            out_file = os.path.join(
                self._result_dir,
                "vocab_%s-%d.%s" % (name, real_size, config[side]))
            tok_config[side]["vocabulary_path"] = out_file

        with open(out_file, "w") as vocab_file:
            # Add a header with the configuration.
            vocab_file.write("# Generated by buildvocab\n")
            vocab_file.write("# CONFIG: {}\n".format(self._config))
            for i in range(real_size):
                w, f = sorted_vocabulary[i]
                # Write each token with its relative frequency.
                vocab_file.write("%s %s\n" % (w, f / float(total_size)))
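
# A minimal sketch (hypothetical values, not part of the original module) of
# the "build_vocabulary" block that finalize() reads for each side. Only
# "size" is mandatory; tokens from "merge" and "add" are pinned with an
# infinite frequency so that pruning never drops them.
_EXAMPLE_BUILD_VOCABULARY_CONFIG = {
    "name": "myvocab",          # output name; defaults to "vocab<tok_step>"
    "size": 32000,              # mandatory: maximum vocabulary size
    "min-frequency": 2,         # drop tokens seen fewer than 2 times
    "merge": "previous.vocab",  # path to an existing vocabulary file to merge
    "add": ["⦅ph_extra⦆"],      # extra tokens forced into the vocabulary
}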