from typing import List

# NOTE: this assumes the `mosestokenizer` package, whose MosesTokenizer instances
# are callable and hold a subprocess that must be close()d; BaseTokenizer is
# expected to be defined elsewhere in the project.
from mosestokenizer import MosesTokenizer


class WordTokenizer(BaseTokenizer):
    def __init__(self):
        self.tokenizer = MosesTokenizer()

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer(text.strip())

    def detokenize(self, tokens: List[str]) -> str:
        text = " ".join(tokens).strip()
        return text

    def close(self):
        self.tokenizer.close()
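# Usage sketch (illustrative only; the sample sentence and outputs below are
# assumptions, not taken from the project's tests):
#
#     tokenizer = WordTokenizer()
#     tokens = tokenizer.tokenize("Hello, world!")   # e.g. ['Hello', ',', 'world', '!']
#     tokenizer.detokenize(tokens)                   # naive space-join: 'Hello , world !'
#     tokenizer.close()                              # stop the underlying Moses subprocess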
def process_corpus(embeddings_dictionary, corpus, vectors, language):
    """
    Cleans the corpus using the dictionary of embeddings.
    Any word without an associated embedding in the dictionary is ignored.
    Appends '__<language>' to every kept word so that source-language and
    target-language tokens can be told apart later.
    """
    clean_corpus, clean_vectors, keys = [], {}, []
    words_we_want = set(embeddings_dictionary)
    tokenize = MosesTokenizer(language)
    for key, doc in enumerate(corpus):
        clean_doc = []
        words = tokenize(doc)
        for word in words:
            if word in words_we_want:
                clean_doc.append(word + "__%s" % language)
                # np.float was removed in NumPy 1.24; plain float keeps the same behaviour
                clean_vectors[word + "__%s" % language] = np.array(
                    vectors[word].split()).astype(float)
        if len(clean_doc) > 3 and len(clean_doc) < 25:
            keys.append(key)
            clean_corpus.append(" ".join(clean_doc))
    tokenize.close()
    return np.array(clean_corpus), clean_vectors, keys
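# Illustrative call (all values are hypothetical; the real embeddings_dictionary,
# corpus and word vectors come from whatever the surrounding script loads, and
# `import numpy as np` is assumed to be in scope):
#
#     corpus = ["the cat sat on the mat", ...]
#     vectors = {"cat": "0.12 0.34 ...", ...}   # word -> space-separated floats
#     docs, vecs, keys = process_corpus(vectors.keys(), corpus, vectors, "en")
#     # docs[i] looks like "cat__en sat__en mat__en"; keys holds the indices of the
#     # documents that survived (only those with 4-24 kept words are retained).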
]
special_tokens.extend(args["special_symbols"].split(","))

# slice with vocab size
vocab = counter.most_common(args["vocab_size"] - len(special_tokens))

# print out-of-vocabulary
total_freq = sum(counter.values())
oov_freq = total_freq - sum([v[1] for v in vocab])
print(f"oov: {oov_freq}/{total_freq} ({oov_freq * 100.0 / total_freq:.2f}%)")

# save word vocab
output_vocab_path = os.path.join(output_dir, "tok.vocab")
with open(output_vocab_path, "w", encoding="utf-8") as f:
    for token in special_tokens:
        f.write(f"{token}\t-1\n")
    for token, freq in vocab:
        f.write(f"{token}\t{freq}\n")

# save fairseq vocab
with open(os.path.join(output_dir, "fairseq.vocab"), "w") as fout:
    with open(os.path.join(output_dir, "tok.vocab"), "r") as fin:
        # skip pad, unk, bos, eos + special_symbols
        start_idx = 4 + len(args["special_symbols"].split(","))
        for line in fin.readlines()[start_idx:]:
            splitted = line.split("\t")
            fout.write(f"{' '.join(splitted)}")

# close the tokenizer opened in the (elided) earlier part of the script
tokenize.close()
print("done.")
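# Resulting file layouts (illustrative; the exact entries depend on `special_symbols`
# and on the token counter built earlier in the script):
#
#   tok.vocab      one "token<TAB>frequency" line per entry; the special tokens come
#                  first with a placeholder frequency of -1.
#   fairseq.vocab  the same entries minus the first 4 + len(special_symbols) lines
#                  (pad/unk/bos/eos plus the extra symbols), rewritten as
#                  space-separated "token frequency" lines, matching fairseq's
#                  plain-text dictionary format.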