Example #1
import io
from collections import Counter
from torchtext.vocab import Vocab  # legacy torchtext API (< 0.9, or torchtext.legacy)


def build_vocab(filepath, tokenizer):
  # Count token frequencies over a plain-text file, one example per line.
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      counter.update(tokenizer(string_))

  # Keep at most 10,000 tokens seen at least twice, plus the four specials.
  vocab = Vocab(counter, specials=['<unk>', '<pad>', '<sos>', '<eos>'],
                max_size=10000, min_freq=2)
  # Attach the special-token names as plain attributes for downstream code.
  vocab.init_token = '<sos>'
  vocab.eos_token = '<eos>'
  return vocab
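
A minimal usage sketch; the file name train.txt and the whitespace tokenizer are assumptions, not part of the example above. The legacy torchtext Vocab exposes stoi/itos lookups:

tokenizer = str.split                # assumption: any callable str -> list[str] works
vocab = build_vocab("train.txt", tokenizer)  # "train.txt" is a hypothetical path
print(len(vocab))                    # vocabulary size, specials included
print(vocab.stoi['<unk>'])           # index of a special token via the stoi mapping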
Example #2
import io
from collections import Counter
from torchtext.vocab import Vocab  # legacy torchtext API (< 0.9, or torchtext.legacy)


def build_vocab_tsv(filepath, tokenizer, column=0):
  # Count token frequencies over one column of a tab-separated file.
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      # Select the requested column and lowercase it before tokenizing.
      string_ = string_.rstrip()
      string_ = string_.split("\t")[column]
      counter.update(tokenizer(string_.lower()))

  # Larger budget here: up to 40,000 tokens with frequency >= 2.
  vocab = Vocab(counter, specials=['<unk>', '<pad>', '<sos>', '<eos>'],
                max_size=40000, min_freq=2)
  vocab.init_token = '<sos>'
  vocab.eos_token = '<eos>'
  vocab.pad_token = '<pad>'
  return vocab
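
A similar hedged sketch for the TSV variant, assuming a hypothetical data.tsv whose second column holds the text. Note the lookup lowercases input to match how the vocabulary was built; with '<unk>' among the specials, the legacy Vocab's stoi maps out-of-vocabulary tokens to the <unk> index:

tokenizer = str.split                          # assumption: whitespace tokenization
vocab = build_vocab_tsv("data.tsv", tokenizer, column=1)  # "data.tsv" is hypothetical
# Convert a tokenized, lowercased sentence to indices; unknown words map to <unk>.
indices = [vocab.stoi[tok] for tok in tokenizer("a sample sentence".lower())]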