Exemplo n.º 1
0
    def encode_file(self,
                    path,
                    ordered=False,
                    verbose=False,
                    add_eos=False,
                    add_double_eos=False):
        """Tokenize each non-blank line of a text file and convert it to an array.

        Args:
            path: Location of the text file to read; it must exist.
            ordered: When true, the per-line arrays are joined with
                ``np.concatenate`` into one array. Otherwise the list of
                per-line arrays is returned as is.
            verbose: When true, print the file name up front and a progress
                message every 500000 input lines.
            add_eos: Forwarded unchanged to ``self.tokenize``.
            add_double_eos: Forwarded unchanged to ``self.tokenize``.

        Returns:
            A list holding one ``self.convert_to_nparray`` result per
            non-blank line, or the concatenation of those results when
            ``ordered`` is true.

        Raises:
            AssertionError: If ``path`` does not exist (while assertions are
                enabled).
        """
        if verbose:
            print('encoding file {} ...'.format(path))
        assert exists(path)

        per_line = []
        with open(path, 'r') as src:
            for line_no, text in enumerate(src):
                if verbose and line_no > 0 and line_no % 500000 == 0:
                    print('  line {}'.format(line_no))
                # Whitespace-only lines contribute nothing to the output.
                if not text.strip():
                    continue
                tokens = self.tokenize(
                    text, add_eos=add_eos, add_double_eos=add_double_eos)
                per_line.append(self.convert_to_nparray(tokens))

        return np.concatenate(per_line) if ordered else per_line
Exemplo n.º 2
0
    def save_vocab(self, path, vocab_name):
        """Write the symbols of ``self.idx2sym`` to a text file, one per line.

        The file is named ``vocabulary_<vocab_name>.txt`` and is created
        inside ``path``. A confirmation message with the absolute form of
        ``path`` is printed afterwards.

        Args:
            path: Directory in which the vocabulary file is written.
            vocab_name: Label inserted into the file name and the message.
        """
        target = os.path.join(path, "vocabulary_%s.txt" % vocab_name)
        with open(target, "w") as out:
            # Symbols are emitted in the iteration order of self.idx2sym.
            out.writelines(symbol + "\n" for symbol in self.idx2sym)

        saved_under = os.path.abspath(path)
        print("  Vocabulary of {} is saved under path: {}".format(
            vocab_name, saved_under))
Exemplo n.º 3
0
  def _build_from_file(self, vocab_file):
    self.idx2sym = []
    self.sym2idx = OrderedDict()

    with open(vocab_file, 'r') as f:
      for line in f:
        symb = line.strip().split()[0]
        self.add_symbol(symb)
    self.unk_idx = self.sym2idx['<UNK>']
Exemplo n.º 4
0
    def count_file(self, path, verbose=False, add_eos=False):
        """Tokenize every line of a text file and tally the symbols.

        Each line is passed to ``self.tokenize`` and the resulting symbols
        are added to ``self.counter`` via its ``update`` method.

        Args:
            path: Location of the text file to read; it must exist.
            verbose: When true, print the file name up front and a progress
                message every 500000 input lines.
            add_eos: Forwarded unchanged to ``self.tokenize``.

        Raises:
            AssertionError: If ``path`` does not exist (while assertions are
                enabled).
        """
        if verbose:
            print('counting file {} ...'.format(path))
        assert exists(path)

        with open(path, 'r') as src:
            for line_no, text in enumerate(src):
                report_progress = (
                    verbose and line_no > 0 and line_no % 500000 == 0)
                if report_progress:
                    print('  line {}'.format(line_no))
                self.counter.update(self.tokenize(text, add_eos=add_eos))
Exemplo n.º 5
0
    def _build_from_file(self, vocab_file):
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, 'r') as f:
            for line in f:
                symb = line.replace("\n", "")
                self.add_symbol(symb)

        self.unk_idx = self.sym2idx['<UNK>']