def __init__(self, paths):
    """Set up the article map, corpus and directory tree for ``paths``.

    Args:
        paths: Iterable of article paths. Each path becomes a key of
            ``self.articles``; the same object is also passed unchanged
            to ``DirTree``.
    """
    # Keyed by path, preserving the order in which paths were supplied.
    article_map = OrderedDict()
    for article_path in paths:
        article_map[article_path] = Article(article_path)

    self.articles = article_map
    self.corpus = Corpus.generate(article_map)
    self.tree = DirTree(paths)
if __name__ == '__main__':
    # Script entry point: index the training corpus' characters, then encode
    # the test and dev corpora into input arrays and one-hot targets.
    from argparse import ArgumentParser

    import numpy as np
    from corpus import Corpus, Indexer
    from keras.utils.np_utils import to_categorical

    parser = ArgumentParser()
    parser.add_argument('-r', '--root', type=str, required=True)
    args = parser.parse_args()

    # NOTE: the split name is appended to `root` verbatim (plain string
    # concatenation), so `root` must already end with a path separator.
    root = args.root

    idxr = Indexer(reserved={0: 'padding', 1: 'OOV'})
    train = Corpus(root + 'train')
    test = Corpus(root + 'test')
    dev = Corpus(root + 'dev')

    idxr.encode_seq(train.chars())  # quick pass to fit vocab

    print("Encoding test set")
    # generate() yields (x, y) pairs; unzip them into parallel sequences.
    X_test, y_test = zip(*test.generate(idxr, oov_idx=1))
    # No trailing comma after asarray(): it would wrap the array in a 1-tuple.
    X_test = np.asarray(X_test)
    y_test = to_categorical(y_test, nb_classes=idxr.vocab_len())

    print("Encoding dev set")
    X_dev, y_dev = zip(*dev.generate(idxr, oov_idx=1))
    X_dev = np.asarray(X_dev)
    y_dev = to_categorical(y_dev, nb_classes=idxr.vocab_len())