示例#1
0
        # A single English() instance is created once and shared by both
        # parse_file() calls below.
        nlp = English()
        # json.dump() emits str, not bytes, so the target files must be opened
        # in text mode; binary mode ('wb') raises TypeError on Python 3.
        with open('train.json', 'w') as f:
            json.dump(parse_file(train_file, nlp), f, indent=2)
        with open('test.json', 'w') as f:
            json.dump(parse_file(test_file, nlp), f, indent=2)

    logging.info('starting numericalization')
    word_vocab = SennaVocab()
    rel_vocab = Vocab()

    # Read the parsed splits back from the JSON files in the working directory.
    with open('train.json') as f:
        train = json.load(f)
    with open('test.json') as f:
        test = json.load(f)

    # First pass: training split only, add=True. The return value is discarded,
    # so this call is made purely for its effect on the two vocabularies
    # (presumably it registers unseen words/relations -- confirm in numericalize).
    numericalize(train, word_vocab, rel_vocab, add=True)
    # Prune and reorder the vocabularies before the real conversion. Each call
    # returns a vocabulary object that replaces the previous binding.
    word_vocab = word_vocab.prune_rares(cutoff=2)
    word_vocab = word_vocab.sort_by_decreasing_count()
    rel_vocab = rel_vocab.sort_by_decreasing_count()
    # Second pass: convert both splits with add=False and keep the results.
    train = numericalize(train, word_vocab, rel_vocab, add=False)
    test = numericalize(test, word_vocab, rel_vocab, add=False)

    # Pickle is a binary format, so 'wb' is the correct mode here.
    with open('vocab.pkl', 'wb') as f:
        pkl.dump({'word': word_vocab, 'rel': rel_vocab}, f)

    # json.dump() emits str, not bytes, so these files must be opened in text
    # mode; binary mode ('wb') raises TypeError on Python 3.
    with open('trainXY.json', 'w') as f:
        json.dump(train, f, indent=2)

    with open('testXY.json', 'w') as f:
        json.dump(test, f, indent=2)