Example #1
# Imports for the pipeline helpers used below; module paths follow the
# spodernet preprocessing library these examples are built on.
from spodernet.preprocessing.pipeline import Pipeline, DatasetStreamer
from spodernet.preprocessing.processors import (
    JsonLoaderProcessors, DictKey2ListMapper, ToLower, CustomTokenizer,
    AddToVocab, ConvertTokenToIdx, StreamToHDF5)
from spodernet.utils.global_config import Config


def preprocess(dataset_name, delete_data=False):
    full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name)
    train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name)
    dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(
        dataset_name)
    test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(
        dataset_name)

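    # keys2keys maps each input field to the field whose vocabulary it shares:
    # 'e2' and both 'e2_multi*' fields are looked up in the entity vocab built
    # for 'e1'.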
    keys2keys = {
        'e1': 'e1',         # entities
        'rel': 'rel',       # relations
        'e2': 'e1',         # entities
        'e2_multi1': 'e1',  # entities
        'e2_multi2': 'e1',  # entities
    }
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(full_path)
    p = Pipeline(Config.dataset,
                 delete_data,
                 keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    p.execute(d)
    p.save_vocabs()

    # process train, dev and test sets and save them to hdf5
    p.skip_transformation = False
    for path, name in zip([train_path, dev_ranking_path, test_ranking_path],
                          ['train', 'dev_ranking', 'test_ranking']):
        d.set_path(path)
        p.clear_processors()
        p.add_sent_processor(ToLower())
        p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                             keys=['e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            ConvertTokenToIdx(keys2keys=keys2keys),
            keys=['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            StreamToHDF5(
                name,
                samples_per_file=1500 if Config.dataset == 'YAGO3-10' else 1000,
                keys=input_keys))
        p.execute(d)
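
A minimal call sketch for this variant (the dataset name is a hypothetical example, not from the source; it assumes the data/{dataset}/e1rel_to_e2_*.json files already exist on disk and that Config.dataset has been set to the same name, since the function reads it when sizing the HDF5 shards):

Config.dataset = 'FB15k-237'  # hypothetical dataset name (assumption)
preprocess(Config.dataset, delete_data=True)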
Example #2
# Imports for the pipeline helpers used below; module paths follow the
# spodernet preprocessing library these examples are built on.
from spodernet.preprocessing.pipeline import Pipeline, DatasetStreamer
from spodernet.preprocessing.processors import (
    JsonLoaderProcessors, DictKey2ListMapper, ToLower, CustomTokenizer,
    AddToVocab, ConvertTokenToIdx, StreamToHDF5)


def preprocess(args, delete_data=False):

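    # keys2keys maps each input field to the field whose vocabulary it shares:
    # 'rel_eval' reuses the relation vocab of 'rel'; 'e2' and the 'e2_multi*'
    # fields reuse the entity vocab built for 'e1'.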
    keys2keys = {
        'e1': 'e1',          # entities
        'rel': 'rel',        # relations
        'rel_eval': 'rel',   # relations
        'e2': 'e1',          # entities
        'e2_multi1': 'e1',   # entities
        'e2_multi2': 'e1',   # entities
    }
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    print('create dataset streamer', flush=True)
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(args.train_path)
    print('create pipeline', flush=True)
    p = Pipeline(args.data,
                 delete_data,
                 keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    print('execute full vocabs', flush=True)
    p.execute(d)
    print('save full vocabs', flush=True)
    p.save_vocabs()

    # process the train set and save it to hdf5
    p.skip_transformation = False
    d.set_path(args.train_path)
    p.clear_processors()
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        ConvertTokenToIdx(keys2keys=keys2keys),
        keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        StreamToHDF5('train', samples_per_file=1000, keys=input_keys))
    print('execute pipeline and save train data', flush=True)
    p.execute(d)
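
In this variant, args only needs .data and .train_path attributes. A minimal sketch of a compatible entry point (the argparse flag names and defaults are assumptions for illustration, not part of the source):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', default='FB15k-237',
                        help='dataset / pipeline name (hypothetical default)')
    parser.add_argument('--train_path',
                        default='data/FB15k-237/e1rel_to_e2_train.json')
    args = parser.parse_args()
    preprocess(args, delete_data=True)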