def preprocess(dataset_name, delete_data=False):
    """Build vocabularies and HDF5 shards for a knowledge-graph dataset.

    First streams the full JSON split to populate and save the token/entity
    vocabularies, then converts the train/dev/test ranking splits to index
    tensors and writes them to HDF5.

    Args:
        dataset_name: Name of the dataset directory under ``data/``.
        delete_data: If True, the Pipeline discards any previously
            processed data before running.

    Side effects:
        Reads ``data/<dataset_name>/*.json`` and writes vocab + HDF5 files
        via the Pipeline's processors.
    """
    full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name)
    train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name)
    dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(
        dataset_name)
    test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(
        dataset_name)

    # Map each input key onto the vocabulary it shares: all entity-valued
    # keys reuse the 'e1' (entity) vocab, relations use the 'rel' vocab.
    keys2keys = {
        'e1': 'e1',           # entities
        'rel': 'rel',         # relations
        'e2': 'e1',           # entities
        'e2_multi1': 'e1',    # entities
        'e2_multi2': 'e1',    # entities
    }
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']

    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # Pass 1: process the full split to build the vocabulary, then persist it.
    # NOTE(review): original code passed the global Config.dataset here (and
    # below for the YAGO3-10 check) while building paths from dataset_name;
    # using the parameter consistently avoids writing under the wrong dataset
    # when they differ — confirm Config.dataset always equals dataset_name.
    d.set_path(full_path)
    p = Pipeline(dataset_name, delete_data, keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    p.execute(d)
    p.save_vocabs()

    # Pass 2: convert each split to indices and stream them into HDF5 shards.
    p.skip_transformation = False
    for path, name in zip([train_path, dev_ranking_path, test_ranking_path],
                          ['train', 'dev_ranking', 'test_ranking']):
        d.set_path(path)
        p.clear_processors()
        p.add_sent_processor(ToLower())
        p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                             keys=['e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            ConvertTokenToIdx(keys2keys=keys2keys),
            keys=['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'])
        # YAGO3-10 is large, so use bigger shards to limit file count.
        p.add_post_processor(
            StreamToHDF5(name,
                         samples_per_file=1500 if dataset_name == 'YAGO3-10'
                         else 1000,
                         keys=input_keys))
        p.execute(d)
def preprocess(args, delete_data=False):
    """Preprocess a training split into vocabularies and HDF5 index files.

    Streams the JSON training data twice: first to accumulate and save the
    entity/relation vocabularies, then (with transformation enabled) to
    convert tokens to indices and write 'train' HDF5 shards.

    Args:
        args: Namespace providing ``train_path`` (JSON input file) and
            ``data`` (dataset identifier passed to the Pipeline).
        delete_data: If True, previously processed data is discarded.
    """
    # Every entity-valued field shares the 'e1' vocabulary; relation-valued
    # fields share the 'rel' vocabulary.
    keys2keys = {
        'e1': 'e1',           # entities
        'rel': 'rel',         # relations
        'rel_eval': 'rel',    # relations
        'e2': 'e1',           # entities
        'e2_multi1': 'e1',    # entities
        'e2_multi2': 'e1',    # entities
    }
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']

    print('create dataset streamer', flush=True)
    streamer = DatasetStreamer(input_keys)
    streamer.add_stream_processor(JsonLoaderProcessors())
    streamer.add_stream_processor(DictKey2ListMapper(input_keys))

    # Pass 1: scan the training file to build the full vocabulary.
    streamer.set_path(args.train_path)
    print('create pipeline', flush=True)
    pipeline = Pipeline(args.data, delete_data, keys=input_keys,
                        skip_transformation=True)
    pipeline.add_sent_processor(ToLower())
    pipeline.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                                keys=['e2_multi1', 'e2_multi2'])
    pipeline.add_token_processor(AddToVocab())
    print('execute full vocabs', flush=True)
    pipeline.execute(streamer)
    print('save full vocabs', flush=True)
    pipeline.save_vocabs()

    # Pass 2: re-run with transformation on, mapping tokens to indices and
    # streaming the result into HDF5 shards.
    pipeline.skip_transformation = False
    streamer.set_path(args.train_path)
    pipeline.clear_processors()
    pipeline.add_sent_processor(ToLower())
    pipeline.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                                keys=['e2_multi1', 'e2_multi2'])
    pipeline.add_post_processor(
        ConvertTokenToIdx(keys2keys=keys2keys),
        keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
    pipeline.add_post_processor(
        StreamToHDF5('train', samples_per_file=1000, keys=input_keys))
    print('execute and save train vocabs', flush=True)
    pipeline.execute(streamer)