def preprocess(dataset_name, delete_data=False):
    """Preprocess a knowledge graph dataset: build the vocabulary from the full
    split, then convert the train/dev/test ranking splits to HDF5."""
    full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name)
    train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name)
    dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name)
    test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name)

    # map every input key to the vocabulary it shares: e2 and the multi-label
    # targets reuse the entity vocabulary of e1
    keys2keys = {}
    keys2keys['e1'] = 'e1'          # entities
    keys2keys['rel'] = 'rel'        # relations
    keys2keys['e2'] = 'e1'          # entities
    keys2keys['e2_multi1'] = 'e1'   # entity
    keys2keys['e2_multi2'] = 'e1'   # entity
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(full_path)
    p = Pipeline(Config.dataset, delete_data, keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    p.execute(d)
    p.save_vocabs()

    # process train, dev and test sets and save them to hdf5
    p.skip_transformation = False
    for path, name in zip([train_path, dev_ranking_path, test_ranking_path],
                          ['train', 'dev_ranking', 'test_ranking']):
        d.set_path(path)
        p.clear_processors()
        p.add_sent_processor(ToLower())
        p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                             keys=['e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            ConvertTokenToIdx(keys2keys=keys2keys),
            keys=['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'])
        p.add_post_processor(StreamToHDF5(
            name,
            samples_per_file=1500 if Config.dataset == 'YAGO3-10' else 1000,
            keys=input_keys))
        p.execute(d)
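
# Example invocation (a sketch, not part of the original file): the function
# assumes the spodernet preprocessing classes (DatasetStreamer, Pipeline and the
# processors used above) plus the global Config object are imported at module
# level, and that Config.dataset has been set to the same dataset name elsewhere
# (e.g. during command-line parsing) before preprocess() is called.
#
#     Config.dataset = 'YAGO3-10'
#     preprocess('YAGO3-10', delete_data=True)
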
def preprocess(args, delete_data=False):
    """Preprocess a knowledge graph dataset from parsed command-line args:
    build the vocabulary from the training split, then stream it to HDF5."""
    # map every input key to the vocabulary it shares: e2 and the multi-label
    # targets reuse the entity vocabulary of e1
    keys2keys = {}
    keys2keys['e1'] = 'e1'          # entities
    keys2keys['rel'] = 'rel'        # relations
    keys2keys['rel_eval'] = 'rel'   # relations
    keys2keys['e2'] = 'e1'          # entities
    keys2keys['e2_multi1'] = 'e1'   # entity
    keys2keys['e2_multi2'] = 'e1'   # entity
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    print('create dataset streamer', flush=True)
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(args.train_path)
    print('create pipeline', flush=True)
    p = Pipeline(args.data, delete_data, keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    print('execute full vocabs', flush=True)
    p.execute(d)
    print('save full vocabs', flush=True)
    p.save_vocabs()

    # process the train set and save it to hdf5
    p.skip_transformation = False
    d.set_path(args.train_path)
    p.clear_processors()
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        ConvertTokenToIdx(keys2keys=keys2keys),
        keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        StreamToHDF5('train', samples_per_file=1000, keys=input_keys))
    print('execute and save train vocabs', flush=True)
    p.execute(d)
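
# Example invocation (a sketch): only the attributes that this preprocess()
# variant actually reads (args.data, args.train_path) are shown; the namespace
# would normally come from argparse, and the dataset name and path below are
# illustrative values following the 'data/<dataset>/e1rel_to_e2_*.json' layout
# used above.
#
#     from argparse import Namespace
#     args = Namespace(data='YAGO3-10',
#                      train_path='data/YAGO3-10/e1rel_to_e2_train.json')
#     preprocess(args, delete_data=True)
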
def preprocess_SNLI(delete_data=False):
    """Preprocess SNLI: tokenize, build the vocabulary on the training split,
    then convert the train/dev/test splits to HDF5."""
    # load data
    #names, file_paths = snli2json()
    #train_path, dev_path, test_path = file_paths
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    zip_path = join(get_data_path(), 'snli_1.0.zip', 'snli_1.0')
    file_paths = [
        'snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'
    ]

    not_t = []
    t = ['input', 'support', 'target']

    # tokenize and convert to hdf5
    # 1. Setup pipeline to save lengths and generate vocabulary
    p = Pipeline('snli_example', delete_data)
    p.add_path(join(zip_path, file_paths[0]))
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(RemoveLineOnJsonValueCondition(
        'gold_label', lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_token_processor(AddToVocab())
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()
    p.state['vocab'].save_to_disk()

    # 2. Process the data further to stream it to hdf5
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(
        CreateBinsByNestedLength('snli_train', min_batch_size=128))
    state = p.execute()

    # dev and test data
    p2 = Pipeline('snli_example')
    p2.copy_vocab_from_pipeline(p)
    p2.add_path(join(zip_path, file_paths[1]))
    p2.add_line_processor(JsonLoaderProcessors())
    p2.add_line_processor(RemoveLineOnJsonValueCondition(
        'gold_label', lambda label: label == '-'))
    p2.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(SaveLengthsToState())
    p2.execute()

    p2.clear_processors()
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(ConvertTokenToIdx())
    p2.add_post_processor(StreamToHDF5('snli_dev'))
    p2.execute()

    p3 = Pipeline('snli_example')
    p3.copy_vocab_from_pipeline(p)
    p3.add_path(join(zip_path, file_paths[2]))
    p3.add_line_processor(JsonLoaderProcessors())
    p3.add_line_processor(RemoveLineOnJsonValueCondition(
        'gold_label', lambda label: label == '-'))
    p3.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(SaveLengthsToState())
    p3.execute()

    p3.clear_processors()
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(ConvertTokenToIdx())
    p3.add_post_processor(StreamToHDF5('snli_test'))
    p3.execute()
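
# Example invocation (a sketch): assumes nltk, os.path.join, get_data_path and
# the spodernet pipeline classes are imported at module level, and that the
# SNLI 1.0 archive has been placed so that
# join(get_data_path(), 'snli_1.0.zip', 'snli_1.0') resolves to the directory
# containing the three .jsonl files listed above.
#
#     preprocess_SNLI(delete_data=True)
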