Example #1
def preprocess(dataset_name, delete_data=False):
    full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name)
    train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name)
    dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(
        dataset_name)
    test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(
        dataset_name)

    keys2keys = {}
    keys2keys['e1'] = 'e1'  # entities
    keys2keys['rel'] = 'rel'  # relations
    keys2keys['e2'] = 'e1'  # entities
    keys2keys['e2_multi1'] = 'e1'  # entity
    keys2keys['e2_multi2'] = 'e1'  # entity
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(full_path)
    p = Pipeline(Config.dataset,
                 delete_data,
                 keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    p.execute(d)
    p.save_vocabs()

    # process train, dev and test sets and save them to hdf5
    p.skip_transformation = False
    for path, name in zip([train_path, dev_ranking_path, test_ranking_path],
                          ['train', 'dev_ranking', 'test_ranking']):
        d.set_path(path)
        p.clear_processors()
        p.add_sent_processor(ToLower())
        p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                             keys=['e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            ConvertTokenToIdx(keys2keys=keys2keys),
            keys=['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'])
        p.add_post_processor(
            StreamToHDF5(name,
                         samples_per_file=1500
                         if Config.dataset == 'YAGO3-10' else 1000,
                         keys=input_keys))
        p.execute(d)
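
This example (and the two that follow) relies on the spodernet preprocessing pipeline. Below is a minimal sketch of the imports and a driver call the code appears to assume; the module paths, the Config object, and the dataset name are assumptions, not part of the example itself:

# Assumed imports: the spodernet module paths are an assumption, not confirmed by the example.
from spodernet.preprocessing.pipeline import Pipeline, DatasetStreamer
from spodernet.preprocessing.processors import (
    JsonLoaderProcessors, DictKey2ListMapper, ToLower, CustomTokenizer,
    AddToVocab, ConvertTokenToIdx, StreamToHDF5)
from spodernet.utils.global_config import Config

# Hypothetical invocation: 'FB15k-237' stands in for any dataset folder under data/;
# Config.dataset is assumed to be kept in sync with the dataset name passed in.
Config.dataset = 'FB15k-237'
preprocess('FB15k-237', delete_data=True)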
Example #2
def preprocess(args, delete_data=False):

    keys2keys = {}
    keys2keys['e1'] = 'e1'  # entities
    keys2keys['rel'] = 'rel'  # relations
    keys2keys['rel_eval'] = 'rel'  # relations
    keys2keys['e2'] = 'e1'  # entities
    keys2keys['e2_multi1'] = 'e1'  # entity
    keys2keys['e2_multi2'] = 'e1'  # entity
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    print('create dataset streamer', flush=True)
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(args.train_path)
    print('create pipeline', flush=True)
    p = Pipeline(args.data,
                 delete_data,
                 keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    print('execute full vocabs', flush=True)
    p.execute(d)
    print('save full vocabs', flush=True)
    p.save_vocabs()

    # process the train set and save it to hdf5
    p.skip_transformation = False
    d.set_path(args.train_path)
    p.clear_processors()
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        ConvertTokenToIdx(keys2keys=keys2keys),
        keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        StreamToHDF5('train', samples_per_file=1000, keys=input_keys))
    print('execute and save train vocabs', flush=True)
    p.execute(d)
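
Both knowledge-graph examples stream JSON-lines files in which every record already carries the head entity, the relation, the tail entity, and space-separated multi-label target lists. A sketch of a single input record as the pipeline is assumed to consume it; all field values are invented for illustration:

import json

# Hypothetical record from e1rel_to_e2_train.json (all values invented for illustration).
# e2_multi1 / e2_multi2 are space-separated entity lists, which is why the pipeline
# applies CustomTokenizer(lambda x: x.split(' ')) only to those two keys.
line = ('{"e1": "barack_obama", "rel": "born_in", "rel_eval": "born_in_reverse", '
        '"e2": "honolulu", "e2_multi1": "honolulu", "e2_multi2": "barack_obama michelle_obama"}')
record = json.loads(line)
ordered = [record[k] for k in ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']]
# 'ordered' now mirrors what DictKey2ListMapper(input_keys) hands to the sentence processors.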
Example #3
def preprocess_SNLI(delete_data=False):
    # load data
    #names, file_paths = snli2json()
    #train_path, dev_path, test_path = file_paths
    tokenizer = nltk.tokenize.WordPunctTokenizer()

    zip_path = join(get_data_path(), 'snli_1.0.zip', 'snli_1.0')
    file_paths = [
        'snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'
    ]

    not_t = []
    t = ['input', 'support', 'target']
    # tokenize and convert to hdf5
    # 1. Set up the pipeline to save lengths and generate the vocabulary
    p = Pipeline('snli_example', delete_data)
    p.add_path(join(zip_path, file_paths[0]))
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_token_processor(AddToVocab())
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()
    p.state['vocab'].save_to_disk()

    # 2. Process the data further to stream it to hdf5
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(
        CreateBinsByNestedLength('snli_train', min_batch_size=128))
    state = p.execute()

    # dev and test data
    p2 = Pipeline('snli_example')
    p2.copy_vocab_from_pipeline(p)
    p2.add_path(join(zip_path, file_paths[1]))
    p2.add_line_processor(JsonLoaderProcessors())
    p2.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p2.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(SaveLengthsToState())
    p2.execute()

    p2.clear_processors()
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(ConvertTokenToIdx())
    p2.add_post_processor(StreamToHDF5('snli_dev'))
    p2.execute()

    p3 = Pipeline('snli_example')
    p3.copy_vocab_from_pipeline(p)
    p3.add_path(join(zip_path, file_paths[2]))
    p3.add_line_processor(JsonLoaderProcessors())
    p3.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p3.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(SaveLengthsToState())
    p3.execute()

    p3.clear_processors()
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(ConvertTokenToIdx())
    p3.add_post_processor(StreamToHDF5('snli_test'))
    p3.execute()
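
The dev and test pipelines in Example #3 repeat the training setup; the only SNLI-specific filtering step is dropping lines whose gold_label is '-', i.e. pairs without an agreed annotation. A plain-Python sketch that is equivalent in effect to the RemoveLineOnJsonValueCondition processor used above (the helper name is mine, for illustration):

import json

def keep_snli_line(raw_line):
    # Illustrative stand-in for RemoveLineOnJsonValueCondition('gold_label', lambda label: label == '-'):
    # keep a JSONL line only when the annotators agreed on a gold label.
    return json.loads(raw_line).get('gold_label') != '-'

# The second line would be filtered out before tokenization.
assert keep_snli_line('{"sentence1": "a man sleeps.", "sentence2": "a person rests.", "gold_label": "entailment"}')
assert not keep_snli_line('{"sentence1": "a man sleeps.", "sentence2": "a person rests.", "gold_label": "-"}')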