Example #1
    def prepare_glove(self, dimension):
        if self.index is not None: return
        if not os.path.exists(join(get_data_path(), 'glove', 'index_50.p')):
            dims = [50, 100, 200, 300]
            base_filename = 'glove.6B.{0}d.txt'
            paths = [
                join(get_data_path(), 'glove', base_filename.format(dim))
                for dim in dims
            ]
            for path, dim in zip(paths, dims):
                index = {'PATH': path}
                with open(path, 'rb') as f:
                    log.info('Building index for {0}', path)
                    while True:
                        prev_pos = f.tell()
                        line = f.readline().decode('utf-8')
                        if line == '': break
                        next_pos = f.tell()
                        data = line.strip().split(' ')
                        token = data[0]
                        index[token] = (prev_pos, next_pos)

                log.info('Saving glove index...')
                json.dump(
                    index,
                    open(
                        join(get_data_path(), 'glove',
                             'index_{0}.p'.format(dim)), 'w'))

        log.info('Loading glove index...')
        self.index = json.load(
            open(
                join(get_data_path(), 'glove',
                     'index_{0}.p'.format(dimension)), 'r'))
Example #2
    def download_glove(self):
        if not os.path.exists(join(get_data_path(), 'glove')):
            log.info('Glove data is missing, downloading data now...')
            os.mkdir(join(get_data_path(), 'glove'))
            bashmagic.wget("http://nlp.stanford.edu/data/glove.6B.zip",
                           join(get_data_path(), 'glove'))
            bashmagic.unzip(join(get_data_path(), 'glove', 'glove.6B.zip'),
                            join(get_data_path(), 'glove'))
Example #3
def test_save_load_data(dtype):
    folder = join(get_data_path(), 'test_hdf')
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.mkdir(folder)
    for i in range(5):
        filename = str(uuid.uuid4())
        data1 = dtype(np.random.randn(100, 100))
        save_data(join(folder, filename), data1)
        data2 = load_data(join(folder, filename))
        np.testing.assert_array_equal(data1, data2, 'Arrays must be equal')
    shutil.rmtree(folder)
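The test only relies on save_data and load_data round-tripping a NumPy array through disk; their actual implementation lives elsewhere in the library. A minimal HDF5-based sketch with the same contract, where the h5py backend and the dataset name 'default' are assumptions:

import h5py
import numpy as np

def save_data_sketch(path, data):
    # write the array under a fixed dataset name; not the library's actual code
    with h5py.File(path, 'w') as f:
        f.create_dataset('default', data=data)

def load_data_sketch(path):
    # read the full dataset back into memory
    with h5py.File(path, 'r') as f:
        return f['default'][:]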
Example #4
def preprocess_SNLI(delete_data=False):
    # load data
    #names, file_paths = snli2json()
    #train_path, dev_path, test_path = file_paths
    tokenizer = nltk.tokenize.WordPunctTokenizer()

    zip_path = join(get_data_path(), 'snli_1.0.zip', 'snli_1.0')
    file_paths = [
        'snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'
    ]

    not_t = []
    t = ['input', 'support', 'target']
    # tokenize and convert to hdf5
    # 1. Setup pipeline to save lengths and generate vocabulary
    p = Pipeline('snli_example', delete_data)
    p.add_path(join(zip_path, file_paths[0]))
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_token_processor(AddToVocab())
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()
    p.state['vocab'].save_to_disk()

    # 2. Process the data further to stream it to hdf5
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(
        CreateBinsByNestedLength('snli_train', min_batch_size=128))
    state = p.execute()

    # dev and test data
    p2 = Pipeline('snli_example')
    p2.copy_vocab_from_pipeline(p)
    p2.add_path(join(zip_path, file_paths[1]))
    p2.add_line_processor(JsonLoaderProcessors())
    p2.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p2.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(SaveLengthsToState())
    p2.execute()

    p2.clear_processors()
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(ConvertTokenToIdx())
    p2.add_post_processor(StreamToHDF5('snli_dev'))
    p2.execute()

    p3 = Pipeline('snli_example')
    p3.copy_vocab_from_pipeline(p)
    p3.add_path(join(zip_path, file_paths[2]))
    p3.add_line_processor(JsonLoaderProcessors())
    p3.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p3.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(SaveLengthsToState())
    p3.execute()

    p3.clear_processors()
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(ConvertTokenToIdx())
    p3.add_post_processor(StreamToHDF5('snli_test'))
    p3.execute()
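The train, dev and test pipelines above configure the same line and sentence processors three times. One possible way to factor out that repetition; the helper name is hypothetical, but the calls are exactly the ones used above:

def add_snli_processors(p, tokenizer, tokenize_keys):
    # shared line/sentence processors used for train, dev and test above
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), tokenize_keys)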
Example #5
    def __init__(self,
                 pipeline_name,
                 name,
                 batch_size,
                 mnt_name='',
                 loader_threads=4,
                 randomize=False,
                 seed=None,
                 keys=['input', 'support', 'target'],
                 is_volatile=False,
                 cache_size_GB=4):
        config_path = join(get_data_path(mnt_name=mnt_name), pipeline_name,
                           name, 'hdf5_config.pkl')
        if not exists(config_path):
            log.error(
                'Path {0} does not exist! Have you forgotten to preprocess your dataset?',
                config_path)
        config = pickle.load(open(config_path, 'rb'))
        self.paths = config['paths']
        self.fractions = config['fractions']
        self.num_batches = int(np.sum(config['counts']) / batch_size)
        self.max_lengths = config['max_lengths']
        self.batch_size = batch_size
        self.batch_idx = 0
        self.prefetch_batch_idx = 0
        self.loaders = []
        self.prepared_batches = {}
        self.prepared_batchidx = queue.Queue()
        self.work = queue.Queue()
        self.cached_batches = {}
        self.end_iter_observers = []
        self.end_epoch_observers = []
        self.start_epoch_observers = []
        self.at_batch_prepared_observers = []
        self.state = BatcherState()
        self.current_iter = 0
        self.current_epoch = 0
        self.timer = Timer()
        self.loader_threads = loader_threads
        if Config.backend == Backends.TORCH:
            from spodernet.backends.torchbackend import TorchConverter, TorchCUDAConverter
            self.subscribe_to_batch_prepared_event(DictConverter(keys))
            self.subscribe_to_batch_prepared_event(TorchConverter(is_volatile))
            if Config.cuda:
                import torch
                self.subscribe_to_batch_prepared_event(
                    TorchCUDAConverter(torch.cuda.current_device()))
        elif Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfbackend import TensorFlowConverter
            self.subscribe_to_batch_prepared_event(TensorFlowConverter())
        elif Config.backend == Backends.TEST:
            pass
        elif Config.backend == Backends.CNTK:
            self.subscribe_to_batch_prepared_event(DictConverter(keys))
        else:
            raise Exception('Backend has unsupported value {0}'.format(
                Config.backend))

        batchidx2paths, batchidx2start_end, shard2batchidx = self.create_batchidx_maps(
            config['counts'])

        for i in range(loader_threads):
            seed = 2345 + (i * 83)
            self.loaders.append(
                DataLoaderSlave(self, batchidx2paths, batchidx2start_end,
                                randomize, self.paths, shard2batchidx, seed,
                                self.fractions, cache_size_GB))
            self.loaders[-1].start()
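This constructor appears to belong to spodernet's StreamBatcher; a usage sketch under that assumption, where the class name, the argument values and the iteration protocol are assumptions based on the attributes and converters set above:

# usage sketch, assuming this __init__ belongs to spodernet's StreamBatcher
batcher = StreamBatcher('snli_example', 'snli_train', batch_size=128)
for str2var in batcher:
    # each prepared batch is expected as a dict over the configured keys
    inputs, support, targets = (str2var['input'], str2var['support'],
                                str2var['target'])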