def prepare_glove(self, dimension):
    # Index already loaded; nothing to do.
    if self.index is not None:
        return

    # Build a byte-offset index for every GloVe file once; the 50d index file
    # doubles as a marker that all indices have already been generated.
    if not os.path.exists(join(get_data_path(), 'glove', 'index_50.p')):
        dims = [50, 100, 200, 300]
        base_filename = 'glove.6B.{0}d.txt'
        paths = [
            join(get_data_path(), 'glove', base_filename.format(dim))
            for dim in dims
        ]
        for path, dim in zip(paths, dims):
            index = {'PATH': path}
            with open(path, 'rb') as f:
                log.info('Building index for {0}', path)
                while True:
                    prev_pos = f.tell()
                    line = f.readline().decode('utf-8')
                    if line == '':
                        break
                    next_pos = f.tell()
                    data = line.strip().split(' ')
                    token = data[0]
                    # Remember the byte range of this token's line so the
                    # vector can later be read with a single seek.
                    index[token] = (prev_pos, next_pos)

            log.info('Saving glove index...')
            json.dump(
                index,
                open(
                    join(get_data_path(), 'glove',
                         'index_{0}.p'.format(dim)), 'w'))

    log.info('Loading glove index...')
    self.index = json.load(
        open(
            join(get_data_path(), 'glove',
                 'index_{0}.p'.format(dimension)), 'r'))
def download_glove(self):
    if not os.path.exists(join(get_data_path(), 'glove')):
        log.info('Glove data is missing, downloading data now...')
        os.mkdir(join(get_data_path(), 'glove'))
        bashmagic.wget("http://nlp.stanford.edu/data/glove.6B.zip",
                       join(get_data_path(), 'glove'))
        bashmagic.unzip(
            join(get_data_path(), 'glove', 'glove.6B.zip'),
            join(get_data_path(), 'glove'))
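# The byte-offset index built by prepare_glove exists so that single
# embeddings can be read without loading the whole GloVe text file into
# memory. A minimal sketch of such a lookup follows; the function name
# get_glove_vector and the data_path argument are hypothetical, but the
# index layout (a 'PATH' entry plus token -> (start, end) byte positions)
# is the one written above.
import json

import numpy as np
from os.path import join


def get_glove_vector(token, dimension, data_path):
    index = json.load(
        open(join(data_path, 'glove', 'index_{0}.p'.format(dimension)), 'r'))
    if token not in index:
        return None
    start, end = index[token]
    with open(index['PATH'], 'rb') as f:
        f.seek(start)  # jump straight to the token's line
        line = f.read(end - start).decode('utf-8')
    # The first field is the token itself, the rest are the vector components.
    return np.array(line.strip().split(' ')[1:], dtype=np.float32)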
def test_save_load_data(dtype):
    folder = join(get_data_path(), 'test_hdf')
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.mkdir(folder)
    for i in range(5):
        filename = str(uuid.uuid4())
        data1 = dtype(np.random.randn(100, 100))
        save_data(join(folder, filename), data1)
        data2 = load_data(join(folder, filename))
        np.testing.assert_array_equal(data1, data2, 'Arrays must be equal')
    shutil.rmtree(folder)
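# The dtype argument suggests this test is driven by a parametrized fixture.
# A minimal sketch of such a driver, assuming the usual numpy casts are the
# dtypes under test (the actual list is not shown in this excerpt) and that
# save_data/load_data are imported from the library's utility module:
import numpy as np
import pytest


@pytest.mark.parametrize('dtype', [np.float32, np.float64, np.int32, np.int64])
def test_save_load_roundtrip(dtype, tmp_path):
    # Write one random 100x100 array and check it is read back bit-exactly.
    data1 = dtype(np.random.randn(100, 100))
    save_data(str(tmp_path / 'sample'), data1)
    data2 = load_data(str(tmp_path / 'sample'))
    np.testing.assert_array_equal(data1, data2, 'Arrays must be equal')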
def preprocess_SNLI(delete_data=False):
    # load data
    #names, file_paths = snli2json()
    #train_path, dev_path, test_path = file_paths
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    zip_path = join(get_data_path(), 'snli_1.0.zip', 'snli_1.0')
    file_paths = [
        'snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'
    ]
    not_t = []
    t = ['input', 'support', 'target']

    # tokenize and convert to hdf5
    # 1. Setup pipeline to save lengths and generate vocabulary
    p = Pipeline('snli_example', delete_data)
    p.add_path(join(zip_path, file_paths[0]))
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_token_processor(AddToVocab())
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()
    p.state['vocab'].save_to_disk()

    # 2. Process the data further to stream it to hdf5
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(
        CreateBinsByNestedLength('snli_train', min_batch_size=128))
    state = p.execute()

    # dev and test data
    p2 = Pipeline('snli_example')
    p2.copy_vocab_from_pipeline(p)
    p2.add_path(join(zip_path, file_paths[1]))
    p2.add_line_processor(JsonLoaderProcessors())
    p2.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p2.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(SaveLengthsToState())
    p2.execute()
    p2.clear_processors()

    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(ConvertTokenToIdx())
    p2.add_post_processor(StreamToHDF5('snli_dev'))
    p2.execute()

    p3 = Pipeline('snli_example')
    p3.copy_vocab_from_pipeline(p)
    p3.add_path(join(zip_path, file_paths[2]))
    p3.add_line_processor(JsonLoaderProcessors())
    p3.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p3.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(SaveLengthsToState())
    p3.execute()
    p3.clear_processors()

    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(ConvertTokenToIdx())
    p3.add_post_processor(StreamToHDF5('snli_test'))
    p3.execute()
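# The dev and test pipelines above repeat the same two-pass recipe: a first
# pass that records sentence lengths, and a second pass that converts tokens
# to indices and streams the split to HDF5. A hypothetical helper that factors
# out this repetition could look as follows; the name preprocess_snli_split
# and its arguments are assumptions, while the processors and call order
# mirror the code above (in particular, clear_processors() is assumed, as
# above, to leave the line processors in place).
def preprocess_snli_split(vocab_pipeline, input_path, output_name, tokenizer):
    t = ['input', 'support', 'target']
    p = Pipeline('snli_example')
    p.copy_vocab_from_pipeline(vocab_pipeline)
    p.add_path(input_path)
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))

    # Pass 1: record sentence lengths.
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()

    # Pass 2: map tokens to indices and stream the split to HDF5.
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(StreamToHDF5(output_name))
    p.execute()


# With such a helper, the dev and test blocks would reduce to two calls, e.g.
# preprocess_snli_split(p, join(zip_path, file_paths[1]), 'snli_dev', tokenizer).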
def __init__(self,
             pipeline_name,
             name,
             batch_size,
             mnt_name='',
             loader_threads=4,
             randomize=False,
             seed=None,
             keys=['input', 'support', 'target'],
             is_volatile=False,
             cache_size_GB=4):
    config_path = join(
        get_data_path(mnt_name=mnt_name), pipeline_name, name,
        'hdf5_config.pkl')
    if not exists(config_path):
        log.error(
            'Path {0} does not exist! Have you forgotten to preprocess your dataset?',
            config_path)
    config = pickle.load(open(config_path, 'rb'))
    self.paths = config['paths']
    self.fractions = config['fractions']
    self.num_batches = int(np.sum(config['counts']) / batch_size)
    self.max_lengths = config['max_lengths']
    self.batch_size = batch_size
    self.batch_idx = 0
    self.prefetch_batch_idx = 0
    self.loaders = []
    self.prepared_batches = {}
    self.prepared_batchidx = queue.Queue()
    self.work = queue.Queue()
    self.cached_batches = {}
    self.end_iter_observers = []
    self.end_epoch_observers = []
    self.start_epoch_observers = []
    self.at_batch_prepared_observers = []
    self.state = BatcherState()
    self.current_iter = 0
    self.current_epoch = 0
    self.timer = Timer()
    self.loader_threads = loader_threads

    # Attach backend-specific converters that post-process each prepared batch.
    if Config.backend == Backends.TORCH:
        from spodernet.backends.torchbackend import TorchConverter, TorchCUDAConverter
        self.subscribe_to_batch_prepared_event(DictConverter(keys))
        self.subscribe_to_batch_prepared_event(TorchConverter(is_volatile))
        if Config.cuda:
            import torch
            self.subscribe_to_batch_prepared_event(
                TorchCUDAConverter(torch.cuda.current_device()))
    elif Config.backend == Backends.TENSORFLOW:
        from spodernet.backends.tfbackend import TensorFlowConverter
        self.subscribe_to_batch_prepared_event(TensorFlowConverter())
    elif Config.backend == Backends.TEST:
        pass
    elif Config.backend == Backends.CNTK:
        self.subscribe_to_batch_prepared_event(DictConverter(keys))
    else:
        raise Exception('Backend has unsupported value {0}'.format(
            Config.backend))

    batchidx2paths, batchidx2start_end, shard2batchidx = self.create_batchidx_maps(
        config['counts'])

    # Spawn background loader threads, each with its own deterministic seed.
    for i in range(loader_threads):
        seed = 2345 + (i * 83)
        self.loaders.append(
            DataLoaderSlave(self, batchidx2paths, batchidx2start_end,
                            randomize, self.paths, shard2batchidx, seed,
                            self.fractions, cache_size_GB))
        self.loaders[-1].start()
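# A minimal construction sketch for this batcher, assuming the enclosing class
# is exported as StreamBatcher (only its constructor is shown above) and that
# preprocess_SNLI has already written the 'snli_train' bins under the
# 'snli_example' pipeline directory:
train_batcher = StreamBatcher('snli_example',
                              'snli_train',
                              batch_size=128,
                              loader_threads=4,
                              randomize=True,
                              keys=['input', 'support', 'target'])

# Additional per-batch converters (e.g. another DictConverter over these keys)
# can be attached via subscribe_to_batch_prepared_event, just as the
# constructor does for the TORCH and CNTK backends.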