def test_casts(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    with x.create('block', Nfile=1, dtype='f8', size=128) as b:
        assert_raises(BigFileError, b.write, 0, numpy.array('aaaaaa'))
        b.write(0, numpy.array(True, dtype='?'))
def test_slicing(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    numpy.random.seed(1234)

    # test creating
    with x.create("data", Nfile=1, dtype=('f8', 32), size=128) as b:
        data = numpy.random.uniform(100000, size=(128, 32))
        junk = numpy.random.uniform(100000, size=(128, 32))
        b.write(0, data)

    with x['data'] as b:
        assert_equal(b[:], data)
        assert_equal(b[0], data[0])
        b[:len(junk)] = junk

    with x['data'] as b:
        assert_equal(b[:], junk)
        assert_equal(b[0], junk[0])
        b[3] = data[3]

    with x['data'] as b:
        assert_equal(b[3], data[3])

    shutil.rmtree(fname)
def test_pickle(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    # test creating
    column = x.create("abc", dtype='f8', size=128)

    import pickle
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    assert type(column) == type(column1)
    assert column.size == column1.size
    assert column.dtype == column1.dtype
    assert column.comm is column1.comm

    column.close()
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    str = pickle.dumps(x)
    x1 = pickle.loads(str)
    assert type(x) == type(x1)
    assert x1.basename == x.basename

    x.close()
    str = pickle.dumps(x)
    x1 = pickle.loads(str)
    assert tuple(sorted(x1.blocks)) == tuple(sorted(x.blocks))

    shutil.rmtree(fname)
def test_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('.', dtype=None) as b:
        b.attrs['int'] = 128
        b.attrs['float'] = [128.0, 3, 4]
        b.attrs['string'] = 'abcdefg'
        b.attrs['complex'] = 128 + 128J
        b.attrs['bool'] = True
        b.attrs['arrayustring'] = numpy.array(u'unicode')
        b.attrs['arraysstring'] = numpy.array('str')

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 128)
        assert_equal(b.attrs['float'], [128.0, 3, 4])
        assert_equal(b.attrs['string'], 'abcdefg')
        assert_equal(b.attrs['complex'], 128 + 128J)
        assert_equal(b.attrs['bool'], True)
        b.attrs['int'] = 30
        b.attrs['float'] = [3, 4]
        b.attrs['string'] = 'defg'
        b.attrs['complex'] = 32 + 32J
        b.attrs['bool'] = False

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 30)
        assert_equal(b.attrs['float'], [3, 4])
        assert_equal(b.attrs['string'], 'defg')
        assert_equal(b.attrs['complex'], 32 + 32J)
        assert_equal(b.attrs['bool'], False)

    shutil.rmtree(fname)
def test_append(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    name = 'f4'
    d = numpy.dtype(('f4', 3))
    numpy.random.seed(1234)
    data = numpy.random.uniform(100000, size=(100, 3)).astype('f4')

    # test creating
    with x.create(name, Nfile=3, dtype=d, size=100) as b:
        b.write(0, data)
        b.append(data, Nfile=2)
        with x.open(name) as bb:
            assert bb.size == 200
        assert b.size == 200

    with x.open(name) as b:
        assert b.Nfile == 5
        assert_equal(b[:100], data)
        assert_equal(b[100:], data)
        assert b.size == 200

    shutil.rmtree(fname)
def test_grow(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    d = numpy.dtype(('f4', 3))
    numpy.random.seed(1234)
    data = numpy.random.uniform(100000, size=(100, 3)).astype('f4')

    # test creating
    with x.create(d.str, Nfile=3, dtype=d, size=100) as b:
        b.write(0, data)
        b.grow(size=100, Nfile=2)
        with x.open(d.str) as bb:
            assert bb.size == 200
        b.write(100, data)
        assert b.size == 200

    with x.open(d.str) as b:
        assert b.Nfile == 5
        assert_equal(b[:100], data)
        assert_equal(b[100:], data)
        assert b.size == 200

    shutil.rmtree(fname)
def __init__(self, collection, vocab_file, feature, language,
             flag_shuffle=True, method=None,
             fluency_threshold=DEFAULT_FLUENCY_U, rootpath=ROOT_PATH):
    self.language = language
    self.anno_file_path = utility.get_sent_file(collection, language, rootpath)
    self.fluency_threshold = fluency_threshold
    self.method = method
    if method:
        self.sent_score_file = utility.get_sent_score_file(
            collection, language, rootpath)
        assert method in ['sample', 'filter', 'weighted']
        assert self.sent_score_file != None
        assert fluency_threshold > 0
        if method == 'weighted':
            # Not sampling the data if fluency-guided method is weighted_loss
            self.method = method = None
    else:
        self.sent_score_file = None

    self.textbank = TextBank(vocab_file)
    assert self.textbank.vocab[TOKEN_PAD] == 0
    self.vf_reader = BigFile(
        utility.get_feat_dir(collection, feature, rootpath))
    self.vf_names = set(self.vf_reader.names)
    self.vf_size = self.vf_reader.ndims
    self.flag_shuffle = flag_shuffle
    self._load_data()
def SaveSnapshot(comm, filename, P, blocks=None):
    file = BigFile(filename)
    if blocks is None:
        blocks = P.keys()
    for key in blocks:
        # hack, skip scalar mass
        if numpy.isscalar(P[key]):
            continue
        file.mpi_create_from_data(comm, '1/%s' % key, P[key])
def test_passby(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # half floats are pass-through types, no casting is supported
    data = numpy.array([3.0, 5.0], dtype='f2')
    with x.create('block', Nfile=1, dtype='f2', size=128) as b:
        b.write(0, data)
        assert_equal(b[:2], data)
        assert_raises(BigFileError, b.write, 0, numpy.array((30, 20.)))
def test_attr_objects(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('block', dtype=None) as b:
        def set_obj1():
            b.attrs['objects'] = numpy.array([object()])
        assert_raises(ValueError, set_obj1)

        def set_obj_scalar():
            b.attrs['objects'] = object()
        assert_raises(ValueError, set_obj_scalar)

    shutil.rmtree(fname)
def test_dataset(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for name, d in dtypes:
        dt = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(name, Nfile=1, dtype=dt, size=128) as b:
            data = numpy.random.uniform(100000, size=128 * 128).view(
                dtype=b.dtype.base).reshape([-1] + list(dt.shape))[:b.size]
            b.write(0, data)

    bd = Dataset(x)
    assert set(bd.dtype.names) == set(x.blocks)
    assert isinstance(bd[:], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)
    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8',)], BigBlock)
    assert_equal(len(bd[('f8',)].dtype), 0)

    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[['f8', ], :10].dtype), 1)
    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8',), :10].dtype), 0)

    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert isinstance(bd[['f8', 'f4'], ], Dataset)
    assert_equal(len(bd[['f8', 'f4'], ].dtype), 2)
    assert isinstance(bd[['f8', 'f4'], :10], numpy.ndarray)

    for name, d in dtypes:
        assert_array_equal(x[name][:], bd[:][name])

    data1 = bd[:10]
    data2 = bd[10:20]
    bd[:10] = data2
    assert_array_equal(bd[:10], data2)
    bd[10:20] = data1
    assert_array_equal(bd[:10], data2)
    assert_array_equal(bd[10:20], data1)

    bd.append(data1)
    assert bd.size == 128 + 10
    assert_array_equal(bd[-10:], data1)

    shutil.rmtree(fname)
def test_create(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        d = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(d.str, Nfile=1, dtype=d, size=128) as b:
            data = numpy.random.uniform(100000, size=128 * 128).view(
                dtype=b.dtype.base).reshape([-1] + list(d.shape))[:b.size]
            b.write(0, data)

        with x[d.str] as b:
            assert_equal(b[:], data.astype(d.base))
            assert_equal(b[:], b[...])

        # test creating
        data = numpy.random.uniform(100000, size=128 * 128).view(
            dtype=d.base).reshape([-1] + list(d.shape))[:128]
        with x.create_from_array(d.str, data) as b:
            pass

        with x[d.str] as b:
            assert_equal(b[:], data)

        # test writing with an offset
        with x[d.str] as b:
            b.write(1, data[0:1])
            assert_equal(b[1:2], data[0:1].astype(d.base))

        # test writing beyond file length
        with x[d.str] as b:
            caught = False
            try:
                b.write(1, data)
            except:
                caught = True
            assert caught

    assert_equal(set(x.blocks), set([numpy.dtype(d).str for d in dtypes]))

    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x
    for b in x:
        assert b in x

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    d = bd[:]

    shutil.rmtree(fname)
def get_we(vocab, w2v_dir):
    w2v = BigFile(w2v_dir)
    ndims = w2v.ndims
    nr_words = len(vocab)
    words = [vocab[i] for i in range(nr_words)]
    # start from a random matrix and overwrite rows for which a vector exists
    we = np.random.uniform(low=-1.0, high=1.0, size=(nr_words, ndims))

    renamed, vecs = w2v.read(words)
    for i, word in enumerate(renamed):
        idx = vocab.find(word)
        we[idx] = vecs[i]

    return torch.Tensor(we)
class Text2W2VEncoder:
    def __init__(self, data_path):
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        print("Text2W2VEncoder", "vocab_size", vocab_size, "dim", self.ndims)

    def encode(self, words):
        renamed, vectors = self.w2v.read(words)
        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)
        else:
            vec = np.zeros([self.ndims])
        return torch.Tensor(vec)
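# Usage sketch for Text2W2VEncoder (illustrative only; not part of the original sources).
# It assumes `w2v_feature_dir` points at a word2vec feature directory readable by BigFile;
# the path below is a placeholder.
w2v_feature_dir = 'path/to/word2vec/FeatureData'
encoder = Text2W2VEncoder(w2v_feature_dir)
sentence_vec = encoder.encode(['a', 'dog', 'runs'])  # mean of the available word vectors
print(sentence_vec.shape)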
class W2Vec(Txt2Vec):
    def __init__(self, data_path, norm=0, clean=True):
        super(W2Vec, self).__init__(data_path, norm, clean)
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        logger.info('vocab size: %d, vec dim: %d' % (vocab_size, self.ndims))

    def _encoding(self, words):
        renamed, vectors = self.w2v.read(words)
        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)
        else:
            vec = np.zeros(self.ndims)
        return vec
class VisionDataset(torch.utils.data.Dataset):
    def __init__(self, filename):
        self.vis_feat_file = BigFile(filename)
        self.vis_ids = self.vis_feat_file.names

    def __getitem__(self, index):
        vis_tensor = self.vis_feat_file.read_one(self.vis_ids[index])
        return self.vis_ids[index], torch.Tensor(vis_tensor)

    def get_by_name(self, name):
        vis_tensor = self.vis_feat_file.read_one(name)
        return torch.Tensor(vis_tensor)

    def __len__(self):
        return len(self.vis_ids)
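# Usage sketch for VisionDataset (illustrative only; not part of the original sources).
# Assumes `vis_feature_dir` is a BigFile feature directory; the dataset then plugs into a
# standard torch DataLoader, yielding (id, feature-vector) batches.
vis_feature_dir = 'path/to/FeatureData/resnet152'  # placeholder path
dataset = VisionDataset(vis_feature_dir)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
for vis_ids, vis_tensors in loader:
    pass  # vis_tensors has shape (batch_size, feature_dim)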
def calc_mf_each_bf(dir_name: str, bf: BigFile):
    """Calculate MFs of a single bf file."""
    header = bf.open('Header')
    redshift = 1. / header.attrs['Time'][0] - 1.

    bhmass = bf.open('5/BlackholeMass')[:] * TO_MSUN
    halomass = bf.open('FOFGroups/Mass')[:] * TO_MSUN
    starmass = bf.open('FOFGroups/MassByType')[:][:, 4] * TO_MSUN

    halo_mf = mass_function(halomass, HALO_MIN, HALO_MAX, N_BIN, BOXSIZE)
    star_mf = mass_function(starmass, STAR_MIN, STAR_MAX, N_BIN, BOXSIZE)
    bh_mf = mass_function(bhmass, BH_MIN, BH_MAX, N_BIN, BOXSIZE)

    print(' Saving MFs at z = %0.4f' % redshift)
    np.save('{}halo_mf_%0.4f'.format(dir_name) % redshift, halo_mf)
    np.save('{}star_mf_%0.4f'.format(dir_name) % redshift, star_mf)
    np.save('{}bh_mf_%0.4f'.format(dir_name) % redshift, bh_mf)
def test_threads(comm):
    # This test shall not core dump
    # raise many errors here and there on many threads
    from threading import Thread, Event
    import gc

    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    b = x.create("Threading", Nfile=1, dtype='i8', size=128)

    old = gc.get_threshold()
    gc.set_threshold(1, 1, 1)
    E = Event()

    def func(i, b):
        E.wait()
        x['.'].attrs['v3'] = [1, 2, 3]
        err = 0
        for j in range(100 * i):
            try:
                with pytest.raises(BigFileError):
                    b.attrs['v 3'] = ['a', 'bb', 'ccc']
                b.write(0, numpy.ones(128))
            except BigBlockClosedError:
                err = err + 1
        b.close()
        x['Threading'].attrs['v3'] = [1, 2, 3]

    t = []
    for i in range(4):
        t.append(Thread(target=func, args=(i, b)))

    for i in t:
        i.start()
    E.set()
    for i in t:
        i.join()

    gc.set_threshold(*old)
    shutil.rmtree(fname)
def __init__(self, collections, concept_files, feature, batch_size=100, rootpath=ROOT_PATH):
    assert (len(collections) == len(concept_files))
    self.batch_size = batch_size
    self.feat_file = BigFile(
        os.path.join(rootpath, collections[0], 'FeatureData', feature))
    self.label_set = LabelSet(collections[0], concept_files[0], rootpath)
    self.aux_label_set = None
    if len(collections) > 1:
        self.aux_label_set = LabelSet(collections[1], concept_files[1], rootpath)

    self.img_ids = sorted(self.label_set.im2labels.keys())
    self.num_labels = self.label_set.num_labels
    self.aux_num_labels = self.aux_label_set.num_labels if self.aux_label_set else 0
    self.update()
def test_string(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # test creating
    with x.create("Header", Nfile=1, dtype=None, size=128) as b:
        b.attrs['v3'] = ['a', 'bb', 'ccc']
        b.attrs['v32'] = [
            ['a', 'bb', 'ccc'],
            ['1', '22', '333'], ]
        b.attrs['s'] = 'abc'
        b.attrs['l'] = 'a' * 65536

    with x.open("Header") as b:
        assert_equal(b.attrs['v3'], ['a', 'bb', 'ccc'])
        assert_equal(b.attrs['v32'], ['a', 'bb', 'ccc', '1', '22', '333'])
        assert_equal(b.attrs['s'], 'abc')
        assert_equal(b.attrs['l'], 'a' * 65536)

    shutil.rmtree(fname)
def ReadIC(filename):
    # this reads in a MP-Gadget3/GENIC format IC
    # major thing is to scale vel by a0**1.5
    file = BigFile(filename)
    header = file.open('header')
    BoxSize = header.attrs['BoxSize'][0]
    a0 = header.attrs['Time'][0]

    Ntot = file.open('1/ID').size
    myslice = slice(
        MPI.COMM_WORLD.rank * Ntot // MPI.COMM_WORLD.size,
        (MPI.COMM_WORLD.rank + 1) * Ntot // MPI.COMM_WORLD.size,
    )
    P = dict()
    P['Mass'] = header.attrs['MassTable'][1]
    P['Position'] = file.open('1/Position')[myslice]
    P['Velocity'] = file.open('1/Velocity')[myslice]
    P['Velocity'] *= a0 ** 1.5
    P['ID'] = file.open('1/ID')[myslice]
    return P, BoxSize, a0
def process(options, feat_dir, imsetfile, result_dir):
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0

    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end

    fw.close()

    assert(len(done) == len(set(done)))
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))
        fw.close()

    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))
        fw.close()

    print '%d requested, %d obtained' % (len(imset), len(done))
def test_closed(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')
    x.close()
    assert x.blocks == []
    try:
        h = x['.']
    except BigFileClosedError:
        pass
    try:
        x.refresh()
    except BigFileClosedError:
        pass
def test_file_large_attr(comm):
    import os.path
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    data = numpy.ones(1024 * 128 * 8, dtype='f8')

    with x['.'] as bb:
        bb.attrs['value'] = data

    with x['.'] as bb:
        assert_equal(bb.attrs['value'], data)

    shutil.rmtree(fname)
def test_fileattr(comm):
    import os.path
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    assert not os.path.exists(os.path.join(fname, 'attr-v2'))
    assert not os.path.exists(os.path.join(fname, '000000'))

    with x['.'] as bb:
        bb.attrs['value'] = 1234
        assert bb.attrs['value'] == 1234

    assert not os.path.exists(os.path.join(fname, 'header'))
    assert os.path.exists(os.path.join(fname, 'attr-v2'))

    shutil.rmtree(fname)
def find_redshift(redshift, directory, pig=True):
    """Find a snapshot at a given redshift in a directory.

    Returns the matching snapshot directory, or None if no snapshot is close enough."""
    if pig:
        fname = "PIG_*"
    else:
        fname = "PART_*"
    globs = glob.glob(os.path.join(directory, fname))
    for gg in globs:
        bf = BigFile(gg)
        rr = 1 / bf['Header'].attrs['Time'] - 1
        if np.abs(rr - redshift) < 0.05:
            return gg
        del bf
    return None
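# Usage sketch for find_redshift (illustrative only; not part of the original sources).
# Assumes `sim_output_dir` contains MP-Gadget PIG_* / PART_* snapshot directories; the
# path below is a placeholder.
sim_output_dir = '/path/to/simulation/output'
pig_at_z2 = find_redshift(2.0, sim_output_dir, pig=True)
if pig_at_z2 is not None:
    print('snapshot near z=2:', pig_at_z2)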
def test_create_odd(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    d = numpy.dtype('f4')
    numpy.random.seed(1234)

    # test creating
    with x.create(d.str, Nfile=3, dtype=d, size=455**3) as b:
        data = numpy.random.uniform(100000, size=455**3).astype(d)
        b.write(0, data)

    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x
    for b in x:
        assert b in x

    shutil.rmtree(fname)
def test_bigdata(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        dt = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(str(d), Nfile=1, dtype=dt, size=128) as b:
            data = numpy.random.uniform(100000, size=128 * 128).view(
                dtype=b.dtype.base).reshape([-1] + list(dt.shape))[:b.size]
            b.write(0, data)

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    assert isinstance(bd[:], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)
    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8',)], BigBlock)
    assert_equal(len(bd[('f8',)].dtype), 0)

    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[['f8', ], :10].dtype), 1)
    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8',), :10].dtype), 0)

    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert isinstance(bd[['f8', 'f4'], ], BigData)
    assert_equal(len(bd[['f8', 'f4'], ].dtype), 2)
    assert isinstance(bd[['f8', 'f4'], :10], numpy.ndarray)

    shutil.rmtree(fname)
def plot_bhmf(pig, label=None):
    """Plot a black hole mass function from a FOF table."""
    bf = BigFile(pig)
    redshift = 1 / bf['Header'].attrs['Time'] - 1
    hh = bf['Header'].attrs['HubbleParam']
    lbox = bf['Header'].attrs['BoxSize'] / 1000 / hh

    lfm = getbmf(bf, lbox, hh)
    plt.plot(lfm[0], lfm[1], label=(label or '') + ' z=%.1f' % redshift)
    plt.fill_between(lfm[0], lfm[2], lfm[3], alpha=0.2)
    plt.xlabel(r'$\mathrm{log}_{10} [M_{\rm BH}/M_{\odot}]$', fontsize=17)
    plt.ylabel(r'$\mathrm{log}_{10} \phi/[\mathrm{dex}^{-1} \mathrm{Mpc}^{-3}]$', fontsize=15)
    plt.xlim(6, 12)
    plt.ylim(-7, -2.5)
    plt.title('BH Mass function', fontsize=15)
    plt.legend(fontsize=15)
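# Usage sketch for plot_bhmf (illustrative only; not part of the original sources).
# Assumes `pig_dir` is a PIG (FOF) BigFile directory written by MP-Gadget; the path is a
# placeholder.
pig_dir = '/path/to/output/PIG_005'
plt.figure()
plot_bhmf(pig_dir, label='run A')
plt.savefig('bhmf.png')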
def __init__(self, collection, feature, batch_size=100, rootpath=ROOT_PATH):
    self.feat_file = BigFile(
        os.path.join(rootpath, collection, 'FeatureData', feature))
    self.batch_size = batch_size
    self.label_set = None
    self.aux_label_set = None
    self.img_ids = map(
        str.strip,
        open(os.path.join(rootpath, collection, 'ImageSets',
                          collection + '.txt')).readlines())
    self.update()
def test_blank_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create("Header", Nfile=1, dtype=None, size=128) as b:
        with pytest.raises(BigFileError):
            b.attrs['v 3'] = ['a', 'bb', 'ccc']
        with pytest.raises(BigFileError):
            b.attrs['v\t3'] = ['a', 'bb', 'ccc']
        with pytest.raises(BigFileError):
            b.attrs['v\n3'] = ['a', 'bb', 'ccc']

    with pytest.raises(BigFileError):
        x.create(" ", Nfile=1, dtype=None, size=128)
    with pytest.raises(BigFileError):
        x.create("\t", Nfile=1, dtype=None, size=128)
    with pytest.raises(BigFileError):
        x.create("\n", Nfile=1, dtype=None, size=128)

    shutil.rmtree(fname)
if newtag != tag:
    raise Exception("rank = %d, tag is %s, on root tag is %s"
                    % (self.comm.rank, tag, newtag))
self.comm.Barrier()
if self.comm.rank == 0:
    print(tag)

#bigfile = BigFile(argv[1])
from argparse import ArgumentParser
ap = ArgumentParser()
ap.add_argument("config")
ap.add_argument("input")
ap.add_argument("output")
ns = ap.parse_args()

bigfile = BigFile(ns.input)
world = MPI.COMM_WORLD
if world.rank == 0:
    attrs = bigfile.open("Header").attrs
    HEADER = dict([(i, attrs[i]) for i in attrs])
else:
    HEADER = None
HEADER = world.bcast(HEADER)
BoxSize = HEADER['BoxSize']

# set up some defaults
SMLFACTOR = 1.0
TilePadding = 256
import sys
import os
from bigfile import BigFile

os.system("python merge_feat.py f3d toydata,toydata,toydata,toydata2 newdata --rootpath ./ --overwrite 1")

feat_file = BigFile("newdata/FeatureData/f3d")
renamed, vectors = feat_file.read(feat_file.names)

for _id, _vec in zip(renamed, vectors):
    print _id, _vec
if __name__ == '__main__':
    from sys import argv

    # this will set the units to
    #
    # time: 980 Myear/h
    # distance: 1 Kpc/h
    # speed: 100 km/s
    # mass: 1e10 Msun /h
    DH = 3e5 / 100.
    G = 43007.1
    H0 = 0.1

    Nmesh = int(argv[2])
    file = BigFile(argv[1])
    header = file.open('header')
    BoxSize = header.attrs['BoxSize'][0]
    a0 = header.attrs['Time'][0]

    Ntot = file.open('1/ID').size
    myslice = slice(
        MPI.COMM_WORLD.rank * Ntot // MPI.COMM_WORLD.size,
        (MPI.COMM_WORLD.rank + 1) * Ntot // MPI.COMM_WORLD.size,
    )
    P = lambda: None
    P.Pos = file.open('1/Position')[myslice]
if __name__ == "__main__": rootpath = './' trainCollection = 'toydata' nimages = 2 feature = 'f1' dim = 3 testCollection = trainCollection testset = testCollection featureDir = os.path.join(rootpath, trainCollection, "FeatureData", feature) searcher = simpleknn.load_model(os.path.join(featureDir, "feature.bin"), dim, nimages, os.path.join(featureDir, "id.txt")) searcher.set_distance('l2') searcher.set_distance('l1') print ("[simpleknn] dim=%d, nr_images=%d" % (searcher.get_dim(), searcher.get_nr_images())) testfeaturedir = os.path.join(rootpath, testCollection, 'FeatureData', feature) testfeaturefile = BigFile(testfeaturedir, dim) testset = testfeaturefile.names for testid in testset: testfeature = testfeaturefile.read_one(testid) visualNeighbors = searcher.search_knn(testfeature, max_hits=20000) print testid, len(visualNeighbors), " ".join(["%s %.3f" % (v[0],v[1]) for v in visualNeighbors[:3]])
def main(unused_args):
    length_normalization_factor = FLAGS.length_normalization_factor

    # Load model configuration
    config_path = os.path.join(os.path.dirname(__file__), 'model_conf', FLAGS.model_name + '.py')
    config = utility.load_config(config_path)

    config.trainCollection = FLAGS.train_collection
    config.word_cnt_thr = FLAGS.word_cnt_thr
    config.rootpath = FLAGS.rootpath

    train_collection = FLAGS.train_collection
    test_collection = FLAGS.test_collection
    overwrite = FLAGS.overwrite
    feature = FLAGS.vf_name

    img_set_file = os.path.join(rootpath, test_collection, 'VideoSets', '%s.txt' % test_collection)
    if not os.path.exists(img_set_file):
        img_set_file = os.path.join(rootpath, test_collection, 'ImageSets', '%s.txt' % test_collection)
    img_list = map(str.strip, open(img_set_file).readlines())

    # have visual feature ready
    FLAGS.vf_dir = os.path.join(rootpath, test_collection, 'FeatureData', feature)
    vf_reader = BigFile(FLAGS.vf_dir)

    textbank = TextBank(utility.get_train_vocab_file(FLAGS))
    config.vocab_size = len(textbank.vocab)
    config.vf_size = int(open(os.path.join(FLAGS.vf_dir, 'shape.txt')).read().split()[1])

    model_dir = utility.get_model_dir(FLAGS)
    output_dir = utility.get_pred_dir(FLAGS)

    checkpoint_style = FLAGS.checkpoint_style

    if checkpoint_style == 'file':
        #output_per_filename = 'model_perf_in_topk_%d_%s' % (FLAGS.top_k, FLAGS.eval_model_list_file)
        # read validated top models
        validation_output_dir = utility.get_sim_dir(FLAGS)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        eval_model_list_file = os.path.join(validation_output_dir, 'loss_info.txt')  #FLAGS.eval_model_list_file)
        shutil.copy(eval_model_list_file, output_dir)
        test_iter_list = []
        for line in open(eval_model_list_file).readlines()[:FLAGS.top_k]:
            iter_current = int(line.strip().split()[0])
            test_iter_list.append(iter_current)
    elif checkpoint_style == 'iter_interval':
        #output_per_filename = 'model_perf_in_%s' % FLAGS.eval_stat
        test_iter_list = range(*[int(x) for x in FLAGS.eval_stat.split("-")])
    elif checkpoint_style == 'iter_num':
        #output_per_filename = 'model_perf_in_iter_%d' % FLAGS.iter_num
        test_iter_list = [FLAGS.iter_num]

    with_image_embedding = True if FLAGS.with_image_embedding != 0 else False
    g = tf.Graph()
    with g.as_default():
        model = InferenceWrapper(config=config, model_dir=model_dir,
                                 gpu_memory_fraction=FLAGS.gpu_memory_fraction,
                                 gpu=FLAGS.gpu,
                                 with_image_embedding=with_image_embedding)
        model.build_model()

        for k, iter_n in enumerate(test_iter_list):
            model_path = os.path.join(model_dir, 'variables', 'model_%d.ckpt' % iter_n)
            while not os.path.exists(model_path + '.meta'):
                logger.error('Model path: %s', model_path)
                logger.error('Cannot load model file and exit')
                sys.exit(0)

            top_one_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_one_pred_sent.txt')
            top_n_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_n_pred_sent.txt')
            # perf_file = os.path.join(output_dir, 'model_%d.ckpt' % iter_n, 'perf.txt')

            if os.path.exists(top_one_pred_sent_file) and not overwrite:
                # write existing perf file and print out
                logger.info('%s exists. skip', top_one_pred_sent_file)
                continue

            if not os.path.exists(os.path.split(top_one_pred_sent_file)[0]):
                os.makedirs(os.path.split(top_one_pred_sent_file)[0])

            logger.info('save results to %s', top_one_pred_sent_file)

            # load the trained model
            generator = CaptionGenerator(config, model,
                                         length_normalization_factor=length_normalization_factor)
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
            config_proto = tf.ConfigProto(
                intra_op_parallelism_threads=FLAGS.ses_threads,
                gpu_options=gpu_options,
                allow_soft_placement=True)
            #with tf.Session(config=config_proto) as session:
            #    model.build_model(session, model_path)
            model.load_model(model_path)

            fout_one_sent = codecs.open(top_one_pred_sent_file, 'w', 'utf-8')
            fout_n_sent = codecs.open(top_n_pred_sent_file, 'w', 'utf-8')

            for progress, img in enumerate(img_list):
                print(img)
                # predict sentences given a visual feature
                visual_feature = np.array(vf_reader.read_one(img))
                sentences = generator.beam_search(visual_feature, FLAGS.beam_size)

                # output top one sentence info
                sent_score = sentences[0].score
                sent = ' '.join(sentences[0].words)
                fout_one_sent.write(img + ' ' + '%.3f' % sent_score + ' ' + sent + '\n')
                logger.debug(img + ' ' + '%.3f' % sent_score + ' ' + sent)

                # output top n sentences info
                fout_n_sent.write(img)
                for sentence in sentences:
                    sent_score = sentence.score
                    sent = ' '.join(sentence.words)
                    fout_n_sent.write('\t' + '%.3f' % sent_score + '\t' + sent)
                fout_n_sent.write('\n')

                if progress % 100 == 0:
                    logger.info('%d images decoded' % (progress + 1))

            logger.info('%d images decoded' % (progress + 1))
            fout_one_sent.close()
            fout_n_sent.close()
import os, random
import simpleknn
from bigfile import BigFile

rootpath = '/Users/xirong/VisualSearch'
collection = 'train10k'
nr_of_images = 10000
feature = 'color64'
dim = 64

feature_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
feature_file = BigFile(feature_dir, dim)

imset = map(str.strip, open(os.path.join(rootpath, collection, 'ImageSets', '%s.txt' % collection)).readlines())
imset = random.sample(imset, 10)

searcher = simpleknn.load_model(
    os.path.join(feature_dir, "feature.bin"), dim, nr_of_images,
    os.path.join(feature_dir, "id.txt"))
searcher.set_distance('l1')

renamed, vectors = feature_file.read(imset)

for name, vec in zip(renamed, vectors):
    visualNeighbors = searcher.search_knn(vec, max_hits=100)
    print name, visualNeighbors[:3]
class BucketDataProvider(object):
    """TensorFlow Data Provider with Buckets"""
    def __init__(self, collection, vocab_file, feature, language,
                 flag_shuffle=False, fluency_threshold=DEFAULT_FLUENCY_U,
                 rootpath=ROOT_PATH):
        self.language = language
        self.anno_file_path = utility.get_sent_file(collection, language, rootpath)
        self.fluency_threshold = fluency_threshold
        self.textbank = TextBank(vocab_file)
        assert self.textbank.vocab[TOKEN_PAD] == 0
        self.vf_reader = BigFile(utility.get_feat_dir(collection, feature, rootpath))
        self.vf_names = set(self.vf_reader.names)
        self.vf_size = self.vf_reader.ndims
        self.flag_shuffle = flag_shuffle
        self._load_data()

    def shuffle_data_queue(self):
        random.shuffle(self._data_queue)

    def generate_batches(self, batch_size, buckets):
        """Return a list generator of mini-batches of training data."""
        # create Batches
        batches = []
        for max_seq_len in buckets:
            batches.append(Batch(batch_size, max_seq_len, self.vf_size,
                                 self.textbank.vocab[TOKEN_BOS]))

        # shuffle if necessary
        if self.flag_shuffle:
            np.random.shuffle(self._data_queue)

        # scan data queue
        for data in self._data_queue:
            sentence = data['sentence']
            # Load visual features
            visual_features = np.array(self.vf_reader.read_one(data['image_id']))

            if len(sentence) >= buckets[-1]:
                feed_res = batches[-1].feed_and_vomit(visual_features, sentence)
                ind_buc = len(buckets) - 1
            else:
                for (ind_b, batch) in enumerate(batches):
                    if len(sentence) < batch.max_seq_len:
                        feed_res = batches[ind_b].feed_and_vomit(visual_features, sentence)
                        ind_buc = ind_b
                        break
            if feed_res:
                yield (ind_buc,) + feed_res
                batches[ind_buc].empty()

    def _load_data(self, verbose=True):
        logger.debug('Loading data')
        self._data_queue = []
        annoss = codecs.open(self.anno_file_path, 'r', 'utf-8').readlines()
        annos = [an.encode('utf-8').decode('utf-8-sig') for an in annoss]
        for (ind_a, line) in enumerate(annos):
            data = {}
            sid, sent = line.strip().split(" ", 1)
            imgid = sid.strip().split("#", 1)[0]
            assert(imgid in self.vf_names)
            data['image_id'] = imgid

            # Encode sentences
            tokens = TextTool.tokenize(sent, self.language)
            data['sentence'] = self.textbank.encode_tokens(tokens, flag_add_bos=False)
            self._data_queue.append(data)
            if verbose and (ind_a + 1) % 20000 == 0:
                logger.debug('%d/%d annotation', ind_a + 1, len(annos))

        random.shuffle(self._data_queue)  # changed by gxr

        nr_of_images = len(set([data['image_id'] for data in self._data_queue]))
        logger.info('%d images, %d sentences from %s', nr_of_images,
                    len(self._data_queue), self.anno_file_path)