def compute_pca(data_path=os.path.join(BASE_DIR, 'data/memmap/'),
                out_path=os.path.join(BASE_DIR, 'data/'),
                batch_size=500, image_size=3*300*300):
    ipca = IncrementalPCA(n_components=3, batch_size=batch_size)

    path = os.path.join(data_path, 'tn_x.dat')
    train = np.memmap(path, dtype=theano.config.floatX, mode='r+',
                      shape=(4044, image_size))
    n_samples, _ = train.shape
    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = train[batch, :]
        X = np.reshape(X, (X.shape[0], 3, int(image_size/3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    path = os.path.join(data_path, 'v_x.dat')
    valid = np.memmap(path, dtype=theano.config.floatX, mode='r+',
                      shape=(500, image_size))
    n_samples, _ = valid.shape
    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = valid[batch, :]
        X = np.reshape(X, (X.shape[0], 3, int(image_size/3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    eigenvalues, eigenvectors = np.linalg.eig(ipca.get_covariance())
    eigenvalues.astype('float32').dump(os.path.join(out_path, 'eigenvalues.dat'))
    eigenvectors.astype('float32').dump(os.path.join(out_path, 'eigenvectors.dat'))
def as_numpy_array(self, projection=['Smile', 'Trackerfail'], valid_only=True, train_proportion=0.8):
    tmp_train_images = os.path.join(self.cache_dir, self.TMP_TRAIN_IMAGES)
    tmp_train_labels = os.path.join(self.cache_dir, self.TMP_TRAIN_LABELS)
    tmp_test_images = os.path.join(self.cache_dir, self.TMP_TEST_IMAGES)
    tmp_test_labels = os.path.join(self.cache_dir, self.TMP_TEST_LABELS)

    if all([os.path.exists(f) for f in [tmp_train_images, tmp_train_labels,
                                        tmp_test_images, tmp_test_labels]]):
        X_train_memmap = np.memmap(tmp_train_images).reshape((-1, 64, 64, self.c_dim))
        y_train_memmap = np.memmap(tmp_train_labels).reshape((-1, len(projection) - 1))
        X_test_memmap = np.memmap(tmp_test_images).reshape((-1, 64, 64, self.c_dim))
        y_test_memmap = np.memmap(tmp_test_labels).reshape((-1, len(projection) - 1))
    else:
        entities = self.find_data_intersection()
        test_size = int(entities.shape[0] * (1 - train_proportion))
        test_idxs = random.sample(range(entities.shape[0]), test_size)
        count = 0
        train = []
        test = []
        for i, e in entities.iterrows():
            entity = Entity(e.video, e.au_label, e.landmarks, self.cache_dir)
            entity_records = entity.frames(projection=projection, valid_only=valid_only)
            if i in test_idxs:
                test += entity_records
            else:
                train += entity_records
            self.logger.info('Finished video %d/%d' % (count + 1, len(entities)))
            count += 1
        X_train_memmap, y_train_memmap = self.__list_to_array(tmp_train_images,
                                                              tmp_train_labels, train)
        if test_idxs:
            X_test_memmap, y_test_memmap = self.__list_to_array(tmp_test_images,
                                                                tmp_test_labels, test)

    p = np.random.permutation(X_train_memmap.shape[0])
    return X_train_memmap[p], y_train_memmap[p], X_test_memmap, y_test_memmap
def __init__(self, fname):
    # dump to binary
    fndata = fname
    if not os.path.isfile(fndata):
        fndatain = fndata.replace('/bin/', '/')
        datain = Epix100a(fndatain)
        # write header
        binheader = np.zeros(16).astype(np.uint32)
        binheader[0:6] = [datain.nframes, datain.my*datain.mx, datain.my,
                          datain.mx, datain.nblocks, datain.nbcols]
        binheader.tofile(fndata)
        # write data
        dataout = np.memmap(fndata, dtype=np.int16, mode='r+',
                            shape=(datain.nframes, datain.my, datain.mx), offset=64)
        t0 = time.clock()
        for iframe in range(datain.nframes):
            dataout[iframe] = datain.frames(iframe)
            if (iframe % 100 == 0):
                #progress(iframe,nframes,iframe);
                print str(iframe)+' - '+str(1000*(time.clock()-t0)/(iframe+1))+' ms. average frame: '+str(np.mean(datain.frames(iframe)))
        dataout.flush()
        del dataout
        del datain
    # get nr of frames
    data = np.memmap(fndata, dtype=np.uint32, mode='r', shape=((64)), offset=0)
    self.nframes = data[0]
    self.nframesize = data[1]
    self.my = data[2]
    self.mx = data[3]
    self.nblocks = data[4]
    self.nbcols = data[5]
    self.data = np.memmap(fndata, dtype=np.int16, mode='c',
                          shape=(self.nframes, self.my, self.mx), offset=64)
def linear_regression_2(self):
    '''Run a linear regression and save the output of that regression as new X features'''
    logging.info('Beginning Creation of new files based on Linear Regression model.')
    x_2 = self.X[:, 1:] ** 2
    y_2 = self.X_submit[:, 1:] ** 2
    self.reset_data()
    self.clean_data()
    kg.split_data(.3, .0001, 100, 100)
    lr = linear_model.LinearRegression()
    self.__fit(lr, lr.fit)
    self.__score_cv(lr, lr.predict)
    x_pred = lr.predict(self.X[:, 1:])
    y_pred = lr.predict(self.X_submit[:, 1:])
    self.reset_data()
    X_new = np.hstack((self.X, x_pred))
    Y_new = np.hstack((self.X_submit, y_pred))
    X_new = np.hstack((X_new, x_2))
    Y_new = np.hstack((Y_new, y_2))
    X_new = np.hstack((X_new, self.Y[:, 1].reshape(-1, 1)))
    logging.info('New X shape is %s' % (str(X_new.shape)))
    logging.info('New Y shape is %s' % (str(Y_new.shape)))
    mm = np.memmap('mm.x_with_linear.csv', dtype='float32', mode='w+', shape=X_new.shape)
    mm[:] = X_new[:]
    del mm
    mm = np.memmap('mm.y_with_linear.csv', dtype='float32', mode='w+', shape=Y_new.shape)
    mm[:] = Y_new[:]
    del mm
    logging.info('Completed creating new files based on Linear Regression model. Remember to update X_ROW values.')
def convert(in_name, out_name):
    """convert the file identified by filename in_name to a complex numpy
    array and store it to a file named out_name"""
    wav = wave.open(in_name, 'rb')
    verifyfileformat(wav)

    length = wav.getnframes()
    channels = wav.getnchannels()
    logging.info('length: {} frames, channels: {}'.format(length, channels))
    wav.close()

    # now that we know the format is valid, access data directly
    npinfile = np.memmap(in_name, dtype=np.int16, mode='r', offset=44)
    if npinfile.shape[0]/2 != length:
        raise TypeError('frame mismatch in direct access')

    # our output file, this will be an npy binary holding complex64 types
    npfile = np.memmap(out_name, dtype=np.complex64, mode='w+', shape=(length,))

    # convert input to complex output
    npfile[:] = npinfile[0::2] + 1j * npinfile[1::2]

    # cleanup
    del npinfile
    del npfile
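# A minimal usage sketch for the converter above, assuming a two-channel (I/Q),
# 16-bit PCM WAV file exists and passes verifyfileformat(); the file names below
# are hypothetical and convert() plus its imports (wave, logging, np) must be in scope.
import numpy as np

convert('capture.wav', 'capture.iq')   # hypothetical input/output paths

# read the result back: one complex64 sample per WAV frame, served from disk
iq = np.memmap('capture.iq', dtype=np.complex64, mode='r')
print(iq[:10])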
def test_score_memmap():
    # Ensure a scalar score of memmap type is accepted
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    tf = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    tf.write(b'Hello world!!!!!')
    tf.close()
    scores = np.memmap(tf.name, dtype=np.float64)
    score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64)
    try:
        cross_val_score(clf, X, y, scoring=lambda est, X, y: score)
        # non-scalar should still fail
        assert_raises(ValueError, cross_val_score, clf, X, y,
                      scoring=lambda est, X, y: scores)
    finally:
        # Best effort to release the mmap file handles before deleting the
        # backing file under Windows
        scores, score = None, None
        for _ in range(3):
            try:
                os.unlink(tf.name)
                break
            except WindowsError:
                sleep(1.)
def compare_binary(pattern, testname):
    files = glob.glob(pattern)
    data1 = np.memmap(files[0], dtype=np.complex128)
    data2 = np.memmap(files[1], dtype=np.complex128)
    diff = data1 - data2
    diff_real = np.abs(np.real(diff))
    diff_imag = np.abs(np.imag(diff))
    diff_abs = np.abs(data1) - np.abs(data2)
    diff_phase = np.angle(data1) - np.angle(data2)
    diff_phase[diff_phase > math.pi] -= (2*math.pi)
    diff_phase = np.abs(diff_phase)
    if np.max(diff_real) < 1e-15 and np.max(diff_imag) < 1e-15:
        print('TEST {}:\tOK'.format(testname))
    else:
        print('TEST {}:\tFAILED'.format(testname))
        print('differences between {} and {}'.format(files[0], files[1]))
        print('max difference real part:\t', np.max(diff_real))
        print('max difference imag part:\t', np.max(diff_imag))
        print('max difference modulus:\t\t', np.max(diff_abs))
        print('max difference phase:\t\t', np.max(diff_phase))
        print()
def read_ply(ply_filename):
    vfile = tempfile.mktemp()
    ffile = tempfile.mktemp()
    reader = ply_reader.PlyReader(ply_filename)
    v_id = 0
    f_id = 0

    # Reading the header
    for evt, data in reader.read():
        if evt == ply_reader.EVENT_HEADER:
            n_vertices, n_faces = data
            vertices = np.memmap(vfile, dtype='float64',
                                 shape=(n_vertices, 3), mode='w+')
            faces = np.memmap(ffile, dtype='int64',
                              shape=(n_faces, 3), mode='w+')
            break

    # Reading the vertices and faces
    for evt, data in reader.read():
        if evt == ply_reader.EVENT_VERTEX:
            current_vertex = data
            vertices[v_id] = current_vertex
            v_id += 1
        elif evt == ply_reader.EVENT_FACE:
            faces[f_id] = data
            f_id += 1

    return vertices, faces
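# The reader above backs its vertex/face arrays with temporary scratch files so a
# large mesh never has to fit in RAM. A minimal sketch of that same pattern without
# a PLY reader (the array size is made up purely for illustration):
import tempfile
import numpy as np

vfile = tempfile.mktemp()
vertices = np.memmap(vfile, dtype='float64', shape=(1000, 3), mode='w+')

# fill incrementally, as the event loop above does one vertex at a time
for v_id in range(1000):
    vertices[v_id] = (v_id, v_id * 2.0, v_id * 3.0)

vertices.flush()  # persist pending writes to the backing file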
def load_vectors(self):
    """ Loads the appropriate word vector from each corpus in self.corpora """
    for corp in self.corpora:
        rows = pd.read_pickle(
            '{0}{1}/{4}/{2}/{4}_IndexList_w={2}_lems=False_min_occs={3}_no_stops=False.pickle'.format(
                self.base, corp[0], corp[1], corp[2], self.english))
        i = rows.index(self.greek)
        if self.norm:
            os.system('echo Now normalizing {}'.format(corp[0]))
            orig = np.memmap(
                '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_weighted={7}.dat'.format(
                    self.base, corp[0], corp[1], corp[2], self.english,
                    self.prefix, self.svd, corp[3]),
                dtype='float', shape=(len(rows), len(rows)))
            normed = np.memmap(
                '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_weighted={7}_NORMED.dat'.format(
                    self.base, corp[0], corp[1], corp[2], self.english,
                    self.prefix, self.svd, corp[3]),
                dtype='float', mode='w+', shape=(len(rows), len(rows)))
            normed[:] = scale(orig)
            r = normed[i]
            del normed
            del orig
        else:
            r = np.memmap(
                '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_weighted={7}_NORMED.dat'.format(
                    self.base, corp[0], corp[1], corp[2], self.english,
                    self.prefix, self.svd, corp[3]),
                dtype='float', shape=(len(rows), len(rows)))[i]
        self.ekk_rows[corp[0]] = pd.Series(r, index=rows)
def save(self, dirname=None):
    """Save the current rdfspace to a directory (by default the directory in which indexes are stored)"""
    if dirname is None and self._index_dir is not None:
        dirname = self._index_dir
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    # We memmap big matrices, as pickle eats the whole RAM
    # We don't save the full adjacency matrix
    ut_m = np.memmap(os.path.join(dirname, 'ut.dat'), dtype='float64', mode='w+', shape=self._ut_shape)
    ut_m[:] = self._ut[:]
    s_m = np.memmap(os.path.join(dirname, 's.dat'), dtype='float64', mode='w+', shape=self._s_shape)
    s_m[:] = self._s[:]
    vt_m = np.memmap(os.path.join(dirname, 'vt.dat'), dtype='float64', mode='w+', shape=self._vt_shape)
    vt_m[:] = self._vt[:]
    if self._index_dir is None:
        # The index is in memory, we'll pickle it with the rest
        (adjacency, ut, s, vt) = (self._adjacency, self._ut, self._s, self._vt)
        (self._adjacency, self._ut, self._s, self._vt) = (None, None, None, None)
        f = open(os.path.join(dirname, 'space.dat'), 'w')
        pickle.dump(self, f)
        f.close()
        (self._adjacency, self._ut, self._s, self._vt) = (adjacency, ut, s, vt)
    else:
        # Flushing indexes
        self._uri_index.close()
        self._index_uri.close()
        # The index is stored in dbm, we will exclude it from the pickle
        (adjacency, ut, s, vt) = (self._adjacency, self._ut, self._s, self._vt)
        (self._adjacency, self._ut, self._s, self._vt,
         self._uri_index, self._index_uri) = (None, None, None, None, None, None)
        f = open(os.path.join(dirname, 'space.dat'), 'w')
        pickle.dump(self, f)
        f.close()
        (self._adjacency, self._ut, self._s, self._vt) = (adjacency, ut, s, vt)
        self._uri_index = dbm.open(os.path.join(dirname, 'uri_index'), 'r')
        self._index_uri = dbm.open(os.path.join(dirname, 'index_uri'), 'r')
def readchunk(H, fname, bottom=None, top=None, extra=''):
    fid = int(fname.replace('.', '-').split('-')[-1])
    dispx = numpy.memmap(fname % ('dispx' + extra), mode='r', dtype='f4')
    dispy = numpy.memmap(fname % ('dispy' + extra), mode='r', dtype='f4')
    dispz = numpy.memmap(fname % ('dispz' + extra), mode='r', dtype='f4')
    delta = numpy.memmap(fname % ('delta' + extra), mode='r', dtype='f4')

    if False and H['Scale'] == 0.0:
        assert H['DownSample'] == 1
        xstart = fid * int(numpy.ceil(1.0 * H['Nmesh'] / H['NTask']))
        xend = (fid + 1) * int(numpy.ceil(1.0 * H['Nmesh'] / H['NTask']))
        if xend > H['Nmesh']:
            xend = H['Nmesh']
        index = numpy.arange(
            xstart * H['Nmesh'] * H['Nmesh'],
            xend * H['Nmesh'] * H['Nmesh'])
        if xstart > xend:
            xstart = xend
    else:
        index = numpy.memmap(fname % 'index', mode='r', dtype='i8')

    ipos = numpy.array(numpy.unravel_index(index, H['Size']), dtype='i4').T
    ipos += H['Offset']
    if bottom is not None and top is not None:
        includemask = ipos_in_box(ipos, bottom, top)
    else:
        includemask = numpy.ones(len(ipos), dtype='?')

    result = numpy.empty(includemask.sum(),
                         dtype=[('ipos', ('i4', 3)),
                                ('disp', ('f4', 3)),
                                ('delta', 'f4')])
    if len(result) == 0:
        return result
    result['ipos'] = ipos[includemask]
    result['disp'][:, 0] = dispx[includemask]
    result['disp'][:, 1] = dispy[includemask]
    result['disp'][:, 2] = dispz[includemask]
    result['delta'] = delta[includemask]
    return result
def sim_calc(self):
    nt = self.corpora[0]
    self.scores = {}
    for corp in self.corpora:
        i_nt = []
        i_c2 = []
        rows = self.ekk_rows[corp[0]]
        for i, word in enumerate(self.ekk_rows['NT']):
            if word in rows:
                i_nt.append(i)
                i_c2.append(self.ekk_rows[corp[0]].index(word))
        d_c2 = np.memmap(
            '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd),
            dtype='float32', shape=(len(rows), len(rows)))[i_c2]
        d_c2 = d_c2[:, i_c2]
        d_nt = np.memmap(
            '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                self.base, nt[0], nt[1], nt[2], self.english, self.prefix, self.svd),
            dtype='float32', shape=(len(self.ekk_rows['NT']), len(self.ekk_rows['NT'])))[i_nt]
        d_nt = d_nt[:, i_nt]
        self.scores['{0}_{1}'.format('NT', corp[0])] = np.average(np.diag(
            1 - pairwise_distances(d_nt, d_c2, metric='cosine', n_jobs=12)))
def main(A):
    sightlines = Sightlines(A)
    fgpa = FGPAmodel(A)

    Npixels = sightlines.Npixels.sum()
    specloglam = numpy.memmap(A.SpectraOutputLogLam, mode='w+', dtype='f4',
                              shape=Npixels)

    # now save LogLam of the pixels for ease of access
    # (not used by our code)
    LogLamGrid = A.LogLamGrid
    LogLamCenter = 0.5 * (LogLamGrid[1:] + LogLamGrid[:-1])
    for index in range(len(sightlines)):
        sl2 = slice(sightlines.PixelOffset[index],
                    sightlines.PixelOffset[index] + sightlines.Npixels[index])
        sl = slice(sightlines.LogLamGridIndMin[index],
                   sightlines.LogLamGridIndMax[index] - 1)
        specloglam[sl2] = LogLamCenter[sl]
    specloglam.flush()

    # now save QSONpixel for ease of access
    # (not used by our code)
    QSONpixel = numpy.memmap(A.QSONpixel, mode='w+', dtype='i4',
                             shape=len(sightlines))
    QSONpixel[...] = numpy.int32(sightlines.Npixels)
    QSONpixel.flush()
def parse_graph(self, graph_path, data_dir='data', load_edges=False, extend_paths=2):
    graph = parser.Graph(graph_path)
    self.from_nodes, self.to_nodes = graph.get_mappings()
    graph.save_mappings(self.output_dir)
    if load_edges:
        self.inverse_degrees = np.memmap(
            os.path.join(data_dir, 'inverse_degrees.mat'),
            mode='r',
            dtype='float32'
        )
        self.from_to_idxs = np.memmap(
            os.path.join(data_dir, 'from_to.mat'),
            mode='r',
            dtype='int32'
        )
        self.from_to_idxs = np.reshape(self.from_to_idxs,
                                       newshape=(self.inverse_degrees.shape[0], 2))
    else:
        from_to_idxs, inverse_degrees = graph.extend_graph(max_degree=extend_paths)
        self.from_to_idxs = np.memmap(
            os.path.join(data_dir, 'from_to.mat'),
            mode='r+',
            shape=from_to_idxs.shape,
            dtype='int32'
        )
        self.from_to_idxs[:] = from_to_idxs[:]
        self.inverse_degrees = np.memmap(
            os.path.join(data_dir, 'inverse_degrees.mat'),
            mode='r+',
            shape=inverse_degrees.shape,
            dtype='float32'
        )
        self.inverse_degrees[:] = inverse_degrees[:]
def get_session(self, session=-1, signal="data"):
    """Return the aggregate data array of a session

    If the session consists in many buffers, they are concatenated into
    a single buffer loaded in memory. If the data is a single file, it
    is memmaped as an array.
    """
    sessions = self.list_sessions()
    if isinstance(session, int):
        session_id = sessions[session]
    elif session in sessions:
        session_id = session
    else:
        raise ValueError("No such session %r" % session)

    signal_folder = os.path.join(self.data_folder, session_id, signal)
    data_files = os.listdir(signal_folder)
    dtypes = [self.decode_dtype(filename) for filename in data_files]
    if len(data_files) == 0:
        return np.array([])
    elif len(data_files) == 1:
        return np.memmap(os.path.join(signal_folder, data_files[0]),
                         dtype=dtypes[0])
    else:
        return np.concatenate(
            [np.memmap(os.path.join(signal_folder, f), dtype=dtype)
             for f, dtype in zip(data_files, dtypes)]
        )
def memmap(docompute, dowrite, verbose):
    afilename = os.path.join(OUT_DIR, "memmap-a.bin")
    bfilename = os.path.join(OUT_DIR, "memmap-b.bin")
    rfilename = os.path.join(OUT_DIR, "memmap-output.bin")
    if dowrite:
        t0 = time()
        a = np.memmap(afilename, dtype='float32', mode='w+', shape=shape)
        b = np.memmap(bfilename, dtype='float32', mode='w+', shape=shape)
        # Fill arrays a and b
        #row = np.linspace(0, 1, ncols)
        row = np.arange(0, ncols, dtype='float32')
        for i in range(nrows):
            a[i] = row * (i + 1)
            b[i] = row * (i + 1) * 2
        del a, b  # flush data
        print("[numpy.memmap] Time for creating inputs:", round(time() - t0, 3))

    if docompute:
        t0 = time()
        # Reopen inputs in read-only mode
        a = np.memmap(afilename, dtype='float32', mode='r', shape=shape)
        b = np.memmap(bfilename, dtype='float32', mode='r', shape=shape)
        # Create the array output
        r = np.memmap(rfilename, dtype='float32', mode='w+', shape=shape)
        # Do the computation row by row
        for i in range(nrows):
            r[i] = eval(expr, {'a': a[i], 'b': b[i]})
        if verbose:
            print("First ten values:", r[0, :10])
        del a, b
        del r  # flush output data
        print("[numpy.memmap] Time for compute & save:", round(time() - t0, 3))
def load_data(fname, use_cropped=False, as_grey=False):
    n = 4543
    size = int(fname.split('_')[0])
    if use_cropped:
        if as_grey:
            X_fname = 'cache/X_cropped_grey_%s.npy' % fname
            y_fname = 'cache/y_cropped_grey_%s.npy' % fname
        else:
            X_fname = 'cache/X_cropped_%s.npy' % fname
            y_fname = 'cache/y_cropped_%s.npy' % fname
    else:
        X_fname = 'cache/X_%s.npy' % fname
        y_fname = 'cache/y_%s.npy' % fname
    num_channels = 1 if args.as_grey else 3
    X_shape = (n, num_channels, size, size)
    y_shape = (n,)
    X = np.memmap(X_fname, dtype=np.float32, mode='r', shape=X_shape)
    y = np.memmap(y_fname, dtype=np.int32, mode='r', shape=y_shape)
    assert X.shape == X_shape
    assert y.shape == y_shape
    return X, y
def update(self):
    """ Updates L-BFGS algorithm history """
    unix.cd(self.path)

    s = self.load('m_new') - self.load('m_old')
    y = self.load('g_new') - self.load('g_old')

    m = len(s)
    n = self.memory

    if self.memory_used == 0:
        S = np.memmap('LBFGS/S', mode='w+', dtype='float32', shape=(m, n))
        Y = np.memmap('LBFGS/Y', mode='w+', dtype='float32', shape=(m, n))
        S[:, 0] = s
        Y[:, 0] = y
        self.memory_used = 1
    else:
        S = np.memmap('LBFGS/S', mode='r+', dtype='float32', shape=(m, n))
        Y = np.memmap('LBFGS/Y', mode='r+', dtype='float32', shape=(m, n))
        S[:, 1:] = S[:, :-1]
        Y[:, 1:] = Y[:, :-1]
        S[:, 0] = s
        Y[:, 0] = y
        if self.memory_used < self.memory:
            self.memory_used += 1

    return S, Y
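# The update above keeps a rolling window of the most recent (s, y) pairs by shifting
# the history columns right before writing the newest pair into column 0. A toy sketch
# of the same bookkeeping on an ordinary array (the explicit .copy() just makes the
# overlapping shift assignment unambiguous; sizes are made up):
import numpy as np

H = np.zeros((4, 3), dtype='float32')   # history with 3 slots for length-4 vectors
for step in range(5):
    new_vec = np.full(4, step, dtype='float32')
    H[:, 1:] = H[:, :-1].copy()         # shift older columns right, dropping the oldest
    H[:, 0] = new_vec                   # newest vector always lands in column 0
print(H)                                # columns now hold the vectors from steps 4, 3, 2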
def __init__(self, one_hot=False, shuffle_rng=None, preproc=[], size=(48, 48),
             num_channels=1, img_per_seq=3, path=None):
    if path is None:
        path = '/data/lisa/data/faces/EmotiW/preproc/arranged_data'

    self.x = np.memmap(path + '_x.npy', mode='r', dtype='float32')
    self.y = np.memmap(path + '_y.npy', mode='r', dtype='uint8')

    self.y = self.y.view()
    self.y.shape = (len(self.y)/(img_per_seq), img_per_seq, 1)

    self.x = self.x.view()
    self.x.shape = (len(self.y), img_per_seq, size[0], size[1], num_channels)

    if shuffle_rng is None:
        shuffle_rng = np.random.RandomState((2013, 06, 11))
    elif not isinstance(shuffle_rng, np.random.RandomState):
        shuffle_rng = np.random.RandomState(shuffle_rng)

    self.permutation = shuffle_rng.permutation(len(self.y))
    self.one_hot = one_hot

    self.space = CompositeSpace(
        (FaceTubeSpace(shape=size,
                       num_channels=num_channels,
                       axes=('b', 't', 0, 1, 'c')),
         VectorSpace(dim=(self.one_hot and 7 or 1))))
    self.source = ('features', 'targets')
    self.data_specs = (self.space, self.source)

    self.n_samples = len(self.y)
def get_data(start, stop):
    #n = np.exp(np.squeeze(collect("n",tind=[start,stop],path=path,info=False)))
    n = (np.squeeze(collect("n", tind=[start, stop], path=path, info=False)))
    #phi = (np.squeeze(collect("phi",tind=[start,stop],path=path,info=False)))

    n_mmap = np.memmap(nfile, dtype=n.dtype.name, mode='w+', shape=n.shape)
    n_mmap[:] = n[:]
    print 'n_mmap.shape :', n_mmap.shape, n.shape
    del n
    gc.collect()

    u = np.squeeze(collect("u", tind=[start, stop], path=path, info=False))
    u_mmap = np.memmap(ufile, dtype=u.dtype.name, mode='w+', shape=u.shape)
    u_mmap[:] = u[:]
    del u
    gc.collect()

    fft_u = np.fft.rfft(u_mmap)
    power = fft_u.conj()*fft_u
    A_k = np.real(np.sqrt(power))
    del fft_u, power
    gc.collect()

    phi = np.squeeze(collect("phi", tind=[start, stop], path=path, info=False))
    phi_mmap = np.memmap(phifile, dtype=phi.dtype.name, mode='w+', shape=phi.shape)
    phi_mmap[:] = phi[:]
    del phi
    gc.collect()

    return n_mmap, u_mmap, A_k, phi_mmap
def LogOfMatrix(ccMapObj):
    ccMapObj.make_readable()
    LogHiCmap = CCMAP()
    LogHiCmap.path2matrix = os.getcwd() + '/nparray_' + getRandomName() + '.bin'

    LogHiCmap.shape = ccMapObj.shape
    LogHiCmap.xticks = ccMapObj.xticks
    LogHiCmap.yticks = ccMapObj.yticks
    LogHiCmap.binsize = ccMapObj.binsize
    LogHiCmap.bLog = True

    bNonZeros = None
    #if ccMapObj.bNoData is not None:
    #    LogHiCmap.bNoData = ccMapObj.bNoData
    #    bNonZeros = ~LogHiCmap.bNoData
    #else:
    LogHiCmap.bNoData = np.all(ccMapObj.matrix == 0.0, axis=0)
    bNonZeros = ~LogHiCmap.bNoData

    # Log of part of matrix containing data
    path2matrixA = os.getcwd() + '/nparray_' + getRandomName() + '.bin'
    A = (ccMapObj.matrix[bNonZeros, :])[:, bNonZeros]  # Selected row-column which are not all zeros
    BinMatrixA = np.memmap(path2matrixA, dtype=dtype_npBINarray, mode='w+', shape=A.shape)
    BinMatrixA[:] = np.log10(A)[:]
    BinMatrixA.flush()

    # Assigning minvalue and maxvalue
    LogHiCmap.maxvalue = float(np.amax(BinMatrixA))
    minvalue = np.amin(BinMatrixA)
    v_steps = np.linspace(minvalue, LogHiCmap.maxvalue, 100)
    LogHiCmap.minvalue = minvalue - (v_steps[1] - v_steps[0])

    # Making full matrix
    BinLogMatrix = np.memmap(LogHiCmap.path2matrix, dtype=dtype_npBINarray,
                             mode='w+', shape=LogHiCmap.shape)
    A_i = -1
    A_j = 0
    for i in range(BinLogMatrix.shape[0]):
        if not LogHiCmap.bNoData[i]:
            A_i += 1
            A_j = 0
        for j in range(BinLogMatrix.shape[1]):
            if LogHiCmap.bNoData[i] or LogHiCmap.bNoData[j]:
                BinLogMatrix[i][j] = LogHiCmap.minvalue
            else:
                BinLogMatrix[i][j] = BinMatrixA[A_i][A_j]
                A_j += 1
    BinLogMatrix.flush()

    del BinLogMatrix
    del BinMatrixA

    try:
        os.remove(path2matrixA)
    except:
        pass

    return LogHiCmap
def _train(self, x):
    # print self.dtype
    if len(x) > self.defaultOutputLength:
        self.defaultOutputLength = len(x)
    self.cacheLength += len(x)
    if self.cache is None:
        if self.cacheSize == -1:
            #self.cache = np.memmap(self.cacheName, dtype='float32', mode='w+', shape = x.shape)
            self.cache = np.memmap(self.cacheName, dtype=self.dtype, mode='w+', shape=x.shape)
        else:
            #self.cache = np.memmap(self.cacheName, dtype='float32', mode='w+', shape = (self.cacheSize, len(x[0])))
            self.cache = np.memmap(self.cacheName, dtype=self.dtype, mode='w+',
                                   shape=(self.cacheSize, len(x[0])))
    elif self.cacheSize == -1:
        self.reshape((self.cache.shape[0]+len(x), len(x[0])))
        # print x[0][0].dtype.itemsize
        # print self.cache._mmap.size()
        # #self.cache._mmap.resize( (self.cache.shape[0]+len(x), len(x[0])) )
        # print self.cache.shape
        # newShape = (self.cache.shape[0]+len(x), len(x[0]))
        # memmap_resize( newShape, self.cache )
        # del self.cache
        # self.cache = np.memmap(self.cacheName, dtype=self.dtype, mode='w+', shape = newShape)
        # print "new size: "+str(self.cache._mmap.size())
        # print self.cache.reshape(newShape)

    self.cache[self.cachePos:self.cachePos+len(x)] = x
    # print self.cache._mmap.size()
    # print self.cache[0][0]
    # print self.cache[0][0].dtype.itemsize
    # print "---"
    self.cachePos += len(x)
def group_iter(input_arrays, swath_cols, swath_rows, input_dtype,
               output_arrays, grid_cols, grid_rows, group_size):
    ret_input_arrays = []
    ret_output_arrays = []
    for idx, (ia, oa) in enumerate(zip(input_arrays, output_arrays)):
        if isinstance(ia, str):
            ret_input_arrays.append(numpy.memmap(ia, shape=(swath_rows, swath_cols),
                                                 dtype=input_dtype, mode='r'))
        else:
            ret_input_arrays.append(ia)

        # We iterate over this so that we only create output arrays when they are used
        if oa is None:
            ret_output_arrays.append(numpy.empty((grid_rows, grid_cols), dtype=ia.dtype))
            # we should return the numpy arrays in the main function since the user didn't provide any
            output_arrays[idx] = ret_output_arrays[-1]
        elif isinstance(oa, str):
            ret_output_arrays.append(numpy.memmap(oa, shape=(grid_rows, grid_cols),
                                                  dtype=input_dtype, mode='w+'))
        else:
            ret_output_arrays.append(oa)

        if group_size is None or len(ret_input_arrays) >= group_size:
            LOG.debug("Yielding group of size %d because group size is %d",
                      len(ret_input_arrays), group_size)
            yield tuple(ret_input_arrays), tuple(ret_output_arrays)
            ret_input_arrays = []
            ret_output_arrays = []

    if len(ret_input_arrays):
        LOG.debug("Yielding remaining group items to process for EWA resampling")
        yield tuple(ret_input_arrays), tuple(ret_output_arrays)
def next(self):  # for python 2.x
    # Keep under lock only the mechanism which advances the indexing of each batch, see
    # http://anandology.com/blog/using-iterators-and-generators/
    with self.lock:
        song_idx, self.cur_song = self.cur_song, self.cur_song+1

    bX, bY = (None, None)
    if song_idx < self.n_songs:
        x_path = self.data[self.sidstr[song_idx]]['X_path']
        y_path = self.data[self.sidstr[song_idx]]['y_path']
        bX = np.memmap(
            x_path,
            dtype='float32',
            mode='r',
            shape=tuple(self.data[self.sidstr[song_idx]]['X_shape'])
        )
        bY = np.memmap(
            y_path,
            dtype='float32',
            mode='r',
            shape=tuple(self.data[self.sidstr[song_idx]]['y_shape'])
        )
        return bX, bY
    else:
        raise StopIteration()

    return bX, bY
def __init__(self, fdata, fndata):
    # dump to binary
    print('Initialize binary from ' + fdata)
    if not os.path.isfile(fndata):
        print('create Epix100a flat file ' + fndata + ' from ' + fdata)
        datain = Epix100a(fdata)
        # write header
        binheader = np.zeros(16).astype(np.uint32)
        binheader[0:6] = [datain.nframes, datain.my*datain.mx, datain.my,
                          datain.mx, datain.nblocks, datain.nbcols]
        binheader.tofile(fndata)
        # write data
        dataout = np.memmap(fndata, dtype=np.int16, mode='r+',
                            shape=(datain.nframes, datain.my, datain.mx), offset=64)
        t0 = time.clock()
        for iframe in range(datain.nframes):
            dataout[iframe] = datain.frame(iframe)
            if (iframe % 100 == 0):
                #progress(iframe,nframes,iframe);
                print(str(iframe)+' - '+str(1000*(time.clock()-t0)/(iframe+1))
                      + ' ms. average frame: '+str(np.mean(datain.frame(iframe))))
        dataout.flush()
        del dataout
        del datain
    else:
        print(fndata + ' file already exists.')
    # get nr of frames
    data = np.memmap(fndata, dtype=np.uint32, mode='r', shape=((64)), offset=0)
    self.nframes = data[0]
    self.nframesize = data[1]
    self.my = data[2]
    self.mx = data[3]
    self.nblocks = data[4]
    self.nbcols = data[5]
    self.data = np.memmap(fndata, dtype=np.int16, mode='c',
                          shape=(self.nframes, self.my, self.mx), offset=64)
def extract_to_memmap(self):
    """
    Allocate a memmap, fill it with extracted features, return r/o view.
    """
    filename = self.filename
    feature_shp = self.feature_shp
    print('Creating memmap %s for features of shape %s' % (
        filename, str(feature_shp)))
    features_fp = np.memmap(filename,
                            dtype='float32',
                            mode='w+',
                            shape=feature_shp)
    info = open(filename + '.info', 'w')
    cPickle.dump(('float32', feature_shp), info)
    del info
    self.extract_to_storage(features_fp)
    # -- docs here:
    #    http://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html
    #    say that deletion is the way to flush changes !?
    del features_fp
    rval = np.memmap(self.filename,
                     dtype='float32',
                     mode='r',
                     shape=feature_shp)
    return rval
def __next__(self):
    # check to see if at end of chunks
    if self._chunk_counter == self.num_chunks:
        offset = int(self._chunk_counter * self.chunksize)
        row_size = self.rmndr_row_size
        self._chunk_counter += 1
    elif self._chunk_counter < self.num_chunks:
        offset = int(self._chunk_counter * self.chunksize)
        end_dp = (self._chunk_counter+1) + self.chunksize
        row_size = self.chunk_row_size
        self._chunk_counter += 1
    elif self._chunk_counter > self.num_chunks:
        raise StopIteration

    if self.abr.header['f_structure']['nDataFormat'][0] == 1:  # float data
        data = memmap(self.abr.fid, dtype=float32, shape=(row_size, self.ncols),
                      offset=offset+self.offset_base)
        return data
    elif self.abr.header['f_structure']['nDataFormat'][0] == 0:  # integer data
        try:
            data = memmap(self.abr.fid, dtype=int16, shape=(row_size, self.ncols),
                          mode='r', offset=offset + self.offset_base)
        except ValueError:
            pdb.set_trace()
        data = data[:].astype(float32)
        data = self.abr.scale_int_data(data)
        return data
def get_sequence(mraw_path, file_shape, nmax=None, offset=0):
    '''
    Get a sequence of image files as 3D numpy array.

    :param mraw_path: path to .mraw file containing image data
    :param file_shape: tuple, (ntotal, height, width) of images in .mraw file
    :param nmax: maximum number of images in sequence
    :param offset: First image to be read
    :return: 3D array of image sequence
    '''
    ntotal, h, w = file_shape
    byte_size = 2*h*w                 # Number of bytes for one image
    byte_offset = offset * byte_size  # Offset to first byte to be read

    # If only a single image was requested:
    if nmax and nmax == 1:
        with open(mraw_path, 'rb') as mraw:
            imarray = np.memmap(mraw, dtype=np.uint16, offset=byte_offset,
                                mode='r', shape=(h, w))
    # Only display nmax or less images:
    elif nmax and ntotal > nmax:
        image_step = ntotal//nmax
        with open(mraw_path, 'rb') as mraw:
            memmap = np.memmap(mraw, dtype=np.uint16, offset=byte_offset,
                               mode='r', shape=(ntotal-offset, h, w))
            imarray = memmap[::image_step, :, :]
    # If there are less than nmax images:
    else:
        with open(mraw_path, 'rb') as mraw:
            imarray = np.memmap(mraw, dtype=np.uint16, offset=byte_offset,
                                mode='r', shape=(ntotal-offset, h, w))
    return imarray
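# A short usage sketch for get_sequence above, assuming a 16-bit .mraw recording whose
# frame count and geometry are known; the file name and shape below are hypothetical.
frames = get_sequence('recording.mraw', file_shape=(5000, 256, 512), nmax=100)
print(frames.shape)   # roughly nmax frames of 256x512 pixels, subsampled by image_step
print(frames.dtype)   # uint16, served lazily from disk via the underlying memmap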
def convert(cls, file_path):
    meta_path = file_path + '.meta'
    index_path = file_path + '.idx'
    edge_path = file_path + '.bin'

    with open(file_path, 'r') as f:
        nodes, edges = map(int, f.readline().split())
        nodes, edges = nodes + 1, edges + 1

        with open(meta_path, 'w+') as m:
            m.write('{} {}'.format(nodes, edges))

        index_map = np.memmap(index_path, dtype='uint32', mode='w+', shape=(nodes, 2))
        edge_map = np.memmap(edge_path, dtype='uint32', mode='w+', shape=(edges, 1))

        current = 0
        count = 0
        degree = 0
        for line in f:
            origin, destination = map(int, line.split())
            while current < origin:
                index_map[current] = (count - degree, degree)
                degree = 0
                current += 1
            if current == origin:
                degree += 1
            edge_map[count] = destination
            count += 1
        index_map[current] = (count - degree, degree)

        index_map.flush()
        edge_map.flush()
def main(A):
    """convolve the tau(mass) field, add in thermal broadening and redshift distortion """
    sightlines = Sightlines(A)
    maker = SpectraMaker(A, sightlines)
    fgpa = FGPAmodel(A)

    Npixels = sightlines.Npixels.sum()

    spectaureal = numpy.memmap(A.SpectraOutputTauReal, mode='w+', dtype='f4',
                               shape=Npixels)
    spectaured = numpy.memmap(A.SpectraOutputTauRed, mode='w+', dtype='f4',
                              shape=Npixels)
    specdelta = numpy.memmap(A.SpectraOutputDelta, mode='w+', dtype='f4',
                             shape=Npixels)

    def work(i):
        sl2 = slice(sightlines.PixelOffset[i],
                    sightlines.PixelOffset[i] + sightlines.Npixels[i])
        result = maker.convolve(i, Afunc=fgpa.Afunc, Bfunc=fgpa.Bfunc)
        spectaureal[sl2] = result.taureal
        spectaured[sl2] = result.taured
        specdelta[sl2] = result.delta
        sightlines.Z_RED[i] = result.Zqso

    chunkmap(work, range(len(sightlines)), 100)

    spectaureal.flush()
    spectaured.flush()
    specdelta.flush()
    sightlines.Z_RED.flush()
def save_memmap(filenames, base_name='Yr', resize_fact=(1, 1, 1), remove_init=0, idx_xy=None, order='F', xy_shifts=None, is_3D=False, add_to_movie=0, border_to_0=0): """ Saves efficiently a list of tif files into a memory mappable file Parameters: ---------- filenames: list list of tif files or list of numpy arrays base_name: str the base used to build the file name. IT MUST NOT CONTAIN "_" resize_fact: tuple x,y, and z downampling factors (0.5 means downsampled by a factor 2) remove_init: int number of frames to remove at the begining of each tif file (used for resonant scanning images if laser in rutned on trial by trial) idx_xy: tuple size 2 [or 3 for 3D data] for selecting slices of the original FOV, for instance idx_xy = (slice(150,350,None), slice(150,350,None)) order: string whether to save the file in 'C' or 'F' order xy_shifts: list x and y shifts computed by a motion correction algorithm to be applied before memory mapping is_3D: boolean whether it is 3D data Returns: ------- fname_new: the name of the mapped file, the format is such that the name will contain the frame dimensions and the number of f """ # TODO: can be done online Ttot = 0 for idx, f in enumerate(filenames): if isinstance(f, str): print(f) if is_3D: #import tifffile # print("Using tifffile library instead of skimage because of 3D") Yr = f if isinstance(f, basestring) else tifffile.imread(f) if idx_xy is None: Yr = Yr[remove_init:] elif len(idx_xy) == 2: Yr = Yr[remove_init:, idx_xy[0], idx_xy[1]] else: Yr = Yr[remove_init:, idx_xy[0], idx_xy[1], idx_xy[2]] else: Yr = cm.load(f, fr=1, in_memory=True) if isinstance( f, basestring) else cm.movie(f) if xy_shifts is not None: Yr = Yr.apply_shifts(xy_shifts, interpolation='cubic', remove_blanks=False) if idx_xy is None: if remove_init > 0: Yr = np.array(Yr)[remove_init:] elif len(idx_xy) == 2: Yr = np.array(Yr)[remove_init:, idx_xy[0], idx_xy[1]] else: raise Exception('You need to set is_3D=True for 3D data)') Yr = np.array(Yr)[remove_init:, idx_xy[0], idx_xy[1], idx_xy[2]] if border_to_0 > 0: min_mov = Yr.calc_min() Yr[:, :border_to_0, :] = min_mov Yr[:, :, :border_to_0] = min_mov Yr[:, :, -border_to_0:] = min_mov Yr[:, -border_to_0:, :] = min_mov fx, fy, fz = resize_fact if fx != 1 or fy != 1 or fz != 1: if 'movie' not in str(type(Yr)): Yr = cm.movie(Yr, fr=1) Yr = Yr.resize(fx=fx, fy=fy, fz=fz) T, dims = Yr.shape[0], Yr.shape[1:] Yr = np.transpose(Yr, list(range(1, len(dims) + 1)) + [0]) Yr = np.reshape(Yr, (np.prod(dims), T), order='F') if idx == 0: fname_tot = base_name + '_d1_' + str(dims[0]) + '_d2_' + str( dims[1]) + '_d3_' + str( 1 if len(dims) == 2 else dims[2]) + '_order_' + str(order) if isinstance(f, str): fname_tot = os.path.join(os.path.split(f)[0], fname_tot) big_mov = np.memmap(fname_tot, mode='w+', dtype=np.float32, shape=(np.prod(dims), T), order=order) else: big_mov = np.memmap(fname_tot, dtype=np.float32, mode='r+', shape=(np.prod(dims), Ttot + T), order=order) big_mov[:, Ttot:Ttot + T] = np.asarray(Yr, dtype=np.float32) + 1e-10 + add_to_movie big_mov.flush() del big_mov Ttot = Ttot + T fname_new = fname_tot + '_frames_' + str(Ttot) + '_.mmap' os.rename(fname_tot, fname_new) return fname_new
def normed_patch_data_mat(raw_mat, save_dir,
                          mean_mode='global_channel', sdev_mode='global_channel',
                          file_name='normed_mat.npy', batch_size=0):

    modes = ('global_channel', 'global_feature', 'local_channel', 'local_full',
             'gc', 'gf', 'lc', 'lf', 'none')
    assert mean_mode in modes
    assert sdev_mode in modes or isinstance(sdev_mode, float)

    num_patches, n_channels, n_feats_per_channel = raw_mat.shape
    batch_size = batch_size if batch_size else num_patches
    assert num_patches % batch_size == 0

    data_mat = np.memmap(save_dir + file_name, dtype=np.float32, mode='w+',
                         shape=raw_mat.shape)

    # MEAN treatment ######
    if mean_mode in ('global_channel', 'gc'):
        channel_mean = np.mean(raw_mat, axis=(0, 2))
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            batch_t = np.transpose(batch, axes=(0, 2, 1))
            batch_t_centered = batch_t - channel_mean
            batch_centered = np.transpose(batch_t_centered, axes=(0, 2, 1))
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch_centered
        np.save(save_dir + 'data_mean.npy', channel_mean)

    elif mean_mode in ('global_feature', 'gf'):
        feature_mean = np.mean(raw_mat, axis=0)
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch - feature_mean
        np.save(save_dir + 'data_mean.npy', feature_mean)

    elif mean_mode in ('local_channel', 'lc'):
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            channel_mean = np.mean(batch, axis=2)  # shape=[n_patches, n_channels]
            batch_t = np.transpose(batch, axes=(2, 0, 1))
            batch_t_centered = batch_t - channel_mean
            batch_centered = np.transpose(batch_t_centered, axes=(1, 2, 0))
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch_centered

    elif mean_mode in ('local_full', 'lf'):
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            sample_mean = np.mean(batch, axis=(1, 2))
            batch_t = np.transpose(batch)
            batch_t_centered = batch_t - sample_mean
            batch_centered = np.transpose(batch_t_centered)
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch_centered

    else:  # mean_mode is 'none'
        pass

    # SDEV treatment ######
    if sdev_mode in ('global_channel', 'gc'):
        feat_sdev = np.std(raw_mat, axis=0)
        channel_sdev = np.mean(feat_sdev, axis=1)
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            batch = np.rollaxis(np.rollaxis(batch, axis=2, start=1) / channel_sdev, axis=2, start=1)
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch
        np.save(save_dir + 'data_sdev.npy', channel_sdev)

    elif sdev_mode in ('global_feature', 'gf'):
        feat_sdev = np.std(raw_mat, axis=0)
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            batch = batch / feat_sdev
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch
        np.save(save_dir + 'data_sdev.npy', feat_sdev)

    elif sdev_mode in ('local_channel', 'lc'):  # seems like a bad idea anyway
        raise NotImplementedError

    elif sdev_mode in ('local_full', 'lf'):  # this too
        for idx in range(num_patches // batch_size):
            batch = raw_mat[idx * batch_size:(idx + 1) * batch_size, :, :]
            sample_sdev = np.std(batch, axis=(1, 2))
            batch_t = np.transpose(batch)
            batch_t_scaled = batch_t / sample_sdev
            batch_scaled = np.transpose(batch_t_scaled)
            data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = batch_scaled

    elif isinstance(sdev_mode, float):
        data_mat[:, :, :] = sdev_mode * raw_mat[:, :, :]

    else:  # sdev_mode == 'none'
        pass

    del data_mat
    data_mat = np.memmap(save_dir + file_name, dtype=np.float32, mode='r',
                         shape=raw_mat.shape)

    return data_mat
if base_name is None:
    base_name = mmap_fnames[0]
    base_name = base_name[:base_name.find('_d1_')] + '-#-' + str(len(mmap_fnames))

fname_tot = (base_name + '_d1_' + str(dims[0]) + '_d2_' + str(dims[1]) +
             '_d3_' + str(1 if len(dims) == 2 else dims[2]) +
             '_order_' + str(order) + '_frames_' + str(tot_frames) + '_.mmap')
fname_tot = os.path.join(os.path.split(mmap_fnames[0])[0], fname_tot)
print(fname_tot)

big_mov = np.memmap(fname_tot, mode='w+', dtype=np.float32,
                    shape=(d, tot_frames), order='C')

step = np.int(old_div(d, n_chunks))
pars = []
for ref in range(0, d - step + 1, step):
    pars.append([fname_tot, d, tot_frames, mmap_fnames, ref, ref + step])
# last batch should include the leftover pixels
pars[-1][-1] = d

if dview is not None:
    if 'multiprocessing' in str(type(dview)):
        dview.map_async(save_portion, pars).get(9999999)
    else:
        dview.map_sync(save_portion, pars)
def get_data(size, vecsize):
    trainX = np.memmap('../data/prepared/TrainMap', dtype='float', mode='r',
                       shape=(size, vecsize))
    return trainX
def overlay_with_grid(image_path, pred_path, image_save_path, downsampled_image_save_path,
                      label_save_path, shape, show=False):
    # use one of the following based on the size of the image; if image is huge, go with the first one!
    ##########################################################################
    (H, W, C) = shape
    full_image = np.memmap(image_path, dtype=np.uint16, mode='r', shape=(H, W, C))  # .transpose(1,0,2)
    full_label = np.memmap(pred_path, dtype=np.uint8, mode='r', shape=(H, W))  # .transpose(1,0)
    x_start = 64 * 3
    y_start = 64 * 3
    x_end = x_start + 64 * 10
    y_end = y_start + 64 * 10
    image = full_image.copy()[y_start:y_end, x_start:x_end, :]
    # ex_array = []
    # for t in range(4, -1, -1):
    #     temp = np.expand_dims(image[:, :, t], 2)
    #     ex_array.append(temp)
    # image = np.dstack(ex_array)  # do this for more than 3 channels
    show_image = image[:, :, :3]
    image = np.dstack((show_image[:, :, 2], show_image[:, :, 1], show_image[:, :, 0]))
    #################################
    label = full_label.copy()[y_start:y_end, x_start:x_end]
    # image = np.load(image_path, mmap_mode='r')
    # label = np.load(pred_path, mmap_mode='r')
    # print(image)
    ###########################################################################
    # x_start = 64 * 140
    # y_start = 64 * 10
    # x_end = x_start + 64 * 10
    # y_end = y_start + 64 * 10
    # image = image[y_start:y_end,x_start:x_end,:]
    # label = label[y_start:y_end,x_start:x_end]
    ###########################################################################
    # colored_label = convert_to_colors(label)
    my_dpi = 300

    # Set up figure
    fig = pl.figure(figsize=(float(image.shape[0])/my_dpi, float(image.shape[1])/my_dpi), dpi=my_dpi)
    ax = fig.add_subplot(111)

    # Remove whitespace from around the image
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)

    # Set the gridding interval: here we use the major tick interval
    myInterval = 64.
    loc = plticker.MultipleLocator(base=myInterval)
    ax.xaxis.set_major_locator(loc)
    ax.yaxis.set_major_locator(loc)

    # Add the grid
    ax.grid(which='major', axis='both', linestyle='-', color='g')

    # Add the image
    ax.imshow(image)

    # Find number of gridsquares in x and y direction
    nx = abs(int(float(ax.get_xlim()[1] - ax.get_xlim()[0]) / float(myInterval)))
    ny = abs(int(float(ax.get_ylim()[1] - ax.get_ylim()[0]) / float(myInterval)))

    # Add some labels to the gridsquares
    mt.rcParams.update({'font.size': 2})
    for j in range(ny):
        y = myInterval / 2 + j * myInterval
        for i in range(nx):
            x = myInterval / 2. + float(i) * myInterval
            # ax.text(x, y, '{:d}'.format(i + j * nx), color='w', ha='center', va='center').set_color('red')
            # find the label at this point
            this_label = label[int(y), int(x)]
            ax.text(x, y, '{}'.format(all_labels_inverted[this_label]),
                    color='w', ha='center', va='center').set_color('yellow')

    # Save the figure
    fig.savefig(image_save_path, dpi=my_dpi)

    ############ we will save downsampled images to show how it relates to pixel-wise results ##########
    # save colored label as well
    # 1. reduce 64*64 blocks, 2. apply filter to remove segmentation noise, 3. convert labels to colors
    colored_labels = block_reduce(full_label, block_size=(64, 64), func=np.max)
    # colored_labels = cv2.medianBlur(colored_labels, ksize=3)
    filtered_forest = colored_labels[colored_labels == 1].sum()/colored_labels.reshape(-1).shape[0]*100
    colored_labels = convert_to_colors(colored_labels, flag='forest')
    # print(set( tuple(v) for m2d in colored_labels for v in m2d ))  # if you want to check unique colors
    pl.imsave(label_save_path, colored_labels)

    # downsample and save input image
    downsampled_input_image = block_reduce(full_image, block_size=(64, 64, 1), func=np.max)
    # downsampled_image_path = os.path.join(os.path.splitext(image_save_path)[0]+'_down.png')
    pl.imsave(downsampled_image_save_path, downsampled_input_image)

    if show:
        pl.show()

    return filtered_forest
    'img_q_id_test',
    'question_test',
    'choices_test',
]

# load words
ID_PKL = pickle.load(open(data_prefix+paths[0]+'.pkl', 'rb'))
QUESTION_PKL = pickle.load(open(data_prefix+paths[1]+'.pkl', 'rb'))
CHOICE_PKL = pickle.load(open(data_prefix+paths[2]+'.pkl', 'rb'))

# load picture features
IM_ID = pickle.load(open('../Data/val2014/ID.pkl', 'rb'))
IM_ID_DICT = dict()
for num in xrange(len(IM_ID)):
    ID = IM_ID[num].split('_')[2].split('.')[0]
    IM_ID_DICT[ID] = num

mem_shape = (40504, 1, 1000)
mem_image = np.memmap('../Data/val2014/vgg_feats.memmap', dtype='float32',
                      mode='r', shape=mem_shape)

#===== prepare pickList =====
pickList = range(0, len(ID_PKL))
numToC = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
answers = []

# maybe this will help? -- Ray.
# this really helps, thanks! -- Angus.
#print '{0:{fill}{align}12}'.format(ID_PKL[0][0],fill='0',align='>')
# printing 300-dim word vector
#print word_vec[ QUESTION_PKL[0][0] ]

print "start making model..."
model = Keras_model.keras_model(20)
model.load_weights(MODEL_NAME)

#===== Start training =====
print "Start testing!"
def make_flattened_patch_data(num_patches, ph, pw, classifier, map_name, n_channels,
                              n_feats_white, whiten_mode='pca', batch_size=100,
                              mean_mode='local_full', sdev_mode='global_feature',
                              raw_mat_load_path='', n_val_patches=0):
    """
    creates whitening, covariance, raw and whitened feature matrices for separate channels.
    all data is saved as [n_patches, n_channels, n_features_per_channel]
    """
    save_dir = make_data_dir(map_name, ph, pw, mean_mode, sdev_mode, n_feats_white,
                             classifier=classifier)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if raw_mat_load_path:
        raw_mat = np.memmap(raw_mat_load_path, dtype=np.float32, mode='r',
                            shape=(num_patches, n_channels, ph * pw))
    else:
        raw_mat = raw_patch_data_mat(map_name, classifier, num_patches, ph, pw,
                                     batch_size, n_channels, save_dir)
    print('raw mat done')

    norm_mat = normed_patch_data_mat(raw_mat, save_dir, mean_mode=mean_mode, sdev_mode=sdev_mode)
    print('normed mat done')

    print('mat dims pre flatten:', norm_mat.shape)
    flat_mat = norm_mat.reshape([num_patches, -1])

    cov = flattened_cov_acc(flat_mat, save_dir)
    print('cov done')

    whiten, unwhiten = flattened_whitening_mats(cov, whiten_mode, save_dir, n_feats_white)
    print('whitening mats done')

    data_mat = np.memmap(save_dir + 'data_mat_' + whiten_mode + '_whitened.npy',
                         dtype=np.float32, mode='w+',
                         shape=(num_patches, n_feats_white))

    for idx in range(num_patches // batch_size):
        image = flat_mat[idx * batch_size:(idx + 1) * batch_size, :]  # [bs, n_f]
        # whiten is [n_fw, n_f], target [bs, n_fw]
        data_mat[idx * batch_size:(idx + 1) * batch_size, :] = image @ whiten.T  # [bs, n_f] x [n_f, n_fw] = [bs, n_fw]

    print('whitened data done')

    if n_val_patches > 0:
        add_flattened_validation_set(n_val_patches, ph, pw, classifier, map_name,
                                     n_channels, n_feats_white, whiten_mode,
                                     batch_size, mean_mode, sdev_mode)
def make_channel_separate_patch_data(num_patches, ph, pw, classifier, map_name, n_channels,
                                     n_feats_per_channel_white, whiten_mode='pca', batch_size=100,
                                     mean_mode='global_channel', sdev_mode='global_channel',
                                     raw_mat_load_path='', n_val_patches=0):
    """
    creates whitening, covariance, raw and whitened feature matrices for separate channels.
    They are saved as 3d matrices where the first dimension is the channel index
    """
    save_dir = make_data_dir(map_name, ph, pw, mean_mode, sdev_mode,
                             n_features_white=n_feats_per_channel_white, classifier=classifier)
    save_dir = save_dir.rstrip('/') + '_channelwise/'

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if raw_mat_load_path:
        raw_mat = np.memmap(raw_mat_load_path, dtype=np.float32, mode='r',
                            shape=(num_patches, n_channels, ph * pw))
    else:
        raw_mat = raw_patch_data_mat(map_name, classifier, num_patches, ph, pw,
                                     batch_size, n_channels, save_dir)
    print('raw mat done')

    norm_mat = normed_patch_data_mat(raw_mat, save_dir, mean_mode=mean_mode, sdev_mode=sdev_mode)
    print('normed mat done')

    cov = channel_independent_cov_acc(norm_mat, save_dir)
    print('cov done')

    n_dims_to_drop = ph * pw - n_feats_per_channel_white
    channel_whiten, channel_unwhiten = channel_independent_whitening_mats(
        cov, whiten_mode, save_dir, n_dims_to_drop=n_dims_to_drop)
    print('whitening mats done')

    data_mat = np.memmap(save_dir + 'data_mat_' + whiten_mode + '_whitened_channelwise.npy',
                         dtype=np.float32, mode='w+',
                         shape=(num_patches, n_channels, channel_whiten.shape[1]))

    for idx in range(num_patches // batch_size):
        image = norm_mat[idx * batch_size:(idx + 1) * batch_size, :, :]  # [bs, n_c, n_fpc]
        # channel_whiten is [n_c, n_fpcw, n_fpc], target [bs, n_c, n_fpcw]
        image = np.expand_dims(image, axis=3)  # [bs, n_c, n_fpc, 1]
        # [n_c, n_fpcw, n_fpc] x [n_c, n_fpc, 1] = [n_c, n_fpcw]
        data_mat[idx * batch_size:(idx + 1) * batch_size, :, :] = np.squeeze(channel_whiten @ image)

    print('whitened data done')

    if n_val_patches > 0:
        add_channelwise_validation_set(n_val_patches, ph, pw, classifier, map_name, n_channels,
                                       n_feats_per_channel_white, whiten_mode,
                                       batch_size, mean_mode, sdev_mode)
def __init__( self, ds_root, # pre-processed dataset root directory (where to find .dat files) mode='train', # mode of use of the dataset object (may be 'train', 'validation' or 'test') n_samples=None, # number of samples to consider (used just to access the right pre-processed files) return_malicious=True, # whether to return the malicious label for the data point or not return_counts=True, # whether to return the counts for the data point or not return_tags=True, # whether to return the tags for the data points or not return_shas=False ): # whether to return the sha256 of the data points or not """ Initialize Dataset class. Args: ds_root: Pre-processed dataset root directory (where to find .dat files) mode: Mode of use of the dataset object (it may be 'train', 'validation' or 'test') (default: 'train') n_samples: Number of samples to consider (used just to access the right pre-processed files) (default: None) return_malicious: Whether to return the malicious label for the data point or not (default: True) return_counts: Whether to return the counts for the data point or not (default: True) return_tags: Whether to return the tags for the data points or not (default: True) return_shas: Whether to return the sha256 of the data points or not (default: False) """ self.return_counts = return_counts self.return_tags = return_tags self.return_malicious = return_malicious self.return_shas = return_shas # if mode is not in one of the expected values raise an exception if mode not in {'train', 'validation', 'test'}: raise ValueError('invalid mode {}'.format(mode)) # if n_samples is not set or it is <= 0 -> set it to the max if n_samples is None or n_samples <= 0: n_samples = total_n_samples[mode] # set feature dimension ndim = 2381 # set labels dimension to 1 (malware) + 1 (count) + n_tags (tags) labels_dim = 1 + 1 + len(Dataset.tags) # generate X (features vector), y (labels vector) and S (shas) file names X_path = os.path.join(ds_root, "X_{}_{}.dat".format(mode, n_samples)) y_path = os.path.join(ds_root, "y_{}_{}.dat".format(mode, n_samples)) S_path = os.path.join(ds_root, "S_{}_{}.dat".format(mode, n_samples)) # log error and exit if at least one of the dataset files (X, y, S) does not exist if not (os.path.exists(X_path) and os.path.exists(y_path) and os.path.exists(S_path)): logger.error( "X, y, S files for mode {} and amount {} not found.".format( mode, n_samples)) sys.exit(1) logger.info('Opening Dataset at {} in {} mode.'.format(ds_root, mode)) # open S (shas) memory map in Read+ mode (+ because pytorch does not support read only ndarrays) self.S = np.memmap(S_path, dtype=np.dtype('U64'), mode="r+") # get number of elements from S vector self.N = self.S.shape[0] # open y (labels) memory map in Read+ mode (+ because pytorch does not support read only ndarrays) self.y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=(self.N, labels_dim)) # open X (features) memory map in Read+ mode (+ because pytorch does not support read only ndarrays) self.X = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(self.N, ndim)) logger.info("{} samples loaded.".format(self.N))
PATCH_SIZE = 256

slice_size = int(np.ceil(np.sqrt(2*PATCH_SIZE**2)))
# ensure slice_size is compatible with several (here 5) maxpool operations
slice_size += 32 - slice_size % 32

expected_n_samples = 70000

patient_markers = np.loadtxt("../../patient_markers.txt").astype(np.int32)

memmap_shape = (expected_n_samples, 25, slice_size, slice_size)
info_memmap_shape = (expected_n_samples, 4)
memmap_name = "patchSegmentation_allInOne_ws_t1km_flair_adc_cbv"
memmap_data = memmap("%s.memmap" % (memmap_name), dtype=np.float32, mode="w+", shape=memmap_shape)
memmap_gt = memmap("%s_info.memmap" % (memmap_name), dtype=np.float32, mode="w+", shape=info_memmap_shape)


def add_patch_to_memmap(x, y, z, t1km_img, flair_img, adc_img, cbv_img, seg_combined,
                        slice_size, patient_id, patient_state, data_ctr):
    t1km_patch = t1km_img[z-2:z+3, x:x+slice_size, y:y+slice_size]
    flair_patch = flair_img[z-2:z+3, x:x+slice_size, y:y+slice_size]
    adc_patch = adc_img[z-2:z+3, x:x+slice_size, y:y+slice_size]
    cbv_patch = cbv_img[z-2:z+3, x:x+slice_size, y:y+slice_size]
    seg_patch = seg_combined[z-2:z+3, x:x+slice_size, y:y+slice_size]

    # no empty slices
    if len(np.unique(seg_patch[2])) == 1 and np.unique(seg_patch[2])[0] == 0:
        return data_ctr

    memmap_data[data_ctr, 0:5, :, :] = t1km_patch
    memmap_data[data_ctr, 5:10, :, :] = flair_patch
    memmap_data[data_ctr, 10:15, :, :] = adc_patch
    memmap_data[data_ctr, 15:20, :, :] = cbv_patch
                [mad_estimator.input, writer_1.input, peak_detector.get_input('data')])
manager.connect(mad_estimator.get_output('mads'),
                [peak_detector.get_input('mads'), writer_2.input])
manager.connect(peak_detector.get_output('peaks'), writer_3.input)

manager.start()
director.sleep(duration=5.0)
director.stop()
director.destroy()

start_mad = mad_estimator.start_step
neg_peak_file = writer_3.recorded_peaks['negative']
pos_peak_file = writer_3.recorded_peaks['positive']

x1 = numpy.memmap('/tmp/input.dat', dtype=numpy.float32, mode='r')
x1 = x1.reshape(x1.size / nb_channels, nb_channels)

neg_peaks = numpy.fromfile(neg_peak_file, dtype=numpy.int32)
neg_peaks = neg_peaks.reshape(neg_peaks.size / 2, 2)
pos_peaks = numpy.fromfile(pos_peak_file, dtype=numpy.int32)
pos_peaks = pos_peaks.reshape(pos_peaks.size / 2, 2)

mads = numpy.fromfile('/tmp/mads.dat', dtype=numpy.float32)
t_max = mads.size / nb_channels
mads = mads[:t_max * nb_channels].reshape(t_max, nb_channels)

channel_to_show = 0
t_stop = (start_mad + 10) * nb_samples
def raw_patch_data_mat(map_name, classifier, num_patches, ph, pw, batch_size, n_channels, save_dir, file_name='raw_mat.npy'): """ create (num_patches, n_channels, feats per channel) matrix of extracted patches """ assert num_patches % batch_size == 0 if classifier.lower() == 'vgg16': classifier = Vgg16() image_subdir = 'images_resized_224/' img_dims = [batch_size, 224, 224, 3] elif classifier.lower() == 'alexnet': classifier = AlexNet() image_subdir = 'images_resized_227/' img_dims = [batch_size, 227, 227, 3] else: raise NotImplementedError file_path = save_dir + file_name with tf.Graph().as_default() as graph: with tf.Session() as sess: img_pl = tf.placeholder(dtype=tf.float32, shape=img_dims, name='img_pl') classifier.build(img_pl, rescale=1.0) feat_map = graph.get_tensor_by_name(map_name) map_dims = [d.value for d in feat_map.get_shape()] n_feats_per_channel = ph * pw # n_features = n_feats_per_channel * map_dims[3] data_path = '../data/imagenet2012-validationset/' img_file = 'train_48k_images.txt' raw_mat = np.memmap(file_path, dtype=np.float32, mode='w+', shape=(num_patches, n_channels, n_feats_per_channel)) max_h = map_dims[1] - ph max_w = map_dims[2] - pw with open(data_path + img_file) as f: image_files = [k.rstrip() for k in f.readlines()] image_paths = [ data_path + image_subdir + k[:-len('JPEG')] + 'bmp' for k in image_files ] img_mat = np.zeros(shape=img_dims) for count in range(num_patches // batch_size): for idx in range(batch_size): img_path = image_paths[idx + (count * batch_size) % len(image_paths)] img_mat[idx, :, :, :] = load_image(img_path, resize=False) if count == 0: print('Verifying scale - this should be around 255: ', np.max(img_mat)) map_mat = sess.run(feat_map, feed_dict={img_pl: img_mat}) for idx in range(batch_size): h = np.random.randint(0, max_h) w = np.random.randint(0, max_w) map_patch = np.transpose(map_mat[idx, h:h + ph, w:w + pw, :], axes=(2, 0, 1)) map_patch = map_patch.reshape([n_channels, -1]).astype(np.float32) raw_mat[idx + (count * batch_size), :, :] = map_patch del raw_mat raw_mat = np.memmap(file_path, dtype=np.float32, mode='r', shape=(num_patches, n_channels, n_feats_per_channel)) return raw_mat
def main(args, _=None): """Run the ``catalyst-data text2embeddings`` script.""" batch_size = args.batch_size num_workers = args.num_workers max_length = args.max_length pooling_groups = args.pooling.split(",") utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) if hasattr(args, "in_huggingface"): model_config = BertConfig.from_pretrained(args.in_huggingface) model_config.output_hidden_states = args.output_hidden_states model = BertModel.from_pretrained(args.in_huggingface, config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_huggingface) else: model_config = BertConfig.from_pretrained(args.in_config) model_config.output_hidden_states = args.output_hidden_states model = BertModel(config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_vocab) if hasattr(args, "in_model"): checkpoint = utils.load_checkpoint(args.in_model) checkpoint = {"model_state_dict": checkpoint} utils.unpack_checkpoint(checkpoint=checkpoint, model=model) model = model.eval() model, _, _, _, device = utils.process_components(model=model) df = pd.read_csv(args.in_csv) df = df.dropna(subset=[args.txt_col]) df.to_csv(f"{args.out_prefix}.df.csv", index=False) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) num_samples = len(df) open_fn = LambdaReader( input_key=args.txt_col, output_key=None, lambda_fn=partial( tokenize_text, strip=args.strip, lowercase=args.lowercase, remove_punctuation=args.remove_punctuation, ), tokenizer=tokenizer, max_length=max_length, ) dataloader = utils.get_loader( df, open_fn, batch_size=batch_size, num_workers=num_workers, ) features = {} dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for idx, batch in enumerate(dataloader): batch = utils.any2device(batch, device) bert_output = model(**batch) mask = (batch["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None) if utils.is_wrapped_with_ddp(model): # using several gpu hidden_size = model.module.config.hidden_size hidden_states = model.module.config.output_hidden_states else: # using cpu or one gpu hidden_size = model.config.hidden_size hidden_states = model.config.output_hidden_states features_ = process_bert_output( bert_output=bert_output, hidden_size=hidden_size, output_hidden_states=hidden_states, pooling_groups=pooling_groups, mask=mask, ) # create storage based on network output if idx == 0: for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" _, embedding_size = value.shape features[name_] = np.memmap( f"{args.out_prefix}.{name_}.npy", dtype=np.float32, mode="w+", shape=(num_samples, embedding_size), ) indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples)) for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" features[name_][indices] = _detach(value)
def do_cuts(args): from root_optimize.timing import secondsToStr # before doing anything, let's ensure the directory we make is ok if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) elif args.overwrite: import shutil shutil.rmtree(args.output_directory) else: raise IOError("Output directory already exists: {0:s}".format(args.output_directory)) # first step is to group by the sample DID dids = defaultdict(list) for fname in args.files: dids[utils.get_did(fname)].append(fname) # load in the supercuts file supercuts = utils.read_supercuts_file(args.supercuts) # load up the weights file if not os.path.isfile(args.weightsFile): raise ValueError('The supplied weights file `{0}` does not exist or I cannot find it.'.format(args.weightsFile)) else: weights = json.load(file(args.weightsFile)) # parallelize num_cores = min(multiprocessing.cpu_count(), args.num_cores) logger.log(25, "Using {0} cores".format(num_cores) ) pids = None # if pids is None, do_cut() will disable the progress if not args.hide_subtasks: from numpy import memmap, uint64 pids = memmap(os.path.join(tempfile.mkdtemp(), 'pids'), dtype=uint64, shape=num_cores, mode='w+') overall_progress = tqdm.tqdm(total=len(dids), desc='Num. files', position=0, leave=True, unit='file', dynamic_ncols=True) class CallBack(object): completed = defaultdict(int) def __init__(self, index, parallel): self.index = index self.parallel = parallel def __call__(self, index): CallBack.completed[self.parallel] += 1 overall_progress.update() overall_progress.refresh() if self.parallel._original_iterable: self.parallel.dispatch_next() import joblib.parallel joblib.parallel.CallBack = CallBack results = Parallel(n_jobs=num_cores)(delayed(utils.do_cut)(did, files, supercuts, weights, args.tree_name, args.output_directory, args.eventWeightBranch, args.numpy, pids) for did, files in dids.iteritems()) overall_progress.close() for did, result in zip(dids, results): logger.log(25, 'DID {0:s}: {1:s}'.format(did, 'ok' if result[0] else 'not ok')) logger.log(25, "Total CPU elapsed time: {0}".format(secondsToStr(sum(result[1] for result in results)))) return True
# t1 = time.clock()
# print(t1)
for epoch in range(epoch_num):
    print("############### Epoch", str(epoch + 1), " #################")
    train_loss_online = []
    train_loss = []
    dev_loss = []
    print("______________Training_________________")
    for i in range(6):  ###needmod 20
        print("=============================================")
        x_train_file = xFile + str(i + 1) + ".dat"
        # print("Loading x training data from", x_train_file)
        # x_train = np.memmap(x_train_file, dtype='float', mode='r', shape=(350, 128, 64, 64, 1))
        x_train_load = np.memmap(x_train_file, dtype='float', mode='r', shape=(350, 128, 64, 64, 1))
        x_train = x_train_load.copy()
        del x_train_load
        print(x_train.shape)
        y_train_file = yFile + str(i + 1) + ".npy"
        # print("Loading y training data from", y_train_file)
        y_train_load = np.load(y_train_file)
        y_train = y_train_load.copy()
        del y_train_load
        # print(y_train.shape)
        # # TODO: Delete
        # x_train = x_train[:100]  ###
        # y_train = y_train[:100]  ###
        print("Training model on training data", str(i + 1) + "/6 ...")
def GetData(filename):
    # Map the file read-only as 16-bit samples ('h' == int16), skipping the first
    # 44 bytes (the size of a canonical WAV header).
    return np.memmap(filename, dtype='h', mode='r', offset=44)
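# Usage sketch (not from the original source): because GetData returns a numpy.memmap,
# slices are read from disk lazily, so statistics can be computed without loading the
# whole file into RAM. 'recording.wav' and the 48000-sample window are hypothetical.
import numpy as np

samples = GetData('recording.wav')             # lazily mapped int16 samples
window = samples[:48000]                       # touches only the first 48000 samples on disk
print(np.abs(window.astype(np.int32)).max())   # peak amplitude of that window (cast avoids int16 overflow)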
MODEL += '_normed'
PKL_ID = './ID.pkl'
#MEM_DATA = 'data.fbank.memmap'
PGRAM_ROOT = 'dnn_result/posteriorgram/'
DNN_MODEL = 'Angus_2'
MEM_PGRAM = PGRAM_ROOT + DNN_MODEL + '.pgram'
MEM_LABEL = 'label.memmap'
MEM_PGRAM_shape = (1124823, 48)
STATE_LENGTH = 1943
PHONE_LENGTH = 48
#LABEL_VARIETY = 1943
LABEL_VARIETY = 48

print "Reading data..."
mem_pgram = np.memmap(MEM_PGRAM, dtype='float32', mode='r', shape=MEM_PGRAM_shape)
mem_label = np.memmap(MEM_LABEL, dtype='int16', mode='r', shape=(1124823, ))
IDs = readID(PKL_ID)

idx = 0
IDs_utter = []
while idx <= len(IDs) - 1:
    IDs_utter.append(["_".join(IDs[idx][0].split('_')[0:2]), IDs[idx][1]])  #IDs_utter = [utter_name, utter_max]
    idx += IDs[idx][1]

print "Preparing pickList..."
pickList = range(0, len(IDs_utter))
pickList = shuffle(pickList)
frame_max = max(IDs_utter, key=lambda x: x[1])
train_data_length = len(pickList) * VAL_SET_RATIO
def file_reader(filename, endianess='<', **kwds): metadata = {} f = open(filename, 'rb') std_header = np.fromfile(f, dtype=get_std_dtype_list(endianess), count=1) fei_header = None if std_header['NEXT'] / 1024 == 128: print "It seems to contain an extended FEI header" fei_header = np.fromfile(f, dtype=get_fei_dtype_list(endianess), count=1024) if f.tell() == 1024 + std_header['NEXT']: print "The FEI header was correctly loaded" else: print "There was a problem reading the extended header" f.seek(1024 + std_header['NEXT']) fei_header = None NX, NY, NZ = std_header['NX'], std_header['NY'], std_header['NZ'] data = np.memmap(f, mode='c', offset=f.tell(), dtype=get_data_type(std_header['MODE'], endianess)).squeeze().reshape( (NX, NY, NZ), order='F').T original_metadata = {'std_header': sarray2dict(std_header)} if fei_header is not None: fei_dict = sarray2dict(fei_header, ) del fei_dict['empty'] original_metadata['fei_header'] = fei_dict dim = len(data.shape) if fei_header is None: # The scale is in Amstrongs, we convert it to nm scales = [ 10 * float(std_header['Zlen'] / std_header['MZ']) if float(std_header['MZ']) != 0 else 1, 10 * float(std_header['Ylen'] / std_header['MY']) if float(std_header['MY']) != 0 else 1, 10 * float(std_header['Xlen'] / std_header['MX']) if float(std_header['MX']) != 0 else 1, ] offsets = [ 10 * float(std_header['ZORIGIN']), 10 * float(std_header['YORIGIN']), 10 * float(std_header['XORIGIN']), ] else: # FEI does not use the standard header to store the scale # It does store the spatial scale in pixel_size, one per angle in # meters scales = [ 1, ] + [ fei_header['pixel_size'][0] * 10**9, ] * 2 offsets = [ 0, ] * 3 units = [Undefined, 'nm', 'nm'] names = ['z', 'y', 'x'] metadata = { 'General': { 'original_filename': os.path.split(filename)[1] }, "Signal": { 'signal_type': "", 'record_by': 'image', }, } # create the axis objects for each axis axes = [{ 'size': data.shape[i], 'index_in_array': i, 'name': names[i + 3 - dim], 'scale': scales[i + 3 - dim], 'offset': offsets[i + 3 - dim], 'units': units[i + 3 - dim], } for i in xrange(dim)] dictionary = { 'data': data, 'axes': axes, 'metadata': metadata, 'original_metadata': original_metadata, } return [ dictionary, ]
                if 'EC_number' in feature.qualifiers.keys():
                    eci = eci + 1
        output[i] = eci
        print d, 'has', eci, 'features'

    ## For some assemblies an error is raised on a second(?) record identified
    ## in the Genbank file.  It isn't clear why this is happening; pass the error
    ## here.
    except AttributeError:
        pass

sums = np.memmap(open('tmp.paprica.mmp', 'w+b'), shape=genome_data.index.shape[0], dtype='uint64')
Parallel(n_jobs=-1)(delayed(count_ec)(sums, i) for i in range(0, len(genome_data.index)))

eci = int(sums.sum() * 2)  # count_ec is undercounting and it is not clear why; multiply by 2 to ensure a large enough array.

## Delete mmp
os.remove('tmp.paprica.mmp')

## Create numpy array for data and a 1D array that will become dataframe index.
## You can probably parallelize this as above, but going to take some effort.
img_list = []
count_imgs = [0] * 3
with open('./data/celeba/list_eval_partition.txt', 'r') as fp:
    for line in fp:
        img_name, img_set = line.split(' ')
        img_set = int(img_set)
        count_imgs[img_set] = count_imgs[img_set] + 1
        img_list.append([img_name, img_set])

imgs_dir = './data/celeba/img_align_celeba/'
data_mean = 0.431751299266
data_std = 0.300219581459

train_x = np.memmap('.tmp_celeba_train.npy', np.float32, 'w+',
                    shape=(count_imgs[0] + count_imgs[1], 64, 64, 3))
test_x = np.memmap('.tmp_celeba_test.npy', np.float32, 'w+',
                   shape=(count_imgs[2], 64, 64, 3))

train_count = 0
test_count = 0
for i in img_list:
    img_name, img_set = i
    img = skimage.transform.resize(plt.imread(imgs_dir + img_name), (64, 64))
    img = (img - data_mean) / data_std
    if img_set == 2:
        test_x[test_count] = img
        test_count = test_count + 1
    else:
        train_x[train_count] = img
NSQUARES = int(sys.argv[1])

# Initialize
img = numpy.zeros((N, N), numpy.uint8)
# numpy.random.random_integers is deprecated; randint with an exclusive upper bound is equivalent
centers = numpy.random.randint(0, N + 1, size=(NSQUARES, 2))
radii = numpy.random.randint(0, N // 9, size=NSQUARES)
colors = numpy.random.randint(100, 255, size=NSQUARES)

# Generate squares
for i in range(NSQUARES):
    xindices = range(centers[i][0] - radii[i], centers[i][0] + radii[i])
    xindices = numpy.clip(xindices, 0, N - 1)
    yindices = range(centers[i][1] - radii[i], centers[i][1] + radii[i])
    yindices = numpy.clip(yindices, 0, N - 1)
    if len(xindices) == 0 or len(yindices) == 0:
        continue
    coordinates = numpy.meshgrid(xindices, yindices)
    # index with a tuple; indexing with a list of arrays is deprecated in recent NumPy
    img[tuple(coordinates)] = colors[i]

# Load into memory map
img.tofile('random_squares.raw')
img_memmap = numpy.memmap('random_squares.raw', shape=img.shape)

# Display image
matplotlib.pyplot.imshow(img_memmap)
matplotlib.pyplot.axis('off')
matplotlib.pyplot.show()
if x2y2 < R**2 and x2y2 > 0.0:
    weight = (0.25 / np.pi)**2
    zlim = np.sqrt(R**2 - x2y2)
    integral = integ.quad(integrand, -zlim, zlim, args=(kapparho, x2y2))[0]
    return weight * np.exp(-kapparho * zlim) * integral * kapparho * dx * dy
else:
    if x2y2 == 0.0 and args.direct_light:
        return 0.25 * np.exp(-kapparho * R) / np.pi
    else:
        return 0.0

# open the image file
data = np.memmap(args.file, dtype=np.float64, shape=(args.nx, args.ny), mode="r")

profile_data = data.reshape(-1)
xy = np.array(np.meshgrid(np.linspace(-1.0, 1.0, args.nx),
                          np.linspace(-1.0, 1.0, args.ny))).transpose()
xy = xy.reshape((-1, 2))
profile_radius = np.sqrt(xy[:, 0]**2 + xy[:, 1]**2)

image_xy = xy.reshape((args.nx, args.ny, 2))
image_data = data.reshape((args.nx, args.ny))

ra = np.linspace(0.0, 1.0, 100)
dx = 2.0 / args.nx
dy = 2.0 / args.ny
def _parse_header(self): with io.open(self.filename, 'rb') as fid: f = StructFile(fid) # Name f.seek(64) surname = f.read(22).strip(b' ') firstname = f.read(20).strip(b' ') # Date day, month, year, hour, minute, sec = f.read_f('bbbbbb', offset=128) rec_datetime = datetime.datetime(year + 1900, month, day, hour, minute, sec) Data_Start_Offset, Num_Chan, Multiplexer, Rate_Min, Bytes = f.read_f( 'IHHHH', offset=138) # header version header_version, = f.read_f('b', offset=175) assert header_version == 4 # area f.seek(176) zone_names = [ 'ORDER', 'LABCOD', 'NOTE', 'FLAGS', 'TRONCA', 'IMPED_B', 'IMPED_E', 'MONTAGE', 'COMPRESS', 'AVERAGE', 'HISTORY', 'DVIDEO', 'EVENT A', 'EVENT B', 'TRIGGER' ] zones = {} for zname in zone_names: zname2, pos, length = f.read_f('8sII') zones[zname] = zname2, pos, length assert zname == zname2.decode('ascii').strip(' ') # raw signals memmap sig_dtype = 'u' + str(Bytes) self._raw_signals = np.memmap(self.filename, dtype=sig_dtype, mode='r', offset=Data_Start_Offset).reshape( -1, Num_Chan) # Reading Code Info zname2, pos, length = zones['ORDER'] f.seek(pos) code = np.frombuffer(f.read(Num_Chan * 2), dtype='u2') units_code = { -1: 'nV', 0: 'uV', 1: 'mV', 2: 1, 100: 'percent', 101: 'dimensionless', 102: 'dimensionless' } sig_channels = [] sig_grounds = [] for c in range(Num_Chan): zname2, pos, length = zones['LABCOD'] f.seek(pos + code[c] * 128 + 2, 0) chan_name = f.read(6).strip(b"\x00").decode('ascii') ground = f.read(6).strip(b"\x00").decode('ascii') sig_grounds.append(ground) logical_min, logical_max, logical_ground, physical_min, physical_max = f.read_f( 'iiiii') k, = f.read_f('h') units = units_code.get(k, 'uV') factor = float(physical_max - physical_min) / float(logical_max - logical_min + 1) gain = factor offset = -logical_ground * factor f.seek(8, 1) sampling_rate, = f.read_f('H') sampling_rate *= Rate_Min chan_id = c group_id = 0 sig_channels.append((chan_name, chan_id, sampling_rate, sig_dtype, units, gain, offset, group_id)) sig_channels = np.array(sig_channels, dtype=_signal_channel_dtype) assert np.unique(sig_channels['sampling_rate']).size == 1 self._sampling_rate = float( np.unique(sig_channels['sampling_rate'])[0]) # Event channels event_channels = [] event_channels.append(('Trigger', '', 'event')) event_channels.append(('Note', '', 'event')) event_channels.append(('Event A', '', 'epoch')) event_channels.append(('Event B', '', 'epoch')) event_channels = np.array(event_channels, dtype=_event_channel_dtype) # Read trigger and notes self._raw_events = [] ev_dtypes = [ ('TRIGGER', [('start', 'u4'), ('label', 'u2')]), ('NOTE', [('start', 'u4'), ('label', 'S40')]), ('EVENT A', [('label', 'u4'), ('start', 'u4'), ('stop', 'u4')]), ('EVENT B', [('label', 'u4'), ('start', 'u4'), ('stop', 'u4')]), ] for zname, ev_dtype in ev_dtypes: zname2, pos, length = zones[zname] dtype = np.dtype(ev_dtype) rawevent = np.memmap(self.filename, dtype=dtype, mode='r', offset=pos, shape=length // dtype.itemsize) keep = (rawevent['start'] >= rawevent['start'][0]) & ( rawevent['start'] < self._raw_signals.shape[0]) & (rawevent['start'] != 0) rawevent = rawevent[keep] self._raw_events.append(rawevent) # No spikes unit_channels = [] unit_channels = np.array(unit_channels, dtype=_unit_channel_dtype) # fille into header dict self.header = {} self.header['nb_block'] = 1 self.header['nb_segment'] = [1] self.header['signal_channels'] = sig_channels self.header['unit_channels'] = unit_channels self.header['event_channels'] = event_channels # insert some annotation at some place 
self._generate_minimal_annotations()
bl_annotations = self.raw_annotations['blocks'][0]
seg_annotations = bl_annotations['segments'][0]

for d in (bl_annotations, seg_annotations):
    d['rec_datetime'] = rec_datetime
    d['firstname'] = firstname
    d['surname'] = surname
    d['header_version'] = header_version

for c in range(sig_channels.size):
    anasig_an = seg_annotations['signals'][c]
    anasig_an['ground'] = sig_grounds[c]
    channel_an = self.raw_annotations['signal_channels'][c]
    channel_an['ground'] = sig_grounds[c]
def main(): import numpy as np import os, sys, time, getopt from auxil import subset from ipyparallel import Client from osgeo import gdal from osgeo.gdalconst import GA_ReadOnly, GDT_Byte from tempfile import NamedTemporaryFile usage = ''' Usage: ------------------------------------------------ Sequential change detection for polarimetric SAR images python %s [OPTIONS] infiles* outfile enl Options: -h this help -m run 3x3 median filter on p-values prior to thresholding (e.g. for noisy satellite data) -d <list> files are to be co-registered to a subset dims = [x0,y0,rows,cols] of the first image, otherwise it is assumed that the images are co-registered and have identical spatial dimensions -s <float> significance level for change detection (default 0.0001) infiles: full paths to all input files: /path/to/infile_1 /path/to/infile_1 ... /path/to/infile_k outfile: without path (will be written to same directory as infile_1) enl: equivalent number of looks -------------------------------------------------''' % sys.argv[0] options, args = getopt.getopt(sys.argv[1:], 'hmd:s:') medianfilter = False dims = None significance = 0.0001 for option, value in options: if option == '-h': print usage return elif option == '-m': medianfilter = True elif option == '-d': dims = eval(value) elif option == '-s': significance = eval(value) k = len(args) - 2 fns = args[0:k] n = np.float64(eval(args[-1])) outfn = args[-2] gdal.AllRegister() start = time.time() # first SAR image try: inDataset1 = gdal.Open(fns[0], GA_ReadOnly) cols = inDataset1.RasterXSize rows = inDataset1.RasterYSize bands = inDataset1.RasterCount except Exception as e: print 'Error: %s -- Could not read file' % e sys.exit(1) if dims is not None: # images are not yet co-registered, so subset first image and register the others _, _, cols, rows = dims fn0 = subset.subset(fns[0], dims) args1 = [(fns[0], fns[i], dims) for i in range(1, k)] try: print ' \nattempting parallel execution of co-registration ...' start1 = time.time() c = Client() print 'available engines %s' % str(c.ids) v = c[:] v.execute('from registersar import register') fns = v.map_sync(call_register, args1) print 'elapsed time for co-registration: ' + str(time.time() - start1) except Exception as e: start1 = time.time() print '%s \nFailed, so running sequential co-registration ...' % e fns = map(call_register, args1) print 'elapsed time for co-registration: ' + str(time.time() - start) fns.insert(0, fn0) # point inDataset1 to the subset image for correct georefrerencing inDataset1 = gdal.Open(fn0, GA_ReadOnly) print '===============================================' print ' Multi-temporal SAR Change Detection' print '===============================================' print time.asctime() print 'First (reference) filename: %s' % fns[0] print 'number of images: %i' % k print 'equivalent number of looks: %f' % n print 'significance level: %f' % significance if bands == 9: print 'Quad ploarization' elif bands == 4: print 'Dual polarizaton' elif bands == 3: print 'Quad polarization, diagonal only' elif bands == 2: print 'Dual polarization, diagonal only' else: print 'Intensity image' # output file path = os.path.abspath(fns[0]) dirn = os.path.dirname(path) outfn = dirn + '/' + outfn # create temporary, memory-mapped array of change indices p(Ri<ri) mm = NamedTemporaryFile() pvarray = np.memmap(mm.name, dtype=np.float64, mode='w+', shape=(k - 1, k - 1, rows * cols)) lnQs = np.zeros(k - 1) print 'pre-calculating Rj and p-values ...' 
start1 = time.time() try: print 'attempting parallel calculation ...' c = Client() print 'available engines %s' % str(c.ids) v = c[:] print 'ell = ', sys.stdout.flush() for i in range(k - 1): print i + 1, sys.stdout.flush() args1 = [(fns[i:j + 2], n, cols, rows, bands) for j in range(i, k - 1)] results = v.map_sync(PV, args1) # list of tuples (p-value, lnRj) pvs = [result[0] for result in results] lnRjs = np.array([result[1] for result in results]) lnQs[i] = np.sum(lnRjs) if medianfilter: pvs = v.map_sync(call_median_filter, pvs) for j in range(i, k - 1): pvarray[i, j, :] = pvs[j - i].ravel() except Exception as e: print '%s \nfailed, so running sequential calculation ...' % e print 'ell= ', sys.stdout.flush() for i in range(k - 1): print i + 1, sys.stdout.flush() args1 = [(fns[i:j + 2], n, cols, rows, bands) for j in range(i, k - 1)] results = map(PV, args1) # list of tuples (p-value, lnRj) pvs = [result[0] for result in results] lnRjs = np.array([result[1] for result in results]) lnQs[i] = np.sum(lnRjs) if medianfilter: pvs = map(call_median_filter, pvs) for j in range(i, k - 1): pvarray[i, j, :] = pvs[j - i].ravel() print '\nelapsed time for p-value calculation: ' + str(time.time() - start1) cmap, smap, fmap, bmap = change_maps(pvarray, significance) # write to file system cmap = np.reshape(cmap, (rows, cols)) fmap = np.reshape(fmap, (rows, cols)) smap = np.reshape(smap, (rows, cols)) bmap = np.reshape(bmap, (rows, cols, k - 1)) driver = inDataset1.GetDriver() basename = os.path.basename(outfn) name, _ = os.path.splitext(basename) outfn1 = outfn.replace(name, name + '_cmap') outDataset = driver.Create(outfn1, cols, rows, 1, GDT_Byte) geotransform = inDataset1.GetGeoTransform() if geotransform is not None: outDataset.SetGeoTransform(geotransform) projection = inDataset1.GetProjection() if projection is not None: outDataset.SetProjection(projection) outBand = outDataset.GetRasterBand(1) outBand.WriteArray(cmap, 0, 0) outBand.FlushCache() print 'last change map written to: %s' % outfn1 outfn2 = outfn.replace(name, name + '_fmap') outDataset = driver.Create(outfn2, cols, rows, 1, GDT_Byte) if geotransform is not None: outDataset.SetGeoTransform(geotransform) if projection is not None: outDataset.SetProjection(projection) outBand = outDataset.GetRasterBand(1) outBand.WriteArray(fmap, 0, 0) outBand.FlushCache() print 'frequency map written to: %s' % outfn2 outfn3 = outfn.replace(name, name + '_bmap') outDataset = driver.Create(outfn3, cols, rows, k - 1, GDT_Byte) if geotransform is not None: outDataset.SetGeoTransform(geotransform) if projection is not None: outDataset.SetProjection(projection) for i in range(k - 1): outBand = outDataset.GetRasterBand(i + 1) outBand.WriteArray(bmap[:, :, i], 0, 0) outBand.FlushCache() print 'bitemporal map image written to: %s' % outfn3 outfn4 = outfn.replace(name, name + '_smap') outDataset = driver.Create(outfn4, cols, rows, 1, GDT_Byte) if geotransform is not None: outDataset.SetGeoTransform(geotransform) if projection is not None: outDataset.SetProjection(projection) outBand = outDataset.GetRasterBand(1) outBand.WriteArray(smap, 0, 0) outBand.FlushCache() print 'first change map written to: %s' % outfn4 print 'total elapsed time: ' + str(time.time() - start) outDataset = None inDataset1 = None
def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False): self.vocab = tokenizer.vocab self.tokenizer = tokenizer self.epoch = epoch self.data_epoch = int(epoch % num_data_epochs) logger.info('training_path: {}'.format(training_path)) data_file = training_path / "epoch_{}.json".format(self.data_epoch) metrics_file = training_path / "epoch_{}_metrics.json".format( self.data_epoch) logger.info('data_file: {}'.format(data_file)) logger.info('metrics_file: {}'.format(metrics_file)) assert data_file.is_file() and metrics_file.is_file() metrics = json.loads(metrics_file.read_text()) num_samples = metrics['num_training_examples'] seq_len = metrics['max_seq_len'] self.temp_dir = None self.working_dir = None if reduce_memory: self.temp_dir = TemporaryDirectory() self.working_dir = Path('/cache') input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap', mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.int32) lm_label_ids[:] = -1 is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap', shape=(num_samples, ), mode='w+', dtype=np.bool) else: input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1) is_nexts = np.zeros(shape=(num_samples, ), dtype=np.bool) logging.info("Loading training examples for epoch {}".format(epoch)) with data_file.open() as f: for i, line in enumerate( tqdm(f, total=num_samples, desc="Training examples")): line = line.strip() example = json.loads(line) features = convert_example_to_features(example, tokenizer, seq_len) input_ids[i] = features.input_ids segment_ids[i] = features.segment_ids input_masks[i] = features.input_mask lm_label_ids[i] = features.lm_label_ids is_nexts[i] = features.is_next # assert i == num_samples - 1 # Assert that the sample count metric was true logging.info("Loading complete!") self.num_samples = num_samples self.seq_len = seq_len self.input_ids = input_ids self.input_masks = input_masks self.segment_ids = segment_ids self.lm_label_ids = lm_label_ids self.is_nexts = is_nexts
def create_np_memmap_file(path, column_size, row_size):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    np.memmap(path, dtype='float32', mode='w+', shape=(column_size, row_size))
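# Usage sketch (assumption, not from the original source): a np.memmap file carries no
# dtype/shape metadata, so a file written by create_np_memmap_file must be reopened with
# the same dtype and shape. The path and sizes below are hypothetical.
import numpy as np

path = '/tmp/np_memmap_demo/feats.dat'
create_np_memmap_file(path, column_size=100, row_size=8)

feats = np.memmap(path, dtype='float32', mode='r+', shape=(100, 8))  # reopen for read/write
feats[0, :] = 1.0   # writes go directly to the mapped file
feats.flush()       # ensure pending writes reach disk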
def testTokenize(self): import shutil import tempfile class TestEnum(Enum): VAL1 = 'val1' tempdir = tempfile.mkdtemp('mars_test_utils_') try: filename = os.path.join(tempdir, 'test_npa.dat') mmp_array = np.memmap(filename, dtype=float, mode='w+', shape=(3, 4)) mmp_array[:] = np.random.random((3, 4)).astype(float) mmp_array.flush() del mmp_array mmp_array1 = np.memmap(filename, dtype=float, shape=(3, 4)) mmp_array2 = np.memmap(filename, dtype=float, shape=(3, 4)) try: v = [ 1, 2.3, '456', u'789', b'101112', None, np.ndarray, [912, 'uvw'], np.arange(0, 10), np.array(10), np.array([b'\x01\x32\xff']), np.int64, TestEnum.VAL1 ] copy_v = copy.deepcopy(v) self.assertEqual( utils.tokenize(v + [mmp_array1], ext_data=1234), utils.tokenize(copy_v + [mmp_array2], ext_data=1234)) finally: del mmp_array1, mmp_array2 finally: shutil.rmtree(tempdir) v = {'a', 'xyz', 'uvw'} self.assertEqual(utils.tokenize(v), utils.tokenize(copy.deepcopy(v))) v = dict(x='abcd', y=98765) self.assertEqual(utils.tokenize(v), utils.tokenize(copy.deepcopy(v))) v = dict(x=dict(a=1, b=[1, 2, 3]), y=12345) self.assertEqual(utils.tokenize(v), utils.tokenize(copy.deepcopy(v))) # pandas relative if pd is not None: df = pd.DataFrame([[utils.to_binary('测试'), utils.to_text('数据')]], index=['a'], columns=['中文', 'data']) v = [ df, df.index, df.columns, df['data'], pd.Categorical(list('ABCD')) ] self.assertEqual(utils.tokenize(v), utils.tokenize(copy.deepcopy(v))) non_tokenizable_cls = type('non_tokenizable_cls', (object, ), {}) with self.assertRaises(TypeError): utils.tokenize(non_tokenizable_cls()) class CustomizedTokenize(object): def __mars_tokenize__(self): return id(type(self)), id(non_tokenizable_cls) self.assertEqual(utils.tokenize(CustomizedTokenize()), utils.tokenize(CustomizedTokenize())) v = lambda x: x + 1 self.assertEqual(utils.tokenize(v), utils.tokenize(copy.deepcopy(v))) def f(a, b): return np.add(a, b) self.assertEqual(utils.tokenize(f), utils.tokenize(copy.deepcopy(f))) partial_f = partial(f, 1) self.assertEqual(utils.tokenize(partial_f), utils.tokenize(copy.deepcopy(partial_f)))
import numpy
import matplotlib.pyplot as plt

data = numpy.memmap("output2.wav", dtype='h', mode='r')
# dtype 'h' is <type 'numpy.int16'>; the mapping can be listed with:
#   for k, v in np.sctypeDict.iteritems(): print '{0:14s} : {1:40s}'.format(str(k), v)

print "VALUES:", data

plt.plot(data)
plt.show()
def create_mask(self, shape):
    print "Creating a mask"
    self.temp_file = tempfile.mktemp()
    shape = shape[0] + 1, shape[1] + 1, shape[2] + 1
    self.matrix = numpy.memmap(self.temp_file, mode='w+', dtype='uint8', shape=shape)
yt = {}
for fold in range(lstm.stratifications):
#for fold in range(1):
    print("============ fold {}".format(fold))
    yp[fold] = {}
    ycn[fold] = {}
    yc[fold] = {}
    ytn[fold] = {}
    yt[fold] = {}
    for task in tasks:
    #for task in tasks[:1]:
        print("-------- task {}".format(task))
        XTest = lstm.getXTest(fold, task)
        # np.float is deprecated in recent NumPy; the builtin float (float64) is equivalent
        yp[fold][task] = np.memmap("./tmp/yp-" + str(fold) + "-" + str(task) + ".dat",
                                   mode='w+',
                                   shape=(XTest.shape[0], lstm.model[fold][task].output_shape[1]),
                                   dtype=float)
        if task not in ftTasks:
            yp[fold][task][:] = lstm.model[fold][task].predict_proba(XTest)
            ycn[fold][task] = lstm.model[fold][task].predict_classes(XTest)
        else:
            XTestFT = lstm.getXTestFT(fold, task)
            yp[fold][task][:] = lstm.model[fold][task].predict_proba([XTestFT, XTest])
            ycn[fold][task] = lstm.model[fold][task].predict_classes([XTestFT, XTest])
        if task not in ftTasks:
            yt[fold][task] = lstm.getYTestName(