import os
import shutil

import numpy


def savez(file, *args, **kwds):
    import zipfile
    from numpy.lib import format

    if isinstance(file, str):
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val

    zipf = zipfile.ZipFile(file, mode="w")

    # Place to write temporary .npy files before storing them in the zip.
    # We need to patch this to have a working function in parallel!
    import tempfile
    direc = tempfile.mkdtemp()
    for key, val in namedict.items():
        fname = key + '.npy'
        filename = os.path.join(direc, fname)
        fid = open(filename, 'wb')
        format.write_array(fid, numpy.asanyarray(val))
        fid.close()
        zipf.write(filename, arcname=fname)
    zipf.close()
    shutil.rmtree(direc)

savez.__doc__ = numpy.savez.__doc__
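# A minimal usage sketch for the savez above. Hedged: assumes the function is
# importable in the current namespace and that the resulting archive is read
# back with numpy.load, as for regular .npz files.
import numpy

a = numpy.arange(10)
b = numpy.ones((3, 3))
savez('example', a, label=b)  # writes example.npz containing arr_0 and label
with numpy.load('example.npz') as data:
    assert (data['arr_0'] == a).all()
    assert (data['label'] == b).all()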
def _pickle_array(arr):
    arr = arr.view(np.ndarray)
    buf = BytesIO()
    write_array(buf, arr)
    return buf.getvalue()
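# Hedged counterpart sketch: _unpickle_array is a hypothetical inverse of
# _pickle_array above (not part of the original code), reconstructing the
# array from the serialized bytes with numpy.lib.format.read_array.
from io import BytesIO

from numpy.lib.format import read_array


def _unpickle_array(data):
    # read_array consumes the same .npy byte stream that write_array produced
    return read_array(BytesIO(data))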
def roundtrip_truncated(arr):
    f = BytesIO()
    format.write_array(f, arr)
    # BytesIO is one byte short
    f2 = BytesIO(f.getvalue()[0:-1])
    arr2 = format.read_array(f2)
    return arr2
def test_memmap_roundtrip():
    # Fixme: test crashes nose on windows.
    if not (sys.platform == 'win32' or sys.platform == 'cygwin'):
        for arr in basic_arrays + record_arrays:
            if arr.dtype.hasobject:
                # Skip these since they can't be mmap'ed.
                continue
            # Write it out normally and through mmap.
            nfn = os.path.join(tempdir, 'normal.npy')
            mfn = os.path.join(tempdir, 'memmap.npy')
            fp = open(nfn, 'wb')
            try:
                format.write_array(fp, arr)
            finally:
                fp.close()

            fortran_order = (
                arr.flags.f_contiguous and not arr.flags.c_contiguous)
            ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
                                    shape=arr.shape,
                                    fortran_order=fortran_order)
            ma[...] = arr
            del ma

            # Check that both of these files' contents are the same.
            fp = open(nfn, 'rb')
            normal_bytes = fp.read()
            fp.close()
            fp = open(mfn, 'rb')
            memmap_bytes = fp.read()
            fp.close()
            yield assert_equal_, normal_bytes, memmap_bytes

            # Check that reading the file using memmap works.
            ma = format.open_memmap(nfn, mode='r')
            del ma
def test_memmap_roundtrip():
    # XXX: test crashes nose on windows. Fix this
    if not (sys.platform == "win32" or sys.platform == "cygwin"):
        for arr in basic_arrays + record_arrays:
            if arr.dtype.hasobject:
                # Skip these since they can't be mmap'ed.
                continue
            # Write it out normally and through mmap.
            nfn = os.path.join(tempdir, "normal.npy")
            mfn = os.path.join(tempdir, "memmap.npy")
            fp = open(nfn, "wb")
            try:
                format.write_array(fp, arr)
            finally:
                fp.close()

            fortran_order = arr.flags.f_contiguous and not arr.flags.c_contiguous
            ma = format.open_memmap(mfn, mode="w+", dtype=arr.dtype,
                                    shape=arr.shape,
                                    fortran_order=fortran_order)
            ma[...] = arr
            del ma

            # Check that both of these files' contents are the same.
            fp = open(nfn, "rb")
            normal_bytes = fp.read()
            fp.close()
            fp = open(mfn, "rb")
            memmap_bytes = fp.read()
            fp.close()
            yield assert_equal, normal_bytes, memmap_bytes

            # Check that reading the file using memmap works.
            ma = format.open_memmap(nfn, mode="r")
            # yield assert_array_equal, ma, arr
            del ma
def savez(self, *args, **kwds):
    import os
    import numpy.lib.format as format

    namedict = kwds
    for val in args:
        key = 'arr_%d' % self.i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val
        self.i += 1

    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            fid = open(self.tmpfile, 'wb')
            try:
                format.write_array(fid, np.asanyarray(val))
                fid.close()
                fid = None
                self.zip.write(self.tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()
    finally:
        os.remove(self.tmpfile)
def test_read_array_header_2_0():
    s = BytesIO()

    arr = np.ones((3, 6), dtype=float)
    format.write_array(s, arr, version=(2, 0))

    s.seek(format.MAGIC_LEN)
    shape, fortran, dtype = format.read_array_header_2_0(s)

    assert_((shape, fortran, dtype) == ((3, 6), False, float))
def test_read_array_header_2_0():
    s = BytesIO()

    arr = np.ones((3, 6), dtype=float)
    format.write_array(s, arr, version=(2, 0))

    s.seek(format.MAGIC_LEN)
    shape, fortran, dtype = format.read_array_header_2_0(s)

    assert_(s.tell() % format.ARRAY_ALIGN == 0)
    assert_((shape, fortran, dtype) == ((3, 6), False, float))
def save(file, iarray, metafile=None, version=(1, 0)):
    """Save an info array to a .npy file and a metadata file.

    Similar to the numpy.save function.

    Parameters
    ----------
    file : file handle or str
        File or file name to write the array to in .npy format.
    iarray : InfoArray object or array with similar interface
        Array to be written to file with meta data.
    metafile : str
        File name for the meta data.  The `info` attribute of `iarray`
        will be written here.  Default is None, in which case it is
        assumed to be the file name associated with `file` with ".meta"
        appended.
    """
    # Restrict to version (1, 0) because we've only written write_header
    # for this version.
    if version != (1, 0):
        raise ValueError("Only version (1, 0) is safe from this function.")

    # Make sure that the meta data will be representable as a string.
    infostring = repr(iarray.info)
    try:
        safe_eval(infostring)
    except SyntaxError:
        raise ValueError("Meta data is not representable as a safe string.")

    # Save the array in .npy format.
    if isinstance(file, str):
        fid = open(file, "wb")
    else:
        fid = file
    npfor.write_array(fid, iarray, version=version)

    # Figure out what the filename for the meta data should be.
    if metafile is None:
        try:
            fname = file.name
        except AttributeError:
            fname = file
        metafile = fname + ".meta"

    # Save the meta data.
    info_fid = open(metafile, 'w')
    try:
        info_fid.write(infostring)
    finally:
        info_fid.close()
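# A short usage sketch for save(). Hedged: InfoArray and the .meta layout are
# assumptions based on the docstring above, so the example is left commented.
# iarr = InfoArray(np.arange(4), info={'units': 'K'})
# save('data.npy', iarr)          # writes data.npy plus data.npy.meta
# with open('data.npy.meta') as f:
#     meta = safe_eval(f.read())  # -> {'units': 'K'}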
def _savez(file, args, kwds, compress):
    # Import is postponed to here since zipfile depends on gzip, an
    # optional component of the so-called standard library.
    import zipfile
    # Import deferred for startup time improvement
    import tempfile

    if isinstance(file, str):
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            msg = "Cannot use un-named variables and keyword %s" % key
            raise ValueError(msg)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    zipf = zipfile_factory(file, mode="w", compression=compression)

    # Stage arrays in a temporary file on disk, before writing to zip.
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)
    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                format.write_array(fid, numpy.asanyarray(val))
                fid.close()
                fid = None
                zipf.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()
    finally:
        os.remove(tmpfile)

    zipf.close()
def test_version_2_0():
    f = BytesIO()
    # requires more than 2 bytes for the header
    dt = [(("%d" % i) * 100, float) for i in range(500)]
    d = np.ones(1000, dtype=dt)

    format.write_array(f, d, version=(2, 0))
    with warnings.catch_warnings(record=True) as w:
        warnings.filterwarnings('always', '', UserWarning)
        format.write_array(f, d)
        assert_(w[0].category is UserWarning)

    f.seek(0)
    n = format.read_array(f)
    assert_array_equal(d, n)

    # 1.0 requested but data cannot be saved this way
    assert_raises(ValueError, format.write_array, f, d, (1, 0))
def test_read_magic():
    s1 = BytesIO()
    s2 = BytesIO()

    arr = np.ones((3, 6), dtype=float)

    format.write_array(s1, arr, version=(1, 0))
    format.write_array(s2, arr, version=(2, 0))

    s1.seek(0)
    s2.seek(0)

    version1 = format.read_magic(s1)
    version2 = format.read_magic(s2)

    assert_(version1 == (1, 0))
    assert_(version2 == (2, 0))

    assert_(s1.tell() == format.MAGIC_LEN)
    assert_(s2.tell() == format.MAGIC_LEN)
def test_unicode_field_names():
    # gh-7391
    arr = np.array([
        (1, 3),
        (1, 2),
        (1, 3),
        (1, 2),
    ], dtype=[
        ('int', int),
        (u'\N{CJK UNIFIED IDEOGRAPH-6574}\N{CJK UNIFIED IDEOGRAPH-5F62}', int),
    ])
    fname = os.path.join(tempdir, "unicode.npy")
    with open(fname, 'wb') as f:
        format.write_array(f, arr, version=(3, 0))
    with open(fname, 'rb') as f:
        arr2 = format.read_array(f)
    assert_array_equal(arr, arr2)

    # notifies the user that 3.0 is selected
    with open(fname, 'wb') as f:
        with assert_warns(UserWarning):
            format.write_array(f, arr, version=None)
def savez(file, version, *args, compress=True, **kwargs):
    namedict = kwargs
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    with zipfile_factory(file, mode="a", compression=compression) as zipf:
        # Write file format version
        zipf.writestr('version', str(version))
        # Write directly to a ZIP file
        for key, val in namedict.items():
            if isinstance(val, (str, bytes)):
                zipf.writestr(key, val)
            else:
                try:
                    s = json.dumps(val, indent=4)
                except TypeError:
                    if type(val).__module__ == 'pandas.core.frame':
                        fname = key + '.parquet'
                        force_zip64 = val.values.nbytes >= 2**30
                        with zipf.open(fname, 'w',
                                       force_zip64=force_zip64) as fid:
                            pq.write_table(pa.Table.from_pandas(val), fid)
                    else:
                        fname = key + '.npy'
                        val = np.asanyarray(val)
                        force_zip64 = val.nbytes >= 2**30
                        with zipf.open(fname, 'w',
                                       force_zip64=force_zip64) as fid:
                            format.write_array(fid, val, allow_pickle=False)
                else:
                    zipf.writestr(key, s)
def savez(self, *args, **kwds):
    import os
    import numpy.lib.format as fmt

    namedict = kwds
    for val in args:
        key = 'arr_%d' % self._i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val
        self._i += 1

    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            with open(self.tmpfile, 'wb') as fid:
                fmt.write_array(fid, np.asanyarray(val), allow_pickle=True)
            self.zip.write(self.tmpfile, arcname=fname)
    finally:
        os.remove(self.tmpfile)
def test_write_version_1_0():
    f = BytesIO()
    arr = np.arange(1)
    # These should pass.
    format.write_array(f, arr, version=(1, 0))
    format.write_array(f, arr)

    # These should all fail.
    bad_versions = [
        (1, 1),
        (0, 0),
        (0, 1),
        (2, 0),
        (2, 2),
        (255, 255),
    ]
    for version in bad_versions:
        try:
            format.write_array(f, arr, version=version)
        except ValueError:
            pass
        else:
            raise AssertionError(
                "we should have raised a ValueError for the bad version %r"
                % (version,))
def _savez(file, args, kwds, compress):
    if isinstance(file, str):
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            msg = "Cannot use un-named variables and keyword %s" % key
            raise ValueError(msg)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    zipf = zipfile_factory(file, mode="w", compression=compression)

    # Stage arrays in a temporary file on disk, before writing to zip.
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)
    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                format.write_array(fid, numpy.asanyarray(val))
                fid.close()
                fid = None
                zipf.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()
    finally:
        os.remove(tmpfile)

    zipf.close()
def test_memmap_roundtrip():
    # Fixme: used to crash on windows
    if not (sys.platform == "win32" or sys.platform == "cygwin"):
        for arr in basic_arrays + record_arrays:
            if arr.dtype.hasobject:
                # Skip these since they can't be mmap'ed.
                continue
            # Write it out normally and through mmap.
            nfn = os.path.join(tempdir, "normal.npy")
            mfn = os.path.join(tempdir, "memmap.npy")
            fp = open(nfn, "wb")
            try:
                format.write_array(fp, arr)
            finally:
                fp.close()

            fortran_order = arr.flags.f_contiguous and not arr.flags.c_contiguous
            ma = format.open_memmap(
                mfn,
                mode="w+",
                dtype=arr.dtype,
                shape=arr.shape,
                fortran_order=fortran_order,
            )
            ma[...] = arr
            del ma

            # Check that both of these files' contents are the same.
            fp = open(nfn, "rb")
            normal_bytes = fp.read()
            fp.close()
            fp = open(mfn, "rb")
            memmap_bytes = fp.read()
            fp.close()
            assert_equal_(normal_bytes, memmap_bytes)

            # Check that reading the file using memmap works.
            ma = format.open_memmap(nfn, mode="r")
            del ma
def test_version_2_0():
    f = BytesIO()
    # requires more than 2 bytes for the header
    dt = [(("%d" % i) * 100, float) for i in range(500)]
    d = np.ones(1000, dtype=dt)

    format.write_array(f, d, version=(2, 0))
    with warnings.catch_warnings(record=True) as w:
        warnings.filterwarnings("always", "", UserWarning)
        format.write_array(f, d)
        assert_(w[0].category is UserWarning)

    # check alignment of data portion
    f.seek(0)
    header = f.readline()
    assert_(len(header) % format.ARRAY_ALIGN == 0)

    f.seek(0)
    n = format.read_array(f)
    assert_array_equal(d, n)

    # 1.0 requested but data cannot be saved this way
    assert_raises(ValueError, format.write_array, f, d, (1, 0))
def test_memmap_roundtrip():
    # Fixme: used to crash on windows
    if not (sys.platform == 'win32' or sys.platform == 'cygwin'):
        for arr in basic_arrays + record_arrays:
            if arr.dtype.hasobject:
                # Skip these since they can't be mmap'ed.
                continue
            # Write it out normally and through mmap.
            nfn = os.path.join(tempdir, 'normal.npy')
            mfn = os.path.join(tempdir, 'memmap.npy')
            with open(nfn, 'wb') as fp:
                format.write_array(fp, arr)

            fortran_order = (
                arr.flags.f_contiguous and not arr.flags.c_contiguous)
            ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
                                    shape=arr.shape,
                                    fortran_order=fortran_order)
            ma[...] = arr
            del ma
            if IS_PYPY:
                break_cycles()

            # Check that both of these files' contents are the same.
            with open(nfn, 'rb') as fp:
                normal_bytes = fp.read()
            with open(mfn, 'rb') as fp:
                memmap_bytes = fp.read()
            assert_equal_(normal_bytes, memmap_bytes)

            # Check that reading the file using memmap works.
            ma = format.open_memmap(nfn, mode='r')
            del ma
            if IS_PYPY:
                break_cycles()
def test_ensemble_matrix_json(client, simple_ensemble, get, post):
    from numpy.lib.format import write_array, read_array

    ensemble_id = simple_ensemble()

    matrix = np.random.rand(5, 8, 13)

    # POST
    post_url = f"/ensembles/{ensemble_id}/records/mat/matrix"
    if post == "json":
        resp = client.post(post_url, json=matrix.tolist())
    elif post == "numpy":
        stream = io.BytesIO()
        write_array(stream, matrix)
        resp = client.post(
            post_url,
            data=stream.getvalue(),
            headers={"content-type": "application/x-numpy"},
        )
    else:
        raise NotImplementedError()

    # GET
    get_url = f"/ensembles/{ensemble_id}/records/mat"
    if get == "json":
        resp = client.get(get_url)
        assert resp.json() == matrix.tolist()
    elif get == "numpy":
        resp = client.get(get_url, headers={"accept": "application/x-numpy"})
        stream = io.BytesIO(resp.content)
        assert (read_array(stream) == matrix).all()
    else:
        raise NotImplementedError()
def to_string(arr):
    f = BytesIO()
    format.write_array(f, arr)
    s = f.getvalue()
    return s
def toSparse(source, idx2label, Format='NPZ'):
    """
    Convert intra-chromosomal contact matrices to sparse ones.

    Parameters
    ----------
    source : str
        Hdf5 file name.
    idx2label : dict
        A dictionary for conversion between zero-based indices and
        string chromosome labels.
    Format : {'NPZ', 'HDF5'}
        Output format. (Default: NPZ)
    """
    lib = h5dict(source, mode='r')

    ## Uniform numpy-structured-array format
    itype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                      'formats': [np.int_, np.int_, np.float64]})

    ## Create a Zip file in NPZ case
    if Format.upper() == 'NPZ':
        output = source.replace('.hm', '-sparse.npz')
        Zip = zipfile.ZipFile(output, mode='w', allowZip64=True)
        fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
        os.close(fd)
    if Format.upper() == 'HDF5':
        output = source.replace('.hm', '-sparse.hm')
        odict = h5dict(output)

    log.log(21, 'Sparse Matrices will be saved to %s', output)
    log.log(21, 'Only intra-chromosomal matrices will be taken into account')
    log.log(21, 'Converting ...')

    for i in lib:
        if (i != 'resolution') and (len(set(i.split())) == 1):
            # Used for the dict-like key
            key = idx2label[int(i.split()[0])]
            log.log(21, 'Chromosome %s ...', key)
            # 2D-Matrix
            H = lib[i]
            # Triangle Array
            Triu = np.triu(H)
            # Sparse Matrix in Memory
            x, y = np.nonzero(Triu)
            values = Triu[x, y]
            sparse = np.zeros(values.size, dtype=itype)
            sparse['bin1'] = x
            sparse['bin2'] = y
            sparse['IF'] = values

            if Format.upper() == 'HDF5':
                # Really simple, just like a dictionary
                odict[key] = sparse
            if Format.upper() == 'NPZ':
                # Much more complicated, but we need to provide a compatible
                # file interface for other APIs
                fname = key + '.npy'
                fid = open(tmpfile, 'wb')
                try:
                    write_array(fid, np.asanyarray(sparse))
                    fid.close()
                    fid = None
                    Zip.write(tmpfile, arcname=fname)
                finally:
                    if fid:
                        fid.close()

            log.log(21, 'Done!')

    if Format.upper() == 'NPZ':
        os.remove(tmpfile)
        Zip.close()
def test_write_version():
    f = BytesIO()
    arr = np.arange(1)
    # These should pass.
    format.write_array(f, arr, version=(1, 0))
    format.write_array(f, arr)
    format.write_array(f, arr, version=None)
    format.write_array(f, arr)
    format.write_array(f, arr, version=(2, 0))
    format.write_array(f, arr)

    # These should all fail.
    bad_versions = [
        (1, 1),
        (0, 0),
        (0, 1),
        (2, 2),
        (255, 255),
    ]
    for version in bad_versions:
        with assert_raises_regex(ValueError,
                                 'we only support format version.*'):
            format.write_array(f, arr, version=version)
def _savez(file, args, kwds, compress):
    # Import is postponed to here since zipfile depends on gzip, an
    # optional component of the so-called standard library.
    import zipfile
    import tempfile
    import shutil

    if isinstance(file, str):
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    # use compress keyword if given; only active in savez
    if 'compress' in namedict.keys():
        if namedict['compress']:
            compression = zipfile.ZIP_DEFLATED
        del namedict['compress']

    # append or update
    if 'append' in namedict.keys():
        appendit = namedict['append']
        del namedict['append']
    else:
        appendit = False
    if 'update' in namedict.keys():
        updateit = namedict['update']
        del namedict['update']
    else:
        updateit = False
    if appendit and updateit:
        raise KeyError("append and update mutually exclusive.")

    # check if file exists, otherwise it will be a simple write
    if not os.path.isfile(file):
        appendit = False
        updateit = False
        inzipf = []
    else:
        zipf = zipfile.ZipFile(file, mode="r")
        inzipf = zipf.namelist()
        inzipf = [i[:-4] for i in inzipf]
        zipf.close()
    allkeys = set(namedict.keys())
    allkeys.update(inzipf)

    # append if keyword is True
    if appendit:
        mode = "a"
        # check if new arrays already exist in zip file
        if len(inzipf) != 0:
            for key in namedict:
                if key in inzipf:
                    raise ValueError(
                        "array name already in npz-file: %s" % key)
    else:
        mode = "w"

    if not updateit:
        # Just add arrays to existing or non-existing file; duplicates
        # were checked before.
        zipf = zipfile.ZipFile(file, mode=mode, compression=compression,
                               allowZip64=True)
        # Stage arrays in a temporary file on disk, before writing to zip.
        fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
        os.close(fd)
        try:
            for key, val in namedict.items():
                fname = key + '.npy'
                fid = open(tmpfile, 'wb')
                try:
                    format.write_array(fid, np.asanyarray(val))
                    fid.close()
                    fid = None
                    zipf.write(tmpfile, arcname=fname)
                finally:
                    if fid:
                        fid.close()
        finally:
            os.remove(tmpfile)
        zipf.close()
    else:
        # open existing zip file in read mode
        zipr = zipfile.ZipFile(file, mode="r")
        # open temporary zip file in write mode
        tempdir = tempfile.mkdtemp()
        try:
            tempname = os.path.join(tempdir, 'new.zip')
            zipw = zipfile.ZipFile(tempname, mode="w",
                                   compression=compression, allowZip64=True)
            for key in allkeys:
                # if in namedict then write new, else extract it from zipfile
                if key in namedict.keys():
                    # Stage arrays in a temporary file on disk, before
                    # writing to zip.
                    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
                    os.close(fd)
                    try:
                        fname = key + '.npy'
                        fid = open(tmpfile, 'wb')
                        try:
                            format.write_array(fid,
                                               np.asanyarray(namedict[key]))
                            fid.close()
                            fid = None
                            zipw.write(tmpfile, arcname=fname)
                        finally:
                            if fid:
                                fid.close()
                    finally:
                        os.remove(tmpfile)
                else:
                    fname = key + '.npy'
                    zipr.extract(fname, tempdir)
                    tmpfile = os.path.join(tempdir, fname)
                    zipw.write(tmpfile, arcname=fname)
                    os.remove(tmpfile)
            # close both files and move new to old
            zipr.close()
            zipw.close()
            shutil.move(tempname, file)
        finally:
            shutil.rmtree(tempdir)
def tobytes(array):
    fp = BytesIO()
    write_array(fp, array, allow_pickle=False)
    return fp.getvalue()
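# Round-trip sketch for tobytes(). Hedged: assumes the same module-level
# imports used above (BytesIO, write_array) plus read_array for the inverse.
import numpy as np
from io import BytesIO
from numpy.lib.format import read_array

payload = tobytes(np.arange(6).reshape(2, 3))
restored = read_array(BytesIO(payload))
assert (restored == np.arange(6).reshape(2, 3)).all()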
def roundtrip(arr):
    f = BytesIO()
    format.write_array(f, arr)
    f2 = BytesIO(f.getvalue())
    arr2 = format.read_array(f2, allow_pickle=True)
    return arr2
def toSparse(source, csr=False):
    """
    Convert intra-chromosomal contact matrices to sparse ones.

    Parameters
    ----------
    source : str
        Hdf5 file name.
    csr : bool
        Whether to use CSR (Compressed Sparse Row) format or not.
    """
    import zipfile, tempfile
    from numpy.lib.format import write_array
    from scipy import sparse

    lib = h5dict(source, mode='r')

    ## Uniform numpy-structured-array format
    itype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                      'formats': [np.int_, np.int_, np.float64]})

    ## Create a Zip file in NPZ case
    if not csr:
        output = source.replace('.hm', '-sparse.npz')
    else:
        output = source.replace('.hm', '-csrsparse.npz')
    Zip = zipfile.ZipFile(output, mode='w', allowZip64=True)
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)

    log.log(21, 'Sparse Matrices will be saved to %s', output)
    log.log(21, 'Only intra-chromosomal matrices will be taken into account')
    log.log(21, 'Converting ...')
    count = 0
    for i in lib:
        if (i != 'resolution') and (i != 'genomeInformation') and \
           (len(set(i.split())) == 1):
            # Used for the dict-like key
            key = lib['genomeInformation']['idx2label'][int(i.split()[0])]
            log.log(21, 'Chromosome %s ...', key)
            # 2D-Matrix
            H = lib[i]
            if not csr:
                # Triangle Array
                Triu = np.triu(H)
                # Sparse Matrix in Memory
                x, y = np.nonzero(Triu)
                values = Triu[x, y]
                temp = np.zeros(values.size, dtype=itype)
                temp['bin1'] = x
                temp['bin2'] = y
                temp['IF'] = values
            else:
                temp = sparse.triu(H, format='csr')

            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                write_array(fid, np.asanyarray(temp))
                fid.close()
                fid = None
                Zip.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()

            log.log(21, 'Done!')
            count += 1

    if count == 0:
        log.warning('Empty source file!')

    # Other information
    for i in ['resolution', 'genomeInformation']:
        fname = '.'.join([i, 'npy'])
        fid = open(tmpfile, 'wb')
        try:
            write_array(fid, np.asanyarray(lib[i]))
            fid.close()
            fid = None
            Zip.write(tmpfile, arcname=fname)
        finally:
            if fid:
                fid.close()

    os.remove(tmpfile)
    Zip.close()
def roundtrip_randsize(arr):
    f = BytesIO()
    format.write_array(f, arr)
    f2 = BytesIOSRandomSize(f.getvalue())
    arr2 = format.read_array(f2)
    return arr2
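# roundtrip_randsize relies on a BytesIOSRandomSize helper that is not shown
# here. A minimal sketch of what such a class could look like (an assumption,
# not the verbatim original): a BytesIO whose read() returns a random-sized
# chunk no larger than requested, to exercise short-read handling in
# format.read_array. It assumes read() is always called with a positive size.
import random
from io import BytesIO


class BytesIOSRandomSize(BytesIO):
    def read(self, size=None):
        # Pick a random chunk length between 1 and the requested size.
        size = random.randint(1, size)
        return super().read(size)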
def roundtrip(arr):
    f = BytesIO()
    format.write_array(f, arr)
    f2 = BytesIO(f.getvalue())
    arr2 = format.read_array(f2)
    return arr2
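# Typical use of roundtrip() in a test. Hedged: assert_array_equal is assumed
# to come from numpy.testing, as in the surrounding tests.
import numpy as np
from numpy.testing import assert_array_equal

arr = np.arange(12).reshape(3, 4)
assert_array_equal(roundtrip(arr), arr)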
def toSparse(source, idx2label, csr=False):
    """
    Convert intra-chromosomal contact matrices to sparse ones.

    Parameters
    ----------
    source : str
        Hdf5 file name.
    idx2label : dict
        A dictionary for conversion between zero-based indices and
        string chromosome labels.
    csr : bool
        Whether to use CSR (Compressed Sparse Row) format or not.
    """
    import zipfile, tempfile
    from numpy.lib.format import write_array
    from scipy import sparse

    lib = h5dict(source, mode='r')

    ## Uniform numpy-structured-array format
    itype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                      'formats': [np.int_, np.int_, np.float64]})

    ## Create a Zip file in NPZ case
    if not csr:
        output = source.replace('.hm', '-sparse.npz')
    else:
        output = source.replace('.hm', '-csrsparse.npz')
    Zip = zipfile.ZipFile(output, mode='w', allowZip64=True)
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)

    log.log(21, 'Sparse Matrices will be saved to %s', output)
    log.log(21, 'Only intra-chromosomal matrices will be taken into account')
    log.log(21, 'Converting ...')
    count = 0
    for i in lib:
        if (i != 'resolution') and (len(set(i.split())) == 1):
            # Used for the dict-like key
            key = idx2label[int(i.split()[0])]
            log.log(21, 'Chromosome %s ...', key)
            # 2D-Matrix
            H = lib[i]
            if not csr:
                # Triangle Array
                Triu = np.triu(H)
                # Sparse Matrix in Memory
                x, y = np.nonzero(Triu)
                values = Triu[x, y]
                temp = np.zeros(values.size, dtype=itype)
                temp['bin1'] = x
                temp['bin2'] = y
                temp['IF'] = values
            else:
                temp = sparse.triu(H, format='csr')

            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                write_array(fid, np.asanyarray(temp))
                fid.close()
                fid = None
                Zip.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()

            log.log(21, 'Done!')
            count += 1

    # Store the resolution information
    if 'resolution' in lib:
        fname = 'resolution.npy'
        fid = open(tmpfile, 'wb')
        try:
            write_array(fid, np.asanyarray(lib['resolution']))
            fid.close()
            fid = None
            Zip.write(tmpfile, arcname=fname)
        finally:
            if fid:
                fid.close()

    if count == 0:
        log.warning('Empty source file!')

    os.remove(tmpfile)
    Zip.close()
def toString(data):
    f = BytesIO()
    format.write_array(f, data)
    return f.getvalue()
def __init__(self, datasets, chroms=['#', 'X'], maxsize=4000000,
             npzpre=None, cache=None):
    self.chroms = set(chroms)
    data = datasets
    self._npzpre = npzpre
    if self._npzpre is not None:
        self._npzpre = os.path.abspath(os.path.expanduser(npzpre))
        for res in data:
            for rep in data[res]:
                rl = '%dK' % (res // 1000)
                output = '.'.join([self._npzpre, rl, rep, 'npz'])
                if os.path.exists(output):
                    log.error('The destination npz file will be overridden, '
                              'reset npz prefix and run again ...')
                    log.error('Exit ...')
                    sys.exit(1)

    # We don't read data in memory at this point.
    # We only construct the mapping for loading convenience
    self.data = {}
    for res in data:
        for rep in data[res]:
            if data[res][rep].endswith('.npz'):
                lib = np.load(data[res][rep])
                for i in lib.files:
                    if ((not self.chroms) or
                            (i.isdigit() and '#' in self.chroms) or
                            (i in self.chroms)):
                        if i not in self.data:
                            self.data[i] = {res: {rep: lib}}
                        else:
                            if res in self.data[i]:
                                self.data[i][res][rep] = lib
                            else:
                                self.data[i][res] = {rep: lib}
            else:
                Map = self._scanFolder(data[res][rep])
                for i in Map:
                    if i not in self.data:
                        self.data[i] = {res: {rep: Map[i]}}
                    else:
                        if res in self.data[i]:
                            self.data[i][res][rep] = Map[i]
                        else:
                            self.data[i][res] = {rep: Map[i]}

    if cache is None:
        self._cache = tempfile.gettempdir()
    else:
        self._cache = os.path.abspath(os.path.expanduser(cache))
        if not os.path.isdir(cache):
            os.makedirs(cache)

    self._intertype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                                'formats': [np.int_, np.int_, np.float64]})

    # Before starting calling, we make cache data under the cache folder
    for chrom in self.data:
        log.debug('Chrom %s:', chrom)
        ms = self.data[chrom]
        for res in ms:
            for rep in ms[res]:
                log.debug(' resolution: %d, %s', res, rep)
                if isinstance(ms[res][rep], str):
                    tdata = np.loadtxt(ms[res][rep], dtype=self._intertype)
                else:
                    tdata = ms[res][rep][chrom]
                work = Chrom(chrom, res, tdata, rep, maxsize)
                work.Label = rep
                tl = time.strftime('%Y%m%d%H%M%S',
                                   time.localtime(time.time()))
                kw = {'suffix': tl, 'dir': self._cache}
                fd, tmpfil = tempfile.mkstemp(**kw)
                os.close(fd)
                with open(tmpfil, 'wb') as output:
                    log.debug(' Cache Chrom object into %s ...', tmpfil)
                    cPickle.dump(work, output, protocol=2)
                self.data[chrom][res][rep] = tmpfil
                time.sleep(3)

                if self._npzpre is not None:
                    rl = '%dK' % (res // 1000)
                    output = '.'.join([self._npzpre, rl, rep, 'npz'])
                    if not os.path.exists(output):
                        Zip = zipfile.ZipFile(output, mode='w',
                                              allowZip64=True)
                    else:
                        Zip = zipfile.ZipFile(output, mode='a',
                                              allowZip64=True)
                    tl = time.strftime('%Y%m%d%H%M%S',
                                       time.localtime(time.time()))
                    fd, tmpfile = tempfile.mkstemp(
                        suffix='.'.join([tl, 'npy']))
                    os.close(fd)
                    log.debug(' Save data into %s for accelerating IO '
                              'next time ...', output)
                    fname = '.'.join([chrom, 'npy'])
                    fid = open(tmpfile, 'wb')
                    try:
                        write_array(fid, tdata)
                        fid.close()
                        fid = None
                        Zip.write(tmpfile, arcname=fname)
                    finally:
                        if fid:
                            fid.close()
                    os.remove(tmpfile)
                    Zip.close()
async def _get_record_data(
    record: ds.Record,
    accept: Optional[str],
    realization_index: Optional[int] = None,
) -> Response:
    type_ = record.record_info.record_type
    if type_ == ds.RecordType.f64_matrix:
        if realization_index is None:
            content = record.f64_matrix.content
        else:
            content = record.f64_matrix.content[realization_index]

        if accept == "application/x-numpy":
            from numpy.lib.format import write_array

            stream = io.BytesIO()
            write_array(stream, np.array(content))
            return Response(
                content=stream.getvalue(),
                media_type="application/x-numpy",
            )
        if accept == "text/csv":
            data = pd.DataFrame(content)
            labels = record.f64_matrix.labels
            if labels is not None and realization_index is None:
                data.columns = labels[0]
                data.index = labels[1]
            elif labels is not None and realization_index is not None:
                # The output is such that rows are realizations. Because
                # `content` is a 1d list in this case, it treats each
                # element as its own row. We transpose the data so that
                # all of the data falls on the same row.
                data = data.T
                data.columns = labels[0]
                data.index = [realization_index]
            return Response(
                content=data.to_csv().encode(),
                media_type="text/csv",
            )
        else:
            return content
    if type_ == ds.RecordType.file:
        f = record.file
        if f.content is not None:
            return Response(
                content=f.content,
                media_type=f.mimetype,
                headers={
                    "Content-Disposition":
                        f'attachment; filename="{f.filename}"'
                },
            )
        elif f.az_container is not None and f.az_blob is not None:
            blob = azure_blob_container.get_blob_client(f.az_blob)
            download = await blob.download_blob()

            async def chunk_generator() -> AsyncGenerator[bytes, None]:
                async for chunk in download.chunks():
                    yield chunk

            return StreamingResponse(
                chunk_generator(),
                media_type=f.mimetype,
                headers={
                    "Content-Disposition":
                        f'attachment; filename="{f.filename}"'
                },
            )
    raise NotImplementedError(
        f"Getting record data for type {type_} and Accept header {accept} "
        f"not implemented"
    )
def toString(self, data):
    f = BytesIO()
    format.write_array(f, data)
    return f.getvalue()