def scan_local(local_directory, url=None, logging_level=logging.WARNING):
    '''
    Bootstrap a :class:`.Hashdown` from the files found under a local directory.

    The directory is walked recursively and a hash (MD5) is computed for every
    file found. (A local hash might be wrong if the files are out of date or
    have OS-dependent line endings.) Typically, you'll then want to save the
    result to a JSON file and edit that JSON file manually to remove
    uninteresting files.

    :param local_directory: Local directory to recursively walk
    :type local_directory: string

    :param url: URL to give to the Hashdown. (It will not be checked.)
    :type url: string

    :param logging_level: Logging level for printing progress of the walk.
        (Default is logging.WARNING.)

    :rtype: :class:`.Hashdown`
    '''
    from pysnptools.util.filecache import LocalCache

    cache = LocalCache(local_directory)
    hashes = {}
    with log_in_place("scanning", logging_level) as report:
        for name in cache.walk():
            # Show the file currently being hashed on the in-place progress line.
            report(name)
            with cache.open_read(name) as local_path:
                hashes[name] = Hashdown._get_hash(local_path)
    return Hashdown(url, file_to_hash=hashes)
def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok):
    """Generate the requested values, one block of columns at a time.

    Columns are grouped by the block they fall in (``index // self._block_size``).
    Each distinct block is generated in full with ``self._get_val2`` and the
    requested columns (and rows, if a row index is given) are copied into the
    result.

    :param row_index_or_none: row positions to return, or None for all rows
    :param col_index_or_none: column positions to return, or None for all columns
    :param order: memory order of the result; 'A' is treated as 'F'
    :param dtype: anything accepted by ``np.dtype``
    :param force_python_only: accepted but not used here
    :param view_ok: accepted but not used here
    :return: ndarray of shape (number of rows, number of columns)
    """
    self._run_once()
    import pysnptools.util as pstutil

    dtype = np.dtype(dtype)
    if order == 'A':
        order = 'F'

    # Number of output rows, e.g. all of them when no row index is given.
    if row_index_or_none is None:
        n_rows = self._iid_count
    else:
        n_rows = len(row_index_or_none)

    # Array of column positions, e.g. 0,1,200,2200,10
    if col_index_or_none is None:
        columns = np.arange(self._sid_count)
    else:
        columns = col_index_or_none

    # Block number of each requested column, e.g. 0,0,0,2,0
    block_of_column = columns // self._block_size
    result = np.empty((n_rows, len(columns)), order=order, dtype=dtype)

    blocks = list(set(block_of_column))
    n_blocks = len(blocks)
    with log_in_place("working on snpgen batch", logging.INFO) as report:
        for done, block in enumerate(blocks):
            report("{0} of {1}".format(done, n_blocks))
            first = block * self._block_size  # e.g. 0 (then 2000)
            last = first + self._block_size  # e.g. 1000, then 3000
            generated = self._get_val2(first, last, order=order, dtype=dtype)  # whole block

            in_block = block_of_column == block  # boolean mask over the requested columns
            offsets = columns[in_block] - first  # positions inside the generated block
            if row_index_or_none is None:
                result[:, in_block] = generated[:, offsets]
            else:
                result[:, in_block] = pstutil.sub_matrix(generated, row_index_or_none, offsets)

    return result
def genwrite(
    filename,
    distreader,
    decimal_places=None,
    id_rsid_function=default_id_rsid_function,
    sample_function=default_sample_function,
    block_size=None,
):
    r"""Writes a :class:`DistReader` to Gen format

    The text is first written to *filename*.temp and only moved over *filename*
    once the ``.gen`` text and the ``.sample`` file have been written.
    A distribution containing any NaN is written as ``0 0 0``.

    :param filename: the name of the file to create (will also create *filename_without_gen*.sample file.)
    :type filename: string

    :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
       another :class:`.Bgen`.
    :type distreader: :class:`DistReader`

    :param decimal_places: (Default: None) Number of decimal places with which to write the text numbers. *None* writes each number
       with its plain ``f"{num}"`` formatting (no rounding).
    :type decimal_places: int or None

    :param id_rsid_function: Function to turn a :attr:`DistReader.sid` into a GEN (SNP) id and rsid.
       (Default: ``default_id_rsid_function``.)
    :type id_rsid_function: function

    :param sample_function: Function called with the two parts of each :attr:`DistReader.iid` entry; its result is written as one line of
       the ``.sample`` file. (Default: ``default_sample_function``.)
    :type sample_function: function

    :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
    :type block_size: number

    :rtype: None

    >>> from pysnptools.distreader import DistHdf5, Bgen
    >>> import pysnptools.util as pstutil
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> hdf5_file = example_file("pysnptools/examples/toydata.snpmajor.dist.hdf5")
    >>> distreader = DistHdf5(hdf5_file)[:,:10] # A reader for the first 10 SNPs in Hdf5 format
    >>> pstutil.create_directory_if_necessary("tempdir/toydata10.bgen")
    >>> Bgen.genwrite("tempdir/toydata10.gen",distreader) # Write data in GEN format
    """
    # https://www.cog-genomics.org/plink2/formats#gen
    # https://web.archive.org/web/20181010160322/http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html
    block_size = block_size or max((100 * 1000) // max(1, distreader.row_count), 1)

    if decimal_places is None:

        def format_function(num):
            return f"{num}"

    else:
        # Build the fixed-point template once instead of on every number.
        number_template = "{0:." + str(decimal_places) + "f}"

        def format_function(num):
            return number_template.format(num)

    updater_freq = 10000  # report progress every this many (iid, sid) values
    total_count = distreader.iid_count * distreader.sid_count
    index = -1
    start = 0
    with log_in_place("writing text values ", logging.INFO) as updater:
        with open(filename + ".temp", "w", newline="\n") as genfp:
            while start < distreader.sid_count:
                distdata = distreader[:, start:start + block_size].read(view_ok=True)
                for sid_index in range(distdata.sid_count):
                    snp_id, rsid = id_rsid_function(distdata.sid[sid_index])
                    assert snp_id.strip() != "", "id cannot be whitespace"
                    assert rsid.strip() != "", "rsid cannot be whitespace"
                    # Collect the pieces of one SNP line and write them in a single call.
                    pieces = [
                        "{0} {1} {2} {3} A G".format(
                            int(distdata.pos[sid_index, 0]),
                            snp_id,
                            rsid,
                            int(distdata.pos[sid_index, 2]),
                        )
                    ]
                    for iid_index in range(distdata.iid_count):
                        index += 1
                        if index > 0 and index % updater_freq == 0:
                            updater("{0:,} of {1:,} ({2:.2%})".format(
                                index,
                                total_count,
                                1.0 * index / total_count,
                            ))
                        prob_dist = distdata.val[iid_index, sid_index, :]
                        if not np.isnan(prob_dist).any():
                            pieces.append(" " + " ".join(format_function(num) for num in prob_dist))
                        else:
                            # Any NaN in the distribution is written as all zeros.
                            pieces.append(" 0 0 0")
                    pieces.append("\n")
                    genfp.write("".join(pieces))
                # Advance by what was actually read; the last block may be short.
                start += distdata.sid_count

    sample_filename = os.path.splitext(filename)[0] + ".sample"
    # https://www.well.ox.ac.uk/~gav/qctool_v2/documentation/sample_file_formats.html
    with open(sample_filename, "w", newline="\n") as samplefp:
        samplefp.write("ID\n")
        samplefp.write("0\n")
        for f, i in distreader.iid:
            samplefp.write("{0}\n".format(sample_function(f, i)))

    # Replace any existing output only after everything has been written.
    if os.path.exists(filename):
        os.remove(filename)
    shutil.move(filename + ".temp", filename)
def _run_once(self):
    """Open the BGEN file on first use and cache its row/column metadata on this object.

    Does nothing when ``self._ran_once`` is already set. Otherwise:

    * opens ``self.filename`` with ``open_bgen`` and checks, on the first
      variant only, that there are 2 alleles and that the data is unphased;
    * if the metadata file does not yet contain ``self._col_property_key``,
      extends a ``.temp`` copy of the metadata file with three arrays
      (default iids, column properties, default sids), replaces the original
      metadata file with that copy, and reopens the BGEN file;
    * sets ``self._row``, ``self._col`` and ``self._col_property`` and marks
      the object as initialized.

    Raises ``AssertionError`` if the file does not exist or one of the checks fails.
    """
    if self._ran_once:
        return

    assert os.path.exists(
        self.filename), "Expect file to exist ('{0}')".format(
            self.filename)
    # Let open_bgen be verbose only when the root logger is at INFO or lower.
    verbose = logging.getLogger().level <= logging.INFO
    self._open_bgen = open_bgen(self.filename,
                                self._sample,
                                verbose=verbose)
    # Only the first variant is inspected by these two checks.
    assert (self._open_bgen.nvariants == 0
            or self._open_bgen.nalleles[0]
            == 2), "expect number of alleles to be 2"
    assert (self._open_bgen.nvariants == 0
            or not self._open_bgen.phased[0]), "expect data to be unphased"

    if self._col_property_key not in self._open_bgen._metadata2_memmaps:
        logging.info("Extending metadata file with PySnpTools metadata")
        # The three PySnpTools arrays are expected to be present or absent together.
        assert (self._default_iid_key
                not in self._open_bgen._metadata2_memmaps
                and self._default_sid_key
                not in self._open_bgen._metadata2_memmaps), "real assert"
        metadata2_temp = self._open_bgen._metadata2_path.parent / (
            self._open_bgen._metadata2_path.name + ".temp")
        # Remove any leftover temp file from an earlier run.
        if metadata2_temp.exists():
            metadata2_temp.unlink()
        metadata2_path = self._open_bgen._metadata2_path
        # NOTE(review): presumably dropping the reader releases its hold on the
        # metadata file so it can be copied and later replaced -- confirm.
        del self._open_bgen
        shutil.copy(metadata2_path, metadata2_temp)
        with MultiMemMap(metadata2_temp, mode="r+") as metadata2_memmaps:
            # --- default iids: a two-column array with one row per sample ---
            samples = metadata2_memmaps["samples"]
            row = metadata2_memmaps.append_empty(
                self._default_iid_key,
                shape=(len(samples), 2),
                dtype=str(samples.dtype),
            )
            # Only the first sample is checked for a comma.
            if len(samples) == 0 or "," not in samples[0]:
                logging.info(
                    "No comma in first sample, so extending metadata file with 'no-comma' default iids"
                )
                row[:, 0] = "0"
                row[:, 1] = samples
            else:
                # Later: Do this in chunks of size about len(samples) using old np.stack(np.core.defchararray.split(samples,',',maxsplit=2)) code (see pre 8/1/2020 code)
                with log_in_place("splitting samples on commas",
                                  logging.INFO) as updater:
                    for i, val in enumerate(samples):
                        if i % 1000 == 0:
                            updater(f"{i:,} of {len(samples):,}")
                        row[i, :] = default_iid_function(val)

            # --- column properties: column 0 from "chromosomes", column 2 from "positions" ---
            col_property = metadata2_memmaps.append_empty(
                self._col_property_key,
                shape=(len(metadata2_memmaps["ids"]), 3),
                dtype="float",
            )
            col_property[:, 2] = metadata2_memmaps["positions"]
            # Do something fast if all numbers
            try:
                col_property[:, 0] = metadata2_memmaps["chromosomes"]
            except Exception:
                # If that doesn't work, do something slow
                with log_in_place(
                        "converting chromosomes strings to numbers, one at a time",
                        logging.INFO) as updater:
                    for i, val in enumerate(
                            metadata2_memmaps["chromosomes"]):
                        if i % 1000 == 0:
                            updater(f"{i:,} of {len(col_property):,}")
                        try:
                            col_property[i, 0] = int(val)
                        except Exception:
                            # Values that int() cannot convert are stored as 0.
                            col_property[i, 0] = 0

            # --- default sids: the ids alone, or "id,rsid" ---
            rsid_list = metadata2_memmaps["rsids"]
            id_list = metadata2_memmaps["ids"]
            assert str(rsid_list.dtype).startswith("<U") and str(
                id_list.dtype).startswith("<U"), "real assert"
            # If every rsid is "0", or every rsid is "", the id alone is the default sid.
            if _all_equal_in_parts(rsid_list, "0") or _all_equal_in_parts(
                    rsid_list, ""):
                col = metadata2_memmaps.append_empty(
                    self._default_sid_key,
                    shape=len(metadata2_memmaps["ids"]),
                    dtype=str(id_list.dtype),
                )
                col[:] = id_list
            else:
                # Width needed for "id,rsid": both widths (the number after "<U") plus one for the comma.
                max_length = (int(str(rsid_list.dtype)[2:]) + 1 +
                              int(str(id_list.dtype)[2:]))
                col = metadata2_memmaps.append_empty(
                    self._default_sid_key,
                    shape=len(metadata2_memmaps["ids"]),
                    dtype=f"<U{max_length}",
                )
                # Fill the array one (start, end) slice at a time.
                for _, _, start, end in _parts(len(col)):
                    col[start:end] = np.char.add(
                        np.char.add(id_list[start:end], ","),
                        rsid_list[start:end])
        # Swap the extended copy in for the original metadata file, then reopen.
        # The temp file is copied, not moved, so it stays on disk.
        metadata2_path.unlink()
        shutil.copy(metadata2_temp, metadata2_path)
        self._open_bgen = open_bgen(self.filename,
                                    self._sample,
                                    verbose=verbose)
    else:
        assert (self._default_iid_key in self._open_bgen._metadata2_memmaps
                and self._default_sid_key
                in self._open_bgen._metadata2_memmaps), "real assert"

    self._row = self._apply_iid_function(self._open_bgen.samples)
    self._col = self._apply_sid_function(self._open_bgen.ids,
                                         self._open_bgen.rsids)
    self._col_property = self._open_bgen._metadata2_memmaps[
        self._col_property_key]

    self._assert_iid_sid_pos(check_val=False)
    self._ran_once = True
def _map_metadata(self, metafile_filepath):
    """Load the per-variant metadata from a bgen metafile into attributes of this object.

    Step 2 reads every partition of the metafile with ``_inner_read_partition``
    and concatenates the pieces into ``self._ids``, ``self._rsids``,
    ``self._vaddr``, ``self._chromosomes``, ``self._positions``,
    ``self._nalleles`` and ``self._allele_ids``.
    Step 3 opens the genotype at each variant address to fill
    ``self._ncombinations`` and ``self._phased``.

    :param metafile_filepath: path of the metafile; it is wrapped in ``Path``
    """
    with log_in_place("metadata", logging.INFO) as report:
        ids, rsids, chroms, positions = [], [], [], []
        offsets, nalleles_parts, allele_ids_parts = [], [], []

        with bgen_metafile(Path(metafile_filepath)) as mf:
            nparts = mf.npartitions
            for part in range(nparts):  # LATER multithread?
                # LATER in notebook this message doesn't appear on one line
                report("step 2: part {0:,} of {1:,}".format(part, nparts))
                # The first item returned (the partition's variant count) is not used.
                (
                    _,
                    vid,
                    rsid,
                    chrom,
                    position,
                    nalleles,
                    allele_ids,
                    offset,
                ) = _inner_read_partition(mf, part)

                ids.append(vid)
                rsids.append(rsid)
                chroms.append(chrom)
                positions.append(position)
                nalleles_parts.append(nalleles)
                allele_ids_parts.append(allele_ids)
                offsets.append(offset)

        # LATER use concatenate(...out=) instead
        # dtype="str" is needed to make the text arrays unicode
        self._ids = np.array(np.concatenate(ids), dtype="str")
        self._rsids = np.array(np.concatenate(rsids), dtype="str")
        self._vaddr = np.concatenate(offsets)
        self._chromosomes = np.array(np.concatenate(chroms), dtype="str")
        self._positions = np.concatenate(positions)
        self._nalleles = np.concatenate(nalleles_parts)
        self._allele_ids = np.array(np.concatenate(allele_ids_parts), dtype="str")

        ncombs, phased = [], []
        for i, address in enumerate(self._vaddr):
            if i % 1000 == 0:
                report("step 3: part {0:,} of {1:,}".format(i, self.nvariants))
            genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file, address)
            ncombs.append(lib.bgen_genotype_ncombs(genotype))
            phased.append(lib.bgen_genotype_phased(genotype))
            lib.bgen_genotype_close(genotype)

        self._ncombinations = np.array(ncombs, dtype="int")
        self._phased = np.array(phased, dtype="bool")
def read(
    self,
    index: Optional[Any] = None,
    dtype: Optional[Union[type, str]] = np.float64,
    order: Optional[str] = "F",
    max_combinations: Optional[int] = None,
    return_probabilities: Optional[bool] = True,
    return_missings: Optional[bool] = False,
    return_ploidies: Optional[bool] = False,
) -> Union[None, np.ndarray, Tuple[np.ndarray, np.ndarray], Tuple[
        np.ndarray, np.ndarray, np.ndarray], ]:
    """
    Read genotype information from an :class:`open_bgen` object.

    Parameters
    ----------
    index
        An expression specifying the samples and variants to read. (See :ref:`read_examples`, below).
        Defaults to ``None``, meaning read all.
    dtype : data-type
        The desired data-type for the returned probability array.
        Defaults to :class:`numpy.float64`. Use :class:`numpy.float32` or :class:`numpy.float16`, when appropriate,
        to save 50% or 75% of memory. (See :ref:`read_notes`, below).
    order : {'F','C'}
        The desired memory layout for the returned probability array.
        Defaults to ``F`` (Fortran order, which is variant-major).
    max_combinations : int or ``None``.
        The number of values to allocate for each probability distribution.
        Defaults to a number just large enough for any data in the file.
        For unphased, diploid, biallelic data, it will default to 3. For phased, diploid, biallelic data, it will
        default to 4. Any overallocated space is filled with :const:`numpy.nan`.
    return_probabilities: bool
        Read and return the probabilities for samples and variants specified.
        Defaults to ``True``.
    return_missings: bool
        Return a boolean array telling which probabilities are missing.
        Defaults to ``False``.
    return_ploidies: bool
        Read and return the ploidy for the samples and variants specified.
        Defaults to ``False``.

    Returns
    -------
    zero to three :class:`numpy.ndarray`
        always in this order:

        * a :class:`numpy.ndarray` of probabilities with ``dtype`` and shape `(nsamples_out,nvariants_out,max_combinations)`,
          if ``return_probabilities`` is ``True`` (the default). Missing data is filled with :const:`numpy.nan`.
        * a :class:`numpy.ndarray` of ``bool`` of shape `(nsamples_out,nvariants_out)`,
          if ``return_missings`` is ``True``
        * a :class:`numpy.ndarray` of ``int`` of shape `(nsamples_out,nvariants_out)`,
          if ``return_ploidies`` is ``True``

        A single array is returned bare; two or three are returned as a tuple.
        If all three ``return_*`` flags are ``False``, an empty tuple is returned.

    Raises
    ------
    ValueError
        If the file has been closed, or if ``max_combinations`` is smaller than
        the number of combinations of one of the selected variants.

    .. _read_notes:

    Notes
    ------
    * About ``dtype``

        If you know the compression level of your BGEN file, you can sometimes save 50% or 75% on memory with ``dtype``.
        (Test with your data to confirm you are not losing any precision.) The approximate relationship is:

            * BGEN compression 1 to 10 bits: ``dtype`` ='float16'
            * BGEN compression 11 to 23 bits: ``dtype`` ='float32'
            * BGEN compression 24 to 32 bits: ``dtype`` ='float64' (default)

    .. _read_examples:

    Examples
    --------
    * Index Examples

    To read all data in a BGEN file, set ``index`` to ``None``. This is the default.

    .. doctest::

        >>> import numpy as np
        >>> from bgen_reader import example_filepath, open_bgen
        >>>
        >>> with open_bgen(example_filepath("haplotypes.bgen"), verbose=False) as bgen_h:
        ...     print(bgen_h.read()) # read all
        [[[1. 0. 1. 0.]
          [0. 1. 1. 0.]
          [1. 0. 0. 1.]
          [0. 1. 0. 1.]]
        <BLANKLINE>
         [[0. 1. 1. 0.]
          [1. 0. 0. 1.]
          [0. 1. 0. 1.]
          [1. 0. 1. 0.]]
        <BLANKLINE>
         [[1. 0. 0. 1.]
          [0. 1. 0. 1.]
          [1. 0. 1. 0.]
          [0. 1. 1. 0.]]
        <BLANKLINE>
         [[0. 1. 0. 1.]
          [1. 0. 1. 0.]
          [0. 1. 1. 0.]
          [1. 0. 0. 1.]]]

    To read selected variants, set ``index`` to an ``int``, a list of ``int``, a :class:`slice`, or a list of ``bool``.
    Negative integers count from the end of the data.

    .. doctest::

        >>> bgen_e = open_bgen(example_filepath("example.bgen"), verbose=False)
        >>> probs = bgen_e.read(5)  # read the variant indexed by 5.
        >>> print(probs.shape)      # print the dimensions of the returned numpy array.
        (500, 1, 3)
        >>> probs = bgen_e.read([5,6,1])  # read the variant indexed by 5, 6, and 1
        >>> print(probs.shape)
        (500, 3, 3)
        >>> probs = bgen_e.read(slice(5)) #read the first 5 variants
        >>> print(probs.shape)
        (500, 5, 3)
        >>> probs = bgen_e.read(slice(2,5)) #read variants from 2 (inclusive) to 5 (exclusive)
        >>> print(probs.shape)
        (500, 3, 3)
        >>> probs = bgen_e.read(slice(2,None)) # read variants starting at index 2.
        >>> print(probs.shape)
        (500, 197, 3)
        >>> probs = bgen_e.read(slice(None,None,10)) #read every 10th variant
        >>> print(probs.shape)
        (500, 20, 3)
        >>> print(np.unique(bgen_e.chromosomes)) # print unique chrom values
        ['01']
        >>> probs = bgen_e.read(bgen_e.chromosomes=='01') # read all variants in chrom 1
        >>> print(probs.shape)
        (500, 199, 3)
        >>> probs = bgen_e.read(-1) # read the last variant
        >>> print(probs.shape)
        (500, 1, 3)

    To read selected samples, set ``index`` to a tuple of the form ``(sample_index,None)``, where ``sample index`` follows the form
    of ``variant index``, above.

    .. doctest::

        >>> probs = bgen_e.read((0,None)) # Read 1st sample (across all variants)
        >>> print(probs.shape)
        (1, 199, 3)
        >>> probs = bgen_e.read((slice(None,None,10),None)) # Read every 10th sample
        >>> print(probs.shape)
        (50, 199, 3)

    To read selected samples and selected variants, set ``index`` to a tuple of the form ``(sample_index,variant_index)``,
    where ``sample index`` and ``variant_index`` follow the forms above.

    .. doctest::

        >>> # Read samples 10 (inclusive) to 20 (exclusive) and the first 15 variants.
        >>> probs = bgen_e.read((slice(10,20),slice(15)))
        >>> print(probs.shape)
        (10, 15, 3)
        >>> #read last and 2nd-to-last sample and the last variant
        >>> probs = bgen_e.read(([-1,-2],-1))
        >>> print(probs.shape)
        (2, 1, 3)

    * Multiple Return Example

    Read probabilities, missingness, and ploidy. Print all unique ploidies values.

    .. doctest::

        >>> probs,missing,ploidy = bgen_e.read(return_missings=True,return_ploidies=True)
        >>> print(np.unique(ploidy))
        [2]
    """
    # LATER could allow strings (variant names) and lists of strings
    if not hasattr(self, "_bgen_context_manager"):
        raise ValueError("I/O operation on a closed file")

    # Can't use 'or' because it treats 0 as False
    if max_combinations is None:
        max_combinations = self.max_combinations

    samples_index, variants_index = self._split_index(index)

    # converts slice(), etc to a list of numbers
    samples_index = self._sample_range[samples_index]
    vaddr = self._vaddr[variants_index]
    ncombinations = self._ncombinations[variants_index]

    if len(ncombinations) > 0:
        # One vectorized reduction instead of two Python-level max() scans.
        needed = ncombinations.max()
        if needed > max_combinations:
            raise ValueError(
                "Need at least {0} max_combinations, but only {1} given".
                format(needed, max_combinations))

    # allocating prob_buffer only when its size changes makes reading
    # 10x5M data 30% faster
    if return_probabilities:
        val = np.full(
            (len(samples_index), len(vaddr), max_combinations),
            np.nan,
            dtype=dtype,
            order=order,
        )
        prob_buffer = None
    if return_missings:
        missing_val = np.full((len(samples_index), len(vaddr)),
                              False,
                              dtype="bool",
                              order=order)
    if return_ploidies:
        ploidy_val = np.full((len(samples_index), len(vaddr)),
                             0,
                             dtype="int",
                             order=order)

    # LATER multithread?
    # Rough estimate of how many variants are read per second; it only
    # controls how often the progress message is refreshed.
    approx_read_seconds = len(vaddr) / 20000.0 + len(
        vaddr) * self.nsamples / (2 * 1000 * 1000.0)
    vaddr_per_second = max(1,
                           len(vaddr) // int(max(1, approx_read_seconds)))
    # Do "logarithmic rounding" to make numbers look nicer, e.g. 999 -> 1000
    vaddr_per_second = 10**(int(math.log10(vaddr_per_second) + 0.5))

    with log_in_place("reading", logging.INFO) as updater:
        for out_index, vaddr0 in enumerate(vaddr):
            if out_index % vaddr_per_second == 0:
                updater("part {0:,} of {1:,}".format(
                    out_index, len(vaddr)))

            genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file,
                                                   vaddr0)
            # Close the genotype handle even if one of the steps below raises.
            try:
                if return_probabilities:
                    ncombs = ncombinations[out_index]
                    if (prob_buffer is None
                            or ncombs != prob_buffer.shape[-1]):
                        # The buffer always covers every sample in the file;
                        # the requested samples are selected afterwards.
                        prob_buffer = np.full(
                            (len(self._samples), ncombs),
                            np.nan,
                            order="C",
                            dtype="float64",
                        )
                    lib.bgen_genotype_read(
                        genotype,
                        ffi.cast("double *", prob_buffer.ctypes.data))
                    val[:, out_index, :ncombs] = (
                        prob_buffer if
                        (samples_index is self._sample_range) else
                        prob_buffer[samples_index, :])

                if return_missings:
                    missing_val[:, out_index] = [
                        lib.bgen_genotype_missing(genotype, i)
                        for i in samples_index
                    ]

                if return_ploidies:
                    ploidy_val[:, out_index] = [
                        lib.bgen_genotype_ploidy(genotype, i)
                        for i in samples_index
                    ]
            finally:
                lib.bgen_genotype_close(genotype)

    result_array = (([val] if return_probabilities else []) +
                    ([missing_val] if return_missings else []) +
                    ([ploidy_val] if return_ploidies else []))
    if len(result_array) == 1:
        return result_array[0]
    else:
        return tuple(result_array)
def write(filename, snpreader, standardizer=Identity(), order='A', dtype=None, block_size=None, num_threads=None):
    """Writes a :class:`SnpReader` to :class:`SnpMemMap` format.

    The data is written to *filename*.temp first and moved over *filename*
    when complete.

    :param filename: the name of the file to create
    :type filename: string
    :param snpreader: The data that should be written to disk.
    :type snpreader: :class:`SnpReader`
    :param standardizer: Standardizer whose ``standardize`` method is applied to the data before it is stored.
        When *snpreader* has a ``val`` attribute, ``standardize`` is called on *snpreader* itself
        (NOTE(review): this presumably changes the caller's data in place -- confirm).
    :param order: {'A' (default), 'F', 'C'} -- order of the stored array. 'A' means the order reported by
        ``PstMemMap._order`` when *snpreader* has a ``val`` attribute; otherwise 'F'.
    :type order: string
    :param dtype: data-type of the stored array. ``None`` (default) means the dtype of ``snpreader.val``
        when available; otherwise ``np.float64``.
    :type dtype: data-type
    :param block_size: The number of SNPs to read in a batch from *snpreader*. Defaults to about
        100,000 divided by the row count (at least 1).
    :type block_size: number
    :param num_threads: passed through to the ``read`` and ``standardize`` calls.
    :type num_threads: None or int
    :rtype: :class:`.SnpMemMap`

    >>> import pysnptools.util as pstutil
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> from pysnptools.snpreader import Bed, SnpMemMap
    >>> bed_file = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
    >>> bed = Bed(bed_file)
    >>> pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.snp.memmap") #LATER should we just promise to create directories?
    >>> SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap",bed)      # Write bed in SnpMemMap format
    SnpMemMap('tempdir/toydata.5chrom.snp.memmap')
    """
    block_size = block_size or max((100_000) // max(1, snpreader.row_count), 1)

    # Resolve 'A' order and a missing dtype from the input when it holds its values in memory.
    if hasattr(snpreader, 'val'):
        if order == 'A':
            order = PstMemMap._order(snpreader)
        dtype = dtype or snpreader.val.dtype
    else:
        if order == 'A':
            order = 'F'
        dtype = dtype or np.float64
    dtype = np.dtype(dtype)

    temp_filename = filename + '.temp'
    snpmemmap = SnpMemMap.empty(iid=snpreader.iid,
                                sid=snpreader.sid,
                                filename=temp_filename,
                                pos=snpreader.col_property,
                                order=order,
                                dtype=dtype)

    if hasattr(snpreader, 'val'):
        # Values already in memory: standardize, then copy everything at once.
        standardizer.standardize(snpreader, num_threads=num_threads)
        snpmemmap.val[:, :] = snpreader.val
    else:
        # Otherwise read, standardize and store one block of SNPs at a time.
        sid_count = snpreader.sid_count
        with log_in_place("SnpMemMap write sid_index ", logging.INFO) as report:
            for first in range(0, sid_count, block_size):
                report('{0} of {1}'.format(first, snpreader.sid_count))
                block = snpreader[:, first:first + block_size].read(
                    order=order, dtype=dtype, num_threads=num_threads)
                standardizer.standardize(block, num_threads=num_threads)
                snpmemmap.val[:, first:first + block.sid_count] = block.val

    snpmemmap.flush()
    # Replace any existing file only now that the temp file is complete.
    if os.path.exists(filename):
        os.remove(filename)
    shutil.move(temp_filename, filename)
    logging.debug("Done writing " + filename)
    return SnpMemMap(filename)
def write(filename, distreader, order='A', dtype=None, block_size=None):
    r"""Writes a :class:`DistReader` to :class:`DistMemMap` format.

    The data is written to *filename*.temp first and moved over *filename*
    when complete.

    :param filename: the name of the file to create
    :type filename: string
    :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
       another :class:`.Bgen`.
    :type distreader: :class:`DistReader`
    :param order: {'A' (default), 'F', 'C'}, optional -- Specify the order of the ndarray. By default, will match the order of the input if knowable; otherwise, 'F'
    :type order: string or None
    :param dtype: {None (default), numpy.float64, numpy.float32}, optional -- The data-type for the :attr:`DistMemMap.val` ndarray.
         By default, will match the dtype of the input if knowable; otherwise np.float64.
    :type dtype: data-type
    :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
    :type block_size: number
    :rtype: :class:`.DistMemMap`

    >>> import pysnptools.util as pstutil
    >>> from pysnptools.distreader import Bgen, DistMemMap
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> bgen_file = example_file("pysnptools/examples/2500x100.bgen")
    >>> distreader = Bgen(bgen_file)[:,:10] #Create a reader for the first 10 SNPs
    >>> pstutil.create_directory_if_necessary("tempdir/tiny.dist.memmap")
    >>> DistMemMap.write("tempdir/tiny.dist.memmap",distreader)      # Write distreader in DistMemMap format
    DistMemMap('tempdir/tiny.dist.memmap')
    """
    # We write iid and sid in ascii for compatibility between Python 2 and Python 3 formats.
    row_ascii = np.array(distreader.row, dtype='S')  #!!!avoid this copy when not needed
    col_ascii = np.array(distreader.col, dtype='S')  #!!!avoid this copy when not needed

    block_size = block_size or max((100 * 1000) // max(1, distreader.row_count), 1)

    # Resolve 'A' order and a missing dtype from the input when it holds its values in memory.
    if hasattr(distreader, 'val'):
        order = PstMemMap._order(distreader) if order == 'A' else order
        dtype = dtype or distreader.val.dtype
    else:
        order = 'F' if order == 'A' else order
        dtype = dtype or np.float64
    dtype = np.dtype(dtype)

    # There is no instance here, so the new memory map gets its own name rather than `self`.
    memmap = PstMemMap.empty(row_ascii,
                             col_ascii,
                             filename + '.temp',
                             row_property=distreader.row_property,
                             col_property=distreader.col_property,
                             order=order,
                             dtype=dtype,
                             val_shape=3)
    if hasattr(distreader, 'val'):
        # Values already in memory: copy everything at once.
        memmap.val[:, :, :] = distreader.val
    else:
        # Otherwise read and store one block of SNPs at a time.
        start = 0
        with log_in_place("sid_index ", logging.INFO) as updater:
            while start < distreader.sid_count:
                updater('{0} of {1}'.format(start, distreader.sid_count))
                distdata = distreader[:, start:start + block_size].read(
                    order=order, dtype=dtype)
                memmap.val[:, start:start + distdata.sid_count, :] = distdata.val
                # Advance by what was actually read; the last block may be short.
                start += distdata.sid_count

    memmap.flush()
    # Replace any existing file only now that the temp file is complete.
    if os.path.exists(filename):
        os.remove(filename)
    shutil.move(filename + '.temp', filename)
    logging.debug("Done writing " + filename)
    return DistMemMap(filename)