Example #1
    def scan_local(local_directory, url=None, logging_level=logging.WARNING):
        '''
        Bootstrap a Hashdown by recursively walking a local directory and finding the local MD5 hashes.
        (A local hash might be wrong if the files are out of date or have OS-dependent line endings.)
        Typically, you'll then want to save the result to a JSON file and then edit that JSON file
        manually to remove uninteresting files.

        :param local_directory: Local directory to recursively walk
        :type local_directory: string

        :param url: URL to give to the Hashdown. (It will not be checked.)
        :type url: string

        :param logging_level: Logging level for printing progress of the walk. Defaults
               to logging.WARNING.

        :rtype: :class:`.Hashdown`

        '''
        from pysnptools.util.filecache import LocalCache

        file_to_hash = {}
        localcache = LocalCache(local_directory)
        with log_in_place("scanning", logging_level) as updater:
            for file in localcache.walk():
                updater(file)
                with localcache.open_read(file) as full_file:
                    hash = Hashdown._get_hash(full_file)
                    file_to_hash[file] = hash
        return Hashdown(url, file_to_hash=file_to_hash)
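
A minimal usage sketch for this example (the directory, URL, and the `save` call are illustrative assumptions, not taken from the code above):

    import logging
    from pysnptools.util.filecache import Hashdown

    # Hypothetical local directory and URL, for illustration only.
    hashdown = Hashdown.scan_local(
        "deldir/download",
        url="https://example.com/raw/main",
        logging_level=logging.INFO,
    )
    # Persist to JSON for manual editing, as the docstring suggests
    # (assumes Hashdown provides a save method).
    hashdown.save("deldir/demo.hashdown.json")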
Example #2
    def _read(self, row_index_or_none, col_index_or_none, order, dtype,
              force_python_only, view_ok):
        self._run_once()
        import pysnptools.util as pstutil
        dtype = np.dtype(dtype)

        if order == 'A':
            order = 'F'

        # Turn the row index into a count of index positions, e.g. all of them.
        row_index_count = len(row_index_or_none) if row_index_or_none is not None else self._iid_count
        # Turn the col index into an array of index positions, e.g. 0,1,200,2200,10.
        col_index = col_index_or_none if col_index_or_none is not None else np.arange(self._sid_count)
        batch_index = col_index // self._block_size  # find the batch index of each index position, e.g. 0,0,0,2,0
        val = np.empty((row_index_count, len(col_index)), order=order, dtype=dtype)  # allocate memory for result
        list_batch_index = list(set(batch_index))
        with log_in_place("working on snpgen batch", logging.INFO) as updater:
            for ii, i in enumerate(list_batch_index):  # for each distinct batch index, generate SNPs
                updater("{0} of {1}".format(ii, len(list_batch_index)))
                start = i * self._block_size  #e.g. 0 (then 2000)
                stop = start + self._block_size  #e.g. 1000, then 3000
                batch_val = self._get_val2(start,
                                           stop,
                                           order=order,
                                           dtype=dtype)  # generate whole batch
                a = batch_index == i  # e.g. [True,True,True,False,True], then [False,False,False,True,False]
                b = col_index[a] - start  # e.g. 0,1,200,10, then 200
                val[:, a] = (
                    batch_val[:, b]
                    if row_index_or_none is None
                    else pstutil.sub_matrix(batch_val, row_index_or_none, b)
                )

        return val
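
To make the batch arithmetic above concrete, here is a small standalone illustration (the block size and index values are the ones from the comments, not from real data):

    import numpy as np

    col_index = np.array([0, 1, 200, 2200, 10])
    block_size = 1000
    batch_index = col_index // block_size           # -> [0, 0, 0, 2, 0]
    for i in sorted(set(batch_index)):
        within_batch = col_index[batch_index == i] - i * block_size
        print(i, within_batch)                      # batch 0 -> [0 1 200 10]; batch 2 -> [200]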
Example #3
    def genwrite(
        filename,
        distreader,
        decimal_places=None,
        id_rsid_function=default_id_rsid_function,
        sample_function=default_sample_function,
        block_size=None,
    ):
        """Writes a :class:`DistReader` to Gen format

        :param filename: the name of the file to create (will also create *filename_without_gen*.sample file.)
        :type filename: string
        :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
           another :class:`.Bgen`.
        :type distreader: :class:`DistReader`
        :param decimal_places: (Default: None) Number of decimal places with which to write the text numbers. *None* writes to full precision.
        :type decimal_places: int or None
        :param id_rsid_function: Function to turn a :attr:`DistReader.sid` into a GEN (SNP) id and rsid.
           (Default: :meth:`bgen.default_id_rsid_function`.)
        :type id_rsid_function: function
        :param sample_function: Function to turn a :attr:`DistReader.iid` (a family id and an individual id) into a sample id.
           (Default: :meth:`bgen.default_sample_function`.)
        :type sample_function: function
        :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
        :type block_size: number
        :rtype: None

        >>> from pysnptools.distreader import DistHdf5, Bgen
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> hdf5_file = example_file("pysnptools/examples/toydata.snpmajor.dist.hdf5")
        >>> distreader = DistHdf5(hdf5_file)[:,:10] # A reader for the first 10 SNPs in Hdf5 format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.gen")
        >>> Bgen.genwrite("tempdir/toydata10.gen",distreader)        # Write data in GEN format
        """
        # https://www.cog-genomics.org/plink2/formats#gen
        # https://web.archive.org/web/20181010160322/http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html

        block_size = block_size or max(
            (100 * 1000) // max(1, distreader.row_count), 1)

        if decimal_places is None:
            def format_function(num):
                return f"{num}"  # full precision
        else:
            def format_function(num):
                return f"{num:.{decimal_places}f}"

        start = 0
        updater_freq = 10000
        index = -1
        with log_in_place("writing text values ", logging.INFO) as updater:
            with open(filename + ".temp", "w", newline="\n") as genfp:
                while start < distreader.sid_count:
                    distdata = distreader[:, start : start + block_size].read(view_ok=True)
                    for sid_index in range(distdata.sid_count):
                        id, rsid = id_rsid_function(distdata.sid[sid_index])
                        assert id.strip() != "", "id cannot be whitespace"
                        assert rsid.strip() != "", "rsid cannot be whitespace"
                        genfp.write("{0} {1} {2} {3} A G".format(
                            int(distdata.pos[sid_index, 0]),
                            id,
                            rsid,
                            int(distdata.pos[sid_index, 2]),
                        ))
                        for iid_index in range(distdata.iid_count):
                            index += 1
                            if (updater_freq > 1 and index > 0
                                    and index % updater_freq == 0):
                                updater("{0:,} of {1:,} ({2:.2%})".format(
                                    index,
                                    distreader.iid_count *
                                    distreader.sid_count,
                                    1.0 * index / (distreader.iid_count *
                                                   distreader.sid_count),
                                ))
                            prob_dist = distdata.val[iid_index, sid_index, :]
                            if not np.isnan(prob_dist).any():
                                s = " " + " ".join((format_function(num)
                                                    for num in prob_dist))
                                genfp.write(s)
                            else:
                                genfp.write(" 0 0 0")
                        genfp.write("\n")
                    start += distdata.sid_count
        sample_filename = os.path.splitext(filename)[0] + ".sample"
        # https://www.well.ox.ac.uk/~gav/qctool_v2/documentation/sample_file_formats.html
        with open(sample_filename, "w", newline="\n") as samplefp:
            samplefp.write("ID\n")
            samplefp.write("0\n")
            for f, i in distreader.iid:
                samplefp.write("{0}\n".format(sample_function(f, i)))

        if os.path.exists(filename):
            os.remove(filename)
        shutil.move(filename + ".temp", filename)
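
A hypothetical usage sketch (file names are illustrative): limiting `decimal_places` keeps the text output small at the cost of precision.

    from pysnptools.distreader import Bgen

    distreader = Bgen("mydata.bgen")  # assumed input file
    # Write probabilities with 5 decimal places instead of full precision.
    Bgen.genwrite("mydata.gen", distreader, decimal_places=5)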
Example #4
    def _run_once(self):
        if self._ran_once:
            return

        assert os.path.exists(self.filename), "Expect file to exist ('{0}')".format(self.filename)
        verbose = logging.getLogger().level <= logging.INFO

        self._open_bgen = open_bgen(self.filename,
                                    self._sample,
                                    verbose=verbose)
        assert (
            self._open_bgen.nvariants == 0 or self._open_bgen.nalleles[0] == 2
        ), "expect number of alleles to be 2"
        assert (
            self._open_bgen.nvariants == 0 or not self._open_bgen.phased[0]
        ), "expect data to be unphased"

        if self._col_property_key not in self._open_bgen._metadata2_memmaps:
            logging.info("Extending metadata file with PySnpTools metadata")
            assert (self._default_iid_key
                    not in self._open_bgen._metadata2_memmaps
                    and self._default_sid_key
                    not in self._open_bgen._metadata2_memmaps), "real assert"
            metadata2_temp = self._open_bgen._metadata2_path.parent / (
                self._open_bgen._metadata2_path.name + ".temp")
            if metadata2_temp.exists():
                metadata2_temp.unlink()
            metadata2_path = self._open_bgen._metadata2_path
            del self._open_bgen
            shutil.copy(metadata2_path, metadata2_temp)
            with MultiMemMap(metadata2_temp, mode="r+") as metadata2_memmaps:
                samples = metadata2_memmaps["samples"]
                row = metadata2_memmaps.append_empty(
                    self._default_iid_key,
                    shape=(len(samples), 2),
                    dtype=str(samples.dtype),
                )
                if len(samples) == 0 or "," not in samples[0]:
                    logging.info(
                        "No comma in first sample, so extending metadata file with 'no-comma' default iids"
                    )
                    row[:, 0] = "0"
                    row[:, 1] = samples
                else:
                    # Later: Do this in chunks of size about len(samples) using old np.stack(np.core.defchararray.split(samples,',',maxsplit=2)) code (see pre 8/1/2020 code)
                    with log_in_place("splitting samples on commas",
                                      logging.INFO) as updater:
                        for i, val in enumerate(samples):
                            if i % 1000 == 0:
                                updater(f"{i:,} of {len(samples):,}")
                            row[i, :] = default_iid_function(val)
                col_property = metadata2_memmaps.append_empty(
                    self._col_property_key,
                    shape=(len(metadata2_memmaps["ids"]), 3),
                    dtype="float",
                )
                col_property[:, 2] = metadata2_memmaps["positions"]
                # Do something fast if all numbers
                try:
                    col_property[:, 0] = metadata2_memmaps["chromosomes"]
                except Exception:
                    # If that doesn't work, do something slow
                    with log_in_place(
                            "converting chromosomes strings to numbers, one at a time",
                            logging.INFO) as updater:
                        for i, val in enumerate(
                                metadata2_memmaps["chromosomes"]):
                            if i % 1000 == 0:
                                updater(f"{i:,} of {len(col_property):,}")
                            try:
                                col_property[i, 0] = int(val)
                            except Exception:
                                col_property[i, 0] = 0

                rsid_list = metadata2_memmaps["rsids"]
                id_list = metadata2_memmaps["ids"]
                assert str(rsid_list.dtype).startswith("<U") and str(id_list.dtype).startswith("<U"), "real assert"
                if _all_equal_in_parts(rsid_list, "0") or _all_equal_in_parts(
                        rsid_list, ""):
                    col = metadata2_memmaps.append_empty(
                        self._default_sid_key,
                        shape=len(metadata2_memmaps["ids"]),
                        dtype=str(id_list.dtype),
                    )
                    col[:] = id_list
                else:
                    # Width for "id,rsid": id width + 1 (comma) + rsid width.
                    max_length = int(str(rsid_list.dtype)[2:]) + 1 + int(str(id_list.dtype)[2:])
                    col = metadata2_memmaps.append_empty(
                        self._default_sid_key,
                        shape=len(metadata2_memmaps["ids"]),
                        dtype=f"<U{max_length}",
                    )
                    for _, _, start, end in _parts(len(col)):
                        col[start:end] = np.char.add(
                            np.char.add(id_list[start:end], ","),
                            rsid_list[start:end])

            metadata2_path.unlink()
            shutil.copy(metadata2_temp, metadata2_path)
            self._open_bgen = open_bgen(self.filename,
                                        self._sample,
                                        verbose=verbose)
        else:
            assert (self._default_iid_key in self._open_bgen._metadata2_memmaps
                    and self._default_sid_key
                    in self._open_bgen._metadata2_memmaps), "real assert"

        self._row = self._apply_iid_function(self._open_bgen.samples)
        self._col = self._apply_sid_function(self._open_bgen.ids,
                                             self._open_bgen.rsids)
        self._col_property = self._open_bgen._metadata2_memmaps[
            self._col_property_key]
        self._assert_iid_sid_pos(check_val=False)
        self._ran_once = True
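
A sketch of the iid convention the code above relies on. This is a plausible implementation inferred from the comma-splitting branch; the real `default_iid_function` may differ:

    def default_iid_function(sample):
        # "fam1,ind1" -> ("fam1", "ind1"); samples without a comma get family "0".
        fields = sample.split(",", maxsplit=1)
        return ("0", sample) if len(fields) == 1 else (fields[0], fields[1])

    print(default_iid_function("fam1,ind1"))  # ('fam1', 'ind1')
    print(default_iid_function("sample42"))   # ('0', 'sample42')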
Example #5
    def _map_metadata(self, metafile_filepath):
        with log_in_place("metadata", logging.INFO) as updater:
            with bgen_metafile(Path(metafile_filepath)) as mf:
                nparts = mf.npartitions
                (
                    id_list,
                    rsid_list,
                    chrom_list,
                    position_list,
                    vaddr_list,
                    nalleles_list,
                    allele_ids_list,
                    ncombinations_list,
                    phased_list,
                ) = ([], [], [], [], [], [], [], [], [])

                for ipart2 in range(nparts):  # LATER multithread?
                    # LATER in notebook this message doesn't appear on one line
                    updater("step 2: part {0:,} of {1:,}".format(
                        ipart2, nparts))

                    (
                        nvariants,
                        vid,
                        rsid,
                        chrom,
                        position,
                        nalleles,
                        allele_ids,
                        offset,
                    ) = _inner_read_partition(mf, ipart2)

                    id_list.append(vid)
                    rsid_list.append(rsid)
                    chrom_list.append(chrom)
                    position_list.append(position)
                    nalleles_list.append(nalleles)
                    allele_ids_list.append(allele_ids)
                    vaddr_list.append(offset)

            # LATER use concatenate(...out=) instead
            self._ids = np.array(np.concatenate(id_list),
                                 dtype="str")  # dtype needed to make unicode
            self._rsids = np.array(np.concatenate(rsid_list), dtype="str")
            self._vaddr = np.concatenate(vaddr_list)
            self._chromosomes = np.array(np.concatenate(chrom_list),
                                         dtype="str")
            self._positions = np.concatenate(position_list)
            self._nalleles = np.concatenate(nalleles_list)
            self._allele_ids = np.array(np.concatenate(allele_ids_list),
                                        dtype="str")

            for i, vaddr0 in enumerate(self._vaddr):
                if i % 1000 == 0:
                    updater("step 3: part {0:,} of {1:,}".format(
                        i, self.nvariants))
                genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file,
                                                       vaddr0)
                ncombinations_list.append(lib.bgen_genotype_ncombs(genotype))
                phased_list.append(lib.bgen_genotype_phased(genotype))
                lib.bgen_genotype_close(genotype)

            self._ncombinations = np.array(ncombinations_list, dtype="int")
            self._phased = np.array(phased_list, dtype="bool")
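
The two arrays built in step 3 drive how much space `read` allocates per variant. An illustrative check (assuming `open_bgen` exposes `phased` and `ncombinations` as public properties, as the reader code above suggests):

    from bgen_reader import example_filepath, open_bgen

    with open_bgen(example_filepath("haplotypes.bgen"), verbose=False) as bgen:
        print(bgen.phased.all())        # True: a fully phased file
        print(bgen.ncombinations[:3])   # e.g. [4 4 4] for phased, diploid, biallelic data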
Example #6
    def read(
        self,
        index: Optional[Any] = None,
        dtype: Optional[Union[type, str]] = np.float64,
        order: Optional[str] = "F",
        max_combinations: Optional[int] = None,
        return_probabilities: Optional[bool] = True,
        return_missings: Optional[bool] = False,
        return_ploidies: Optional[bool] = False,
    ) -> Union[
        None,
        np.ndarray,
        Tuple[np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray, np.ndarray],
    ]:
        """
        Read genotype information from an :class:`open_bgen` object.

        Parameters
        ----------
        index
            An expression specifying the samples and variants to read. (See :ref:`read_examples`, below).
            Defaults to ``None``, meaning read all.
        dtype : data-type
            The desired data-type for the returned probability array.
            Defaults to :class:`numpy.float64`. Use :class:`numpy.float32` or :class:`numpy.float16`, when appropriate,
            to save 50% or 75% of memory. (See :ref:`read_notes`, below).
        order : {'F','C'}
            The desired memory layout for the returned probability array.
            Defaults to ``F`` (Fortran order, which is variant-major).
        max_combinations : int or ``None``
            The number of values to allocate for each probability distribution.
            Defaults to a number just large enough for any data in the file.
            For unphased, diploid, biallelic data, it will default to 3. For phased, diploid, biallelic data, it will
            default to 4. Any overallocated space is filled with :const:`numpy.nan`.
        return_probabilities: bool
            Read and return the probabilities for samples and variants specified.
            Defaults to ``True``.
        return_missings: bool
            Return a boolean array telling which probabilities are missing.
            Defaults to ``False``.
        return_ploidies: bool
            Read and return the ploidy for the samples and variants specified.
            Defaults to ``False``.

        Returns
        -------
        zero to three :class:`numpy.ndarray`
            always in this order:

            * a :class:`numpy.ndarray` of probabilities with ``dtype`` and shape `(nsamples_out,nvariants_out,max_combinations)`,
              if ``return_probabilities`` is ``True`` (the default). Missing data is filled with :const:`numpy.nan`.
            * a :class:`numpy.ndarray` of ``bool`` of shape `(nsamples_out,nvariants_out)`, if ``return_missings`` is ``True``
            * a :class:`numpy.ndarray` of ``int`` of shape `(nsamples_out,nvariants_out)`, if ``return_ploidies`` is ``True``


        .. _read_notes:

        Notes
        ------
        * About ``dtype``

            If you know the compression level of your BGEN file, you can sometimes save 50% or 75% on memory with ``dtype``.
            (Test with your data to confirm you are not losing any precision.) The approximate relationship is:

                * BGEN compression 1 to 10 bits: ``dtype='float16'``
                * BGEN compression 11 to 23 bits: ``dtype='float32'``
                * BGEN compression 24 to 32 bits: ``dtype='float64'`` (default)


        .. _read_examples:

        Examples
        --------
        * Index Examples

            To read all data in a BGEN file, set ``index`` to ``None``. This is the default.

            .. doctest::

                >>> import numpy as np
                >>> from bgen_reader import example_filepath, open_bgen
                >>>
                >>> with open_bgen(example_filepath("haplotypes.bgen"), verbose=False) as bgen_h:
                ...     print(bgen_h.read())  # read all
                [[[1. 0. 1. 0.]
                  [0. 1. 1. 0.]
                  [1. 0. 0. 1.]
                  [0. 1. 0. 1.]]
                <BLANKLINE>
                 [[0. 1. 1. 0.]
                  [1. 0. 0. 1.]
                  [0. 1. 0. 1.]
                  [1. 0. 1. 0.]]
                <BLANKLINE>
                 [[1. 0. 0. 1.]
                  [0. 1. 0. 1.]
                  [1. 0. 1. 0.]
                  [0. 1. 1. 0.]]
                <BLANKLINE>
                 [[0. 1. 0. 1.]
                  [1. 0. 1. 0.]
                  [0. 1. 1. 0.]
                  [1. 0. 0. 1.]]]

            To read selected variants, set ``index`` to an ``int``, a list of ``int``, a :class:`slice`, or a list of ``bool``.
            Negative integers count from the end of the data.

            .. doctest::

                >>> bgen_e = open_bgen(example_filepath("example.bgen"), verbose=False)
                >>> probs = bgen_e.read(5)  # read the variant indexed by 5.
                >>> print(probs.shape)      # print the dimensions of the returned numpy array.
                (500, 1, 3)
                >>> probs = bgen_e.read([5,6,1])  # read the variants indexed by 5, 6, and 1
                >>> print(probs.shape)
                (500, 3, 3)
                >>> probs = bgen_e.read(slice(5)) #read the first 5 variants
                >>> print(probs.shape)
                (500, 5, 3)
                >>> probs = bgen_e.read(slice(2,5)) #read variants from 2 (inclusive) to 5 (exclusive)
                >>> print(probs.shape)
                (500, 3, 3)
                >>> probs = bgen_e.read(slice(2,None)) # read variants starting at index 2.
                >>> print(probs.shape)
                (500, 197, 3)
                >>> probs = bgen_e.read(slice(None,None,10)) #read every 10th variant
                >>> print(probs.shape)
                (500, 20, 3)
                >>> print(np.unique(bgen_e.chromosomes)) # print unique chrom values
                ['01']
                >>> probs = bgen_e.read(bgen_e.chromosomes=='01') # read all variants in chrom 1
                >>> print(probs.shape)
                (500, 199, 3)
                >>> probs = bgen_e.read(-1) # read the last variant
                >>> print(probs.shape)
                (500, 1, 3)

            To read selected samples, set ``index`` to a tuple of the form ``(sample_index,None)``, where ``sample_index`` follows the form
            of ``variant_index``, above.

            .. doctest::

                >>> probs = bgen_e.read((0,None)) # Read 1st sample (across all variants)
                >>> print(probs.shape)
                (1, 199, 3)
                >>> probs = bgen_e.read((slice(None,None,10),None)) # Read every 10th sample
                >>> print(probs.shape)
                (50, 199, 3)

            To read selected samples and selected variants, set ``index`` to a tuple of the form ``(sample_index,variant_index)``,
            where ``sample_index`` and ``variant_index`` follow the forms above.

            .. doctest::

                >>> # Read samples 10 (inclusive) to 20 (exclusive) and the first 15 variants.
                >>> probs = bgen_e.read((slice(10,20),slice(15)))
                >>> print(probs.shape)
                (10, 15, 3)
                >>> #read last and 2nd-to-last sample and the last variant
                >>> probs = bgen_e.read(([-1,-2],-1))
                >>> print(probs.shape)
                (2, 1, 3)

        * Multiple Return Example

            Read probabilities, missingness, and ploidy. Print all unique ploidy values.

            .. doctest::

                >>> probs,missing,ploidy = bgen_e.read(return_missings=True,return_ploidies=True)
                >>> print(np.unique(ploidy))
                [2]
        """
        # LATER could allow strings (variant names) and lists of strings
        if not hasattr(self, "_bgen_context_manager"):
            raise ValueError("I/O operation on a closed file")

        # Can't use 'or' here because it would treat 0 as False.
        max_combinations = max_combinations if max_combinations is not None else self.max_combinations

        samples_index, variants_index = self._split_index(index)

        samples_index = self._sample_range[samples_index]  # converts slice(), etc. to an array of numbers
        vaddr = self._vaddr[variants_index]
        ncombinations = self._ncombinations[variants_index]

        if len(ncombinations) > 0 and max(ncombinations) > max_combinations:
            raise ValueError(
                "Need at least {0} max_combinations, but only {1} given".
                format(max(ncombinations), max_combinations))

        # allocating prob_buffer only when its size changes makes reading
        # 10x5M data 30% faster
        if return_probabilities:
            val = np.full(
                (len(samples_index), len(vaddr), max_combinations),
                np.nan,
                dtype=dtype,
                order=order,
            )
            prob_buffer = None
        if return_missings:
            missing_val = np.full((len(samples_index), len(vaddr)),
                                  False,
                                  dtype="bool",
                                  order=order)
        if return_ploidies:
            ploidy_val = np.full((len(samples_index), len(vaddr)),
                                 0,
                                 dtype="int",
                                 order=order)

        # LATER multithread?
        approx_read_seconds = len(vaddr) / 20_000.0 + len(vaddr) * self.nsamples / 2_000_000.0
        vaddr_per_second = max(1, len(vaddr) // int(max(1, approx_read_seconds)))
        # "Logarithmic rounding" makes the progress step a nice round number, e.g. 999 -> 1000.
        vaddr_per_second = 10 ** int(math.log10(vaddr_per_second) + 0.5)
        with log_in_place("reading", logging.INFO) as updater:
            for out_index, vaddr0 in enumerate(vaddr):
                if out_index % vaddr_per_second == 0:
                    updater("part {0:,} of {1:,}".format(
                        out_index, len(vaddr)))

                genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file,
                                                       vaddr0)

                if return_probabilities:
                    if (prob_buffer is None or
                            ncombinations[out_index] != prob_buffer.shape[-1]):
                        prob_buffer = np.full(
                            (len(self._samples), ncombinations[out_index]),
                            np.nan,
                            order="C",
                            dtype="float64",
                        )
                    lib.bgen_genotype_read(
                        genotype, ffi.cast("double *",
                                           prob_buffer.ctypes.data))
                    val[:, out_index, :ncombinations[out_index]] = (
                        prob_buffer if (samples_index is self._sample_range)
                        else prob_buffer[samples_index, :])

                if return_missings:
                    missing_val[:, out_index] = [
                        lib.bgen_genotype_missing(genotype, i)
                        for i in samples_index
                    ]

                if return_ploidies:
                    ploidy_val[:, out_index] = [
                        lib.bgen_genotype_ploidy(genotype, i)
                        for i in samples_index
                    ]

                lib.bgen_genotype_close(genotype)

        result_array = (([val] if return_probabilities else []) +
                        ([missing_val] if return_missings else []) +
                        ([ploidy_val] if return_ploidies else []))
        if len(result_array) == 1:
            return result_array[0]
        else:
            return tuple(result_array)
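
A hypothetical follow-up to the docstring's ``dtype`` note, continuing with the ``bgen_e`` reader from the examples above (values assume the same example file):

    probs16 = bgen_e.read(dtype="float16")       # 2 bytes per probability
    probs64 = bgen_e.read()                      # default float64, 8 bytes per probability
    print(probs16.shape == probs64.shape)        # True
    print(probs16.nbytes * 4 == probs64.nbytes)  # True: 75% of the memory saved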
Example #7
    def write(filename,
              snpreader,
              standardizer=Identity(),
              order='A',
              dtype=None,
              block_size=None,
              num_threads=None):
        """Writes a :class:`SnpReader` to :class:`SnpMemMap` format.

        :param filename: the name of the file to create
        :type filename: string
        :param snpreader: The data that should be written to disk.
        :type snpreader: :class:`SnpReader`
        :param standardizer: optional -- Specify standardization to be applied before writing. Default is :class:`.Identity`.
        :type standardizer: :class:`.Standardizer`
        :param order: {'A' (default), 'F', 'C'}, optional -- Specify the order of the ndarray. By default, will match the order of the input if knowable; otherwise, 'F'.
        :type order: string or None
        :param dtype: {None (default), numpy.float64, numpy.float32}, optional -- The data-type for the :attr:`SnpMemMap.val` ndarray. By default, will match the dtype of the input if knowable; otherwise numpy.float64.
        :type dtype: data-type
        :param block_size: The number of SNPs to read in a batch from *snpreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
        :type block_size: number
        :rtype: :class:`.SnpMemMap`

        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> from pysnptools.snpreader import Bed, SnpMemMap
        >>> bed_file = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
        >>> bed = Bed(bed_file)
        >>> pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.snp.memmap") #LATER should we just promise to create directories?
        >>> SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap",bed)      # Write bed in SnpMemMap format
        SnpMemMap('tempdir/toydata.5chrom.snp.memmap')
        """
        block_size = block_size or max(
            (100_000) // max(1, snpreader.row_count), 1)

        if hasattr(snpreader, 'val'):
            order = PstMemMap._order(snpreader) if order == 'A' else order
            dtype = dtype or snpreader.val.dtype
        else:
            order = 'F' if order == 'A' else order
            dtype = dtype or np.float64
        dtype = np.dtype(dtype)

        snpmemmap = SnpMemMap.empty(iid=snpreader.iid,
                                    sid=snpreader.sid,
                                    filename=filename + '.temp',
                                    pos=snpreader.col_property,
                                    order=order,
                                    dtype=dtype)
        if hasattr(snpreader, 'val'):
            standardizer.standardize(snpreader, num_threads=num_threads)
            snpmemmap.val[:, :] = snpreader.val
        else:
            with log_in_place("SnpMemMap write sid_index ",
                              logging.INFO) as updater:
                for start in range(0, snpreader.sid_count, block_size):
                    updater('{0} of {1}'.format(start, snpreader.sid_count))
                    snpdata = snpreader[:, start:start + block_size].read(
                        order=order, dtype=dtype, num_threads=num_threads)
                    standardizer.standardize(snpdata, num_threads=num_threads)
                    snpmemmap.val[:, start : start + snpdata.sid_count] = snpdata.val

        snpmemmap.flush()
        if os.path.exists(filename):
            os.remove(filename)
        shutil.move(filename + '.temp', filename)
        logging.debug("Done writing " + filename)
        return SnpMemMap(filename)
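
A hypothetical follow-up to the doctest above: reopen the file and read from it (``view_ok=True`` lets the read return a view of the memory-mapped array instead of a copy).

    snp_mem_map = SnpMemMap('tempdir/toydata.5chrom.snp.memmap')
    snpdata = snp_mem_map.read(view_ok=True)
    print(snpdata.val.shape)  # (iid_count, sid_count)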
Example #8
    def write(filename, distreader, order='A', dtype=None, block_size=None):
        """Writes a :class:`DistReader` to :class:`DistMemMap` format.

        :param filename: the name of the file to create
        :type filename: string
        :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
           another :class:`.Bgen`.
        :type distreader: :class:`DistReader`
        :param order: {'A' (default), 'F', 'C'}, optional -- Specify the order of the ndarray. By default, will match the order of the input if knowable; otherwise, 'F'
        :type order: string or None
        :param dtype: {None (default), numpy.float64, numpy.float32}, optional -- The data-type for the :attr:`DistMemMap.val` ndarray.
             By default, will match the dtype of the input if knowable; otherwise numpy.float64.
        :type dtype: data-type
        :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
        :type block_size: number
        :rtype: :class:`.DistMemMap`

        >>> import pysnptools.util as pstutil
        >>> from pysnptools.distreader import Bgen, DistMemMap
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bgen_file = example_file("pysnptools/examples/2500x100.bgen")
        >>> distreader = Bgen(bgen_file)[:,:10] #Create a reader for the first 10 SNPs
        >>> pstutil.create_directory_if_necessary("tempdir/tiny.dist.memmap")
        >>> DistMemMap.write("tempdir/tiny.dist.memmap",distreader)      # Write distreader in DistMemMap format
        DistMemMap('tempdir/tiny.dist.memmap')

        """

        # We write iid and sid in ascii for compatibility between Python 2 and Python 3 formats.
        row_ascii = np.array(distreader.row, dtype='S')  # !!! avoid this copy when not needed
        col_ascii = np.array(distreader.col, dtype='S')  # !!! avoid this copy when not needed

        block_size = block_size or max(
            (100 * 1000) // max(1, distreader.row_count), 1)

        if hasattr(distreader, 'val'):
            order = PstMemMap._order(distreader) if order == 'A' else order
            dtype = dtype or distreader.val.dtype
        else:
            order = 'F' if order == 'A' else order
            dtype = dtype or np.float64
        dtype = np.dtype(dtype)

        self = PstMemMap.empty(row_ascii,
                               col_ascii,
                               filename + '.temp',
                               row_property=distreader.row_property,
                               col_property=distreader.col_property,
                               order=order,
                               dtype=dtype,
                               val_shape=3)
        if hasattr(distreader, 'val'):
            self.val[:, :, :] = distreader.val
        else:
            start = 0
            with log_in_place("sid_index ", logging.INFO) as updater:
                while start < distreader.sid_count:
                    updater('{0} of {1}'.format(start, distreader.sid_count))
                    distdata = distreader[:, start:start + block_size].read(
                        order=order, dtype=dtype)
                    self.val[:, start : start + distdata.sid_count, :] = distdata.val
                    start += distdata.sid_count

        self.flush()
        if os.path.exists(filename):
            os.remove(filename)
        shutil.move(filename + '.temp', filename)
        logging.debug("Done writing " + filename)
        return DistMemMap(filename)
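
A hypothetical follow-up to the doctest above: reopen the file and inspect the stored distribution values (the third axis holds the three genotype probabilities, per ``val_shape=3`` above).

    dist_mem_map = DistMemMap('tempdir/tiny.dist.memmap')
    distdata = dist_mem_map.read(view_ok=True)
    print(distdata.val.shape)  # (iid_count, sid_count, 3)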