Example #1
def snpsA(seed, iid_count, sid_count, use_distributed):
    import numpy as np
    from pysnptools.snpreader import Bed
    from pysnptools.snpreader import DistributedBed
    from pysnptools.snpreader import SnpData
    from pysnptools.snpreader import SnpGen

    chrom_count = 10
    global cache_top
    if use_distributed:
        test_snp_path = (
            cache_top /
            f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}_db")
    else:
        test_snp_path = (
            cache_top /
            f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}.bed")
    count_A1 = False
    if not test_snp_path.exists():
        snpgen = SnpGen(
            seed=seed,
            iid_count=iid_count,
            sid_count=sid_count,
            chrom_count=chrom_count,
            block_size=1000,
        )
        if use_distributed:
            test_snps = DistributedBed.write(str(test_snp_path), snpgen)
        else:
            test_snps = Bed.write(str(test_snp_path),
                                  snpgen.read(dtype="float32"),
                                  count_A1=count_A1)
    else:
        if use_distributed:
            test_snps = DistributedBed(str(test_snp_path))
        else:
            test_snps = Bed(str(test_snp_path), count_A1=count_A1)

    np.random.seed(seed)
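    # Simulate a phenotype (mean 2, sd 3) and two covariates (mean -3, sd 2).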
    pheno = SnpData(
        iid=test_snps.iid,
        sid=["pheno"],
        val=np.random.randn(test_snps.iid_count, 1) * 3 + 2,
    )
    covar = SnpData(
        iid=test_snps.iid,
        sid=["covar1", "covar2"],
        val=np.random.randn(test_snps.iid_count, 2) * 2 - 3,
    )

    return test_snps, pheno, covar
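
A minimal usage sketch. It assumes the call lives in the same module as snpsA, since snpsA reads the module-level cache_top; the directory name and the argument values below are hypothetical:

from pathlib import Path

cache_top = Path("local_cache")  # hypothetical module-level cache directory read by snpsA
test_snps, pheno, covar = snpsA(seed=0, iid_count=100, sid_count=500, use_distributed=True)
print(test_snps.sid_count, pheno.val.shape, covar.val.shape)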
Example #2
    def test_one_chrom(self):
        logging.info("test_one_chrom")

        output_file = self.file_name("one_chrom")

        storage = LocalCache("local_cache/one_chrom")
        test_storage = storage.join('test_snps')
        test_storage.rmtree('')
        # Test only on chromosome 3
        test_snps3 = self.bed[:, self.bed.pos[:, 0] == 3]
        test_snps3_dist = DistributedBed.write(test_storage,
                                               test_snps3,
                                               piece_per_chrom_count=2)

        for test_snps, ref, clear_cache, name in (
            (test_snps3, "old_one", True, "Run with just chrom3"),
            (test_snps3_dist, "old_one", True,
             "Run with distributed test SNPs"),
            (test_snps3, "old_one", False, "Run with just chrom3 (use cache)"),
            (test_snps3_dist, "old_one", False,
             "Run with distributed test SNPs (use cache)"),
        ):
            logging.info("=========== " + name + " ===========")
            results_df = single_snp_scale(
                test_snps=test_snps,
                pheno=self.phen_fn,
                covar=self.cov_fn,
                K0=self.bed,
                cache=self._cache_dict(storage, clear_cache=clear_cache),
                output_file_name=output_file,
            )
            self.compare_files(results_df, ref)
Example #3
    def too_slow_test_peertopeer(self):
        logging.info("test_peertopeer")

        output_file = self.file_name("peertopeer")

        def id_and_path_function():
            from pysnptools.util.filecache import ip_address_pid
            ip_pid = ip_address_pid()
            # Need to put the 'cache_top' here explicitly.
            return ip_pid, 'peertopeer/{0}'.format(ip_pid)

        storage = PeerToPeer(common_directory='peertopeer/common',
                             id_and_path_function=id_and_path_function)
        test_snps_cache = storage.join('test_snps')
        test_snps_cache.rmtree()
        test_snps = DistributedBed.write(test_snps_cache,
                                         self.bed,
                                         piece_per_chrom_count=2)

        runner = LocalMultiProc(taskcount=5)  # Run on 5 additional Python processes

        for clear_cache in (True, False):
            if clear_cache:
                storage.join('cache').rmtree()
            results_df = single_snp_scale(test_snps=test_snps,
                                          pheno=self.phen_fn,
                                          covar=self.cov_fn,
                                          cache=storage.join('cache'),
                                          output_file_name=output_file,
                                          runner=runner)
            self.compare_files(results_df, "old")
Example #4
    def test_local_distribute(self):
        logging.info("test_local_distribute")
        force_python_only = False

        output_file = self.file_name("local_distribute")

        storage = LocalCache("local_cache/local_distribute")
        test_storage = storage.join('test_snps')
        test_storage.rmtree('')
        test_snps = DistributedBed.write(test_storage,
                                         self.bed,
                                         piece_per_chrom_count=2)

        results_df = single_snp_scale(test_snps=test_snps,
                                      pheno=self.phen_fn,
                                      covar=self.cov_fn,
                                      G0=self.bed,
                                      cache=self._cache_dict(storage,
                                                             clear_cache=True),
                                      output_file_name=output_file,
                                      force_python_only=force_python_only)

        self.compare_files(results_df, "old")

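        # The second run leaves the cache in place (clear_cache=False) and reads
        # the test SNPs directly from the original Bed file.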
        results_df = single_snp_scale(test_snps=self.bed,
                                      pheno=self.phen_fn,
                                      covar=self.cov_fn,
                                      G0=self.bed,
                                      cache=self._cache_dict(
                                          storage, clear_cache=False),
                                      output_file_name=output_file)
        self.compare_files(results_df, "old")
Example #5
    def test1(self):
        logging.info("in TestDistributedBed test1")
        import os
        import shutil
        from pysnptools.snpreader import SnpGen, DistributedBed, Bed
        snpgen = SnpGen(seed=0, iid_count=100, sid_count=100)

        temp_dir = 'tempdir/distributed_bed_test1'
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        distributed_bed = DistributedBed.write(temp_dir,
                                               snpgen,
                                               piece_per_chrom_count=2)
        snpdata = distributed_bed.read()

        ref1 = DistributedBed(
            os.path.dirname(os.path.realpath(__file__)) +
            '/../../tests/datasets/distributed_bed_test1').read()
        assert snpdata.allclose(ref1, equal_nan=True)

        ref2 = Bed(os.path.dirname(os.path.realpath(__file__)) +
                   '/../../tests/datasets/distributed_bed_test1_X',
                   count_A1=False).read()
        assert snpdata.allclose(ref2, equal_nan=True)
Example #6
    def test_one_fast(self):
        logging.info("test_one_fast")

        output_file = self.file_name("one_fast")

        storage = LocalCache("local_cache")
        test_storage = storage.join('one_fast')
        test_storage.rmtree()
        # Test only on chromosome 3
        test_snps3 = self.bed[:, self.bed.pos[:, 0] == 3]
        test_snps3_dist = DistributedBed.write(test_storage,
                                               test_snps3,
                                               piece_per_chrom_count=2)

        results_df = single_snp_scale(test_snps=test_snps3_dist,
                                      pheno=self.phen_fn,
                                      covar=self.cov_fn,
                                      G0=self.bed,
                                      output_file_name=output_file)
        self.compare_files(results_df, "old_one")
Example #7
    def write(
        storage,
        snpreader,
        piece_per_chrom_count=1,
        updater=None,
        runner=None
    ):  # !!! might want to set piece_per_chrom_count such that each piece is a certain size
        '''
        Uploads data from any :class:`.Bed`-like source to cluster storage for efficient retrieval later.
        If some of the contents already exist in storage, uploading that part of the contents is skipped. (To avoid this behavior,
        clear the storage.)

        :param storage: Tells where to store SNP data.
                      A string can be given and will be interpreted as the path of a local directory to use for storage. (The local
                      directory will **not** be automatically erased and so must be user managed.) 
                      A :class:`.FileCache` instance can be given, which provides a
                      method to specify cluster-distributed storage. (:class:`.FileCache`'s will **not** be automatically erased and must be user managed.)
                      If `None`, the storage will be in an automatically-erasing temporary directory. (If the TEMP environment variable is set, Python places the temp directory under it.)
                      
        :type storage: string or :class:`.FileCache` or None.

        :param snpreader: A :class:`.Bed` or other :class:`.SnpReader` with values of 0,1,2, or missing.
            (Note that this differs from most other `write` methods that take a :class:`.SnpData`)
        :type snpreader: :class:`.SnpReader`

        :param piece_per_chrom_count: The number of pieces in which to store the data from each chromosome. Data is split across
            SNPs. For example, if `piece_per_chrom_count` is set to 100 and 22 chromosomes are uploaded, then data will be stored in 2200 pieces. Later, when data is requested,
            only the pieces necessary for the request will be downloaded to local storage.
        :type piece_per_chrom_count: A number

        :param updater: A single-argument function to write logging messages to, for example, the function created by :func:`.log_in_place`.
        :type updater: A function or lambda

        :param runner: a :class:`.Runner`, optional: Tells how to run.
            (Note that :class:`.Local` and :class:`.LocalMultiProc` are good options.)
            If not given, the function is run locally.
        :type runner: :class:`.Runner`

        :rtype: DistributedBed

        >>> from pysnptools.snpreader import DistributedBed, Bed
        >>> import shutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> directory = 'tempdir/toydataSkip10.distributedbed'
        >>> if os.path.exists(directory):
        ...     shutil.rmtree(directory)
        >>> bedfile = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
        >>> snpreader = Bed(bedfile,count_A1=False)[:,::10]  # Read every 10th SNP from Bed format
        >>> DistributedBed.write(directory,snpreader,piece_per_chrom_count=5)  # Write data in DistributedBed format
        DistributedBed(LocalCache('tempdir/toydataSkip10.distributedbed'))
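
        The written data can be reopened later from the same storage; a minimal
        follow-up sketch, reusing ``directory`` from above:

        >>> reopened = DistributedBed(directory)  # reopen the uploaded pieces
        >>> snpdata = reopened.read()             # pieces are fetched to local storage, then read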


        '''
        from pysnptools.util import _file_transfer_reporter
        from pysnptools.util.filecache import FileCache

        count_A1 = True  # Use the same count_A1 for reading and writing so that nothing will change.
        snpreader = _snps_fixup(snpreader, count_A1=count_A1)

        storage = FileCache._fixup(storage)

        chrom_set = sorted(set(snpreader.pos[:, 0]))
        for chrom in chrom_set:
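            # NaN != NaN, so 'chrom == chrom' is False for missing (NaN) chromosome values.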
            assert chrom == chrom and chrom == int(chrom), (
                "DistributedBed.write expects all chromosomes to be integers "
                "(not '{0}')".format(chrom))
        with _file_transfer_reporter("DistributedBed.write",
                                     size=0,
                                     updater=updater) as updater2:

            def mapper_closure(chrom):
                chrom_reader = snpreader[:, snpreader.pos[:, 0] == chrom]
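                # Split this chromosome's SNPs into piece_per_chrom_count
                # contiguous slices, each written as its own bed/bim/fam triple.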

                def nested_closure(piece_per_chrom_index):
                    start = chrom_reader.sid_count * piece_per_chrom_index // piece_per_chrom_count
                    stop = chrom_reader.sid_count * (piece_per_chrom_index + 1) // piece_per_chrom_count
                    piece_reader = chrom_reader[:, start:stop]
                    _piece_name_list = [
                        "chrom{0}.piece{1}of{2}.{3}".format(
                            int(chrom), piece_per_chrom_index,
                            piece_per_chrom_count, suffix)
                        for suffix in ['bim', 'fam', 'bed']
                    ]
                    exist_list = [
                        storage.file_exists(_piece_name)
                        for _piece_name in _piece_name_list
                    ]
                    if sum(exist_list) < 3:  # if all three of BIM/FAM/BED already exist, skip the upload
                        for i in range(3):  # if only one or two of BIM/FAM/BED exist, remove them first
                            if exist_list[i]:
                                storage.remove(_piece_name_list[i])
                        _Distributed1Bed.write(_piece_name_list[-1],
                                               storage,
                                               piece_reader.read(),
                                               count_A1=count_A1,
                                               updater=updater2)
                    return _piece_name_list[-1]

                return map_reduce(
                    range(piece_per_chrom_count),
                    mapper=nested_closure,
                )

            list_list_pair = map_reduce(
                chrom_set,
                nested=mapper_closure,
                runner=runner,
            )

        reader_name_list = []
        reader_list = []
        for chrom_result in list_list_pair:
            for _piece_name in chrom_result:
                reader_name_list.append(_piece_name)
                reader_list.append(_Distributed1Bed(_piece_name, storage))

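        # Persist the ordered piece list and the merged SNP metadata so that a
        # later DistributedBed(storage) can locate the pieces without re-reading them.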
        _metadatanpz = "metadata.npz"
        with storage.open_write(_metadatanpz) as local_metadatanpz:
            _reader_name_listnpz = "reader_name_list.npz"
            with storage.open_write(
                    _reader_name_listnpz) as local_reader_name_listnpz:
                reader_name_list_ascii = np.array(reader_name_list, dtype='S')
                np.savez(local_reader_name_listnpz,
                         reader_name_list=reader_name_list_ascii)
                if os.path.exists(local_metadatanpz):
                    os.remove(local_metadatanpz)
                _MergeSIDs(reader_list,
                           cache_file=local_metadatanpz,
                           skip_check=True)

        return DistributedBed(storage)
Example #8
def getTestSuite():
    test_suite = unittest.TestSuite([])
    test_suite.addTests(
        unittest.TestLoader().loadTestsFromTestCase(TestDistributedBed))
    return test_suite


if __name__ == "__main__":
    import doctest
    logging.basicConfig(level=logging.INFO)

    if False:
        from pysnptools.snpreader import DistributedBed, Bed
        import shutil
        directory = 'tempdir/toydataSkip10.distributedbed'
        if os.path.exists(directory):
            shutil.rmtree(directory)
        # Read every 10th SNP from Bed format
        snpreader = Bed('../examples/toydata.5chrom.bed', count_A1=False)[:, ::10]
        # Write the data in DistributedBed format
        DistributedBed.write(directory, snpreader, piece_per_chrom_count=5)

    result = doctest.testmod(optionflags=doctest.ELLIPSIS)
    assert result.failed == 0, "failed doc test: " + __file__

    suites = getTestSuite()
    r = unittest.TextTestRunner(failfast=True)
    ret = r.run(suites)
    assert ret.wasSuccessful()