def _upload_big_file(self, big_size, path_list):
    """
    Make sure a file of ``big_size`` bytes exists in the container under every
    prefix in ``path_list``, creating and uploading a local file where needed.

    :param big_size: Size, in bytes, of the file to create and upload.
    :param path_list: Container path prefixes. The target for each prefix is
        ``"<prefix>/big<big_size>.txt"``.
    :return: The result of ``map_reduce`` over ``path_list`` (no reducer is
        supplied), i.e. the container paths produced by the mapper.
    """

    def mapper(path):
        azure_path = "{0}/big{1}.txt".format(path, big_size)
        if not self.container.file_exists(azure_path):
            logging.info("Uploading '{0}'".format(azure_path))
            # Scratch location under the system temp directory; the hash of
            # os.times() is presumably meant to keep concurrent uploads apart.
            temp_parent = os.path.join(tempfile.gettempdir(), format(hash(os.times())))
            file_name = temp_parent + azure_path
            logging.info("Creating {0}".format(file_name))
            pstutil.create_directory_if_necessary(file_name)
            with open(file_name, "wb") as fp:
                # Preallocate the local file to its full size: seek to the last
                # byte and write one byte so the file is exactly big_size long.
                if big_size > 0:
                    fp.seek(big_size - 1)
                    fp.write(b"\0")  # bytes, because the file is opened in binary mode
            try:
                with _file_transfer_reporter("'{0}'".format(azure_path), big_size) as updater:
                    self.container.upload(file_name, azure_path, updater=updater)
            finally:
                # Remove the scratch file even when the upload fails.
                os.remove(file_name)
        return azure_path

    # One thread per prefix so the uploads run concurrently.
    azure_path_list = map_reduce(path_list, mapper=mapper, runner=LocalMultiThread(len(path_list)))
    return azure_path_list
def prime_search1(start, stop, runner):
    '''
    Find the primes in ``range(start, stop)`` by testing every candidate
    independently, with the work distributed by ``runner``.

    >>> from pysnptools.util.mapreduce1.examples import prime_search1
    >>> from pysnptools.util.mapreduce1.runner import LocalMultiProc
    >>> prime_search1(2,10,LocalMultiProc(4))
    [2, 3, 5, 7]
    '''
    from pysnptools.util.mapreduce1 import map_reduce

    def keep_if_prime(candidate):
        # None marks a non-prime; the reducer strips these markers out.
        return candidate if is_prime(candidate) else None

    def drop_nones(candidates):
        return [value for value in candidates if value is not None]

    return map_reduce(
        range(start, stop),
        mapper=keep_if_prime,
        reducer=drop_nones,
        name="prime_search1",
        runner=runner,
    )
def mmultfile_ata(memmap_lambda, writer, sid, work_count, name, runner, force_python_only=False):
    """
    Assemble a ``len(sid)`` x ``len(sid)`` kernel from pieces computed by
    ``mmultfile_ata_piece`` and hand the assembled kernel to ``writer``.

    The columns are cut into ``2 * work_count`` pieces. Work item ``w``
    computes piece ``w`` and piece ``piece_count - 1 - w`` (presumably to even
    out the per-item cost, since each piece covers rows from its own start to
    the end).

    :param memmap_lambda: Zero-argument callable returning an object with
        ``filename`` and ``offset`` attributes describing the input matrix.
    :param writer: Callable applied to the assembled ``KernelData``; its
        return value is the reducer's result.
    :param sid: Sequence of identifiers; each is used for both halves of the
        kernel's iid pairs.
    :param work_count: Number of map_reduce work items.
    :param name: Name passed to ``map_reduce``.
    :param runner: Runner passed to ``map_reduce``.
    :param force_python_only: Forwarded to ``mmultfile_ata_piece``.
    :return: The result of ``map_reduce`` (the value ``writer`` returned).
        Previously this value was computed but dropped.
    """
    sid_count = len(sid)
    piece_count = work_count * 2
    log_frequency = 1

    def debatch_closure(piece_index):
        # First column index covered by the given piece.
        return sid_count * piece_index // piece_count

    def mapper_closure(work_index):
        memmap = memmap_lambda()
        piece_index0 = work_index
        piece_index1 = piece_count - work_index - 1
        gtg_piece0 = mmultfile_ata_piece(memmap.filename, memmap.offset, piece_index0, piece_count,
                                         log_frequency=log_frequency,
                                         force_python_only=force_python_only)
        gtg_piece1 = mmultfile_ata_piece(memmap.filename, memmap.offset, piece_index1, piece_count,
                                         log_frequency=log_frequency,
                                         force_python_only=force_python_only)
        return [[piece_index0, gtg_piece0], [piece_index1, gtg_piece1]]

    def reducer_closure(result_result_sequence):
        logging.info("starting ata reducer")
        iid = [[value, value] for value in sid]
        gtg_data = KernelData(iid=iid, val=np.zeros((sid_count, sid_count)))
        for result_result in result_result_sequence:
            for piece_index, gtg_piece in result_result:
                logging.info("combining ata reducer {0}".format(piece_index))
                start = debatch_closure(piece_index)
                stop = debatch_closure(piece_index + 1)
                # The piece fills rows start.. of its own column band ...
                gtg_data.val[start:, start:stop] = gtg_piece
                # ... and the part below the band's diagonal block is mirrored
                # into the matching rows to the right of that block.
                gtg_data.val[start:stop, start + gtg_piece.shape[1]:] = gtg_piece[gtg_piece.shape[1]:, :].T
        result = writer(gtg_data)
        return result

    # range (not the Python-2-only xrange) keeps this runnable on Python 3.
    gtg_npz_lambda = map_reduce(range(work_count),
                                mapper=mapper_closure,
                                reducer=reducer_closure,
                                runner=runner,
                                name=name,
                                input_files=[],
                                output_files=[])
    return gtg_npz_lambda
def holder1(n, runner):
    """Return the sum of the squares of ``range(n)``, computed with map_reduce on ``runner``."""

    def square(value):
        return value * value

    def total(squares):
        return sum(squares)

    return map_reduce(
        range(n),
        mapper=square,
        reducer=total,
        runner=runner,
    )
def holder1(n, runner):
    """
    Sum the integer value of the ``TEST_ENVIRON`` environment variable once per
    work item, plus once more in the reducer.

    Both the mapper and the reducer read the variable wherever they run, so
    the result shows whether the runner propagates the environment.
    """

    def read_test_environ():
        return int(os.environ['TEST_ENVIRON'])

    def per_item(_unused_index):
        return read_test_environ()

    def combine(values):
        subtotal = sum(values)
        return subtotal + read_test_environ()

    return map_reduce(range(n), mapper=per_item, reducer=combine, runner=runner)
def upload(self, local_path, azure_path, do_sync_date=True, updater=None):
    """
    Upload a local file to the container.

    The file is split into ``self._get_piece_count(size)`` byte ranges and
    each range is stored as its own blob named
    ``"<azure_path>/<piece_index>.<piece_count>"``. The work item for the last
    piece additionally creates an empty ``"<azure_path>/exists.txt"`` blob.

    :param local_path: Path of an existing local file.
    :param azure_path: Destination path in the container. Anything already
        stored there is removed first.
    :param do_sync_date: If true, call ``self._sync_date(azure_path, local_path)``
        after the upload.
    :param updater: Optional progress callback forwarded to
        ``_file_transfer_reporter``.
    :raises AssertionError: if ``local_path`` does not exist.
    """
    assert os.path.exists(local_path), 'Expect local_path to exist: "{0}"'.format(local_path)

    #self._run_once()
    self.remove(azure_path)
    size = os.path.getsize(local_path)
    piece_count = self._get_piece_count(size)

    with _file_transfer_reporter("upload", size, updater=updater) as updater2:

        def mapper_closure(piece_index):
            # Integer arithmetic makes the ranges contiguous and cover [0, size).
            start = size * piece_index // piece_count
            stop = size * (piece_index + 1) // piece_count
            shard_size = stop - start
            blob_name = "{0}/{1}.{2}".format(azure_path, piece_index, piece_count)
            self._create_blob_from_stream(local_path, start, stop, blob_name)
            updater2(shard_size)
            if piece_index == piece_count - 1:
                # Empty range (stop..stop): a zero-length marker blob.
                # NOTE(review): with a parallel runner this marker may be
                # written before the other pieces finish - confirm readers
                # tolerate that.
                self._create_blob_from_stream(local_path, stop, stop, "{0}/exists.txt".format(azure_path))

        map_reduce(
            range(piece_count),
            mapper=mapper_closure,
            runner=self._get_runner(),
        )

    if do_sync_date:
        self._sync_date(azure_path, local_path)
def _big_files_fileshare_internal(big_size, n, runner, storage, double_it):
    """
    Write ``n`` files of ``big_size`` bytes to ``storage`` and time how long
    it takes to open each work item's neighbouring file for reading.

    Work item ``i`` writes ``big<big_size>.<i>.txt`` and reads
    ``big<big_size>.<(i + 1) % n>.txt``. When ``double_it`` is true the write
    and the read are separate work items (even items write, odd items read),
    so ``2 * n`` items are run.

    :param big_size: Size, in bytes, of each file.
    :param n: Number of files.
    :param runner: Runner passed to ``map_reduce``.
    :param storage: File-cache-like object providing ``file_exists``,
        ``remove``, ``open_write`` and ``open_read``.
    :param double_it: Whether to split writing and reading into separate
        work items.
    :return: List of the transfer rates reported by ``_mbps``, one per read.
    """
    nn = n * 2 if double_it else n

    def mapper(ii):
        # Without doubling, the work index is the file index itself.
        i = ii // 2 if double_it else ii

        if ii % 2 == 0 or not double_it:
            short_name = "big{0}.{1}.txt".format(big_size, i)
            if storage.file_exists(short_name):
                storage.remove(short_name)
            with storage.open_write(short_name) as file_name:
                with open(file_name, "wb") as fp:
                    # Preallocate the local file to its full size: seek to the
                    # last byte and write one byte (bytes, as the mode is binary).
                    if big_size > 0:
                        fp.seek(big_size - 1)
                        fp.write(b"\0")

        if ii % 2 == 1 or not double_it:
            next_name = "big{0}.{1}.txt".format(big_size, (i + 1) % n)
            logging.info("Transferring {0}".format(next_name))
            # Poll up to 50 times with a slowly growing delay (capped at 60s)
            # for the neighbouring work item to finish writing its file.
            sleep_time = 5.0
            for _ in range(50):
                if storage.file_exists(next_name):
                    break
                logging.info("Waiting for '{0}' to exist. Will sleep {1}".format(next_name, sleep_time))
                time.sleep(sleep_time)
                sleep_time = min(60.0, sleep_time * 1.1)
            assert storage.file_exists(next_name), "{0} still doesn't exist".format(next_name)
            t2 = time.time()
            # Only the time needed to make the file available is measured;
            # its contents are not read here.
            with storage.open_read(next_name) as file_name:
                pass
            mbps2 = _mbps(big_size, time.time() - t2)
            logging.info("transfers Mbps={0}".format(mbps2))
            return mbps2
        return None

    mbps_list = map_reduce(
        range(nn),
        mapper=mapper,
        # Write-only work items return None; keep only the measured rates.
        reducer=lambda sequence: [x for x in sequence if x is not None],
        name="big_filename.{0}{1}".format(n, ".x2" if double_it else ""),
        runner=runner)
    return mbps_list
def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G,
                           chrom_index_to_best_sid, pheno, covar,
                           force_full_rank, force_low_rank, mixing, h2,
                           output_file_name, GB_goal, count_A1=None):
    """
    Run ``single_snp`` with two kernels once per chromosome and merge the
    per-chromosome results into one frame sorted by PValue.

    For each chromosome in ``chrom_list`` the test SNPs on that chromosome are
    tested with ``K0`` taken from ``_K_per_chrom(G, test_chr, G.iid).snpreader``
    and ``K1`` restricted to the SNPs listed for that chromosome in
    ``chrom_index_to_best_sid``.

    :param test_snps: SNP reader holding the SNPs to test.
    :param chrom_list: Chromosome values to process; also used to look up the
        position of a chromosome in ``chrom_index_to_best_sid``.
    :param input_files: Passed to ``map_reduce`` as ``input_files``.
    :param runner: Runner passed to ``map_reduce``.
    :param G: SNP reader from which the per-chromosome kernels are built.
    :param chrom_index_to_best_sid: For each index into ``chrom_list``, the
        SNP ids used for ``K1``.
    :param pheno, covar, force_full_rank, force_low_rank, mixing, h2, GB_goal:
        Forwarded unchanged to ``single_snp``.
    :param output_file_name: If not None, the merged frame is written there as
        tab-separated text.
    :param count_A1: Forwarded to ``single_snp``. Added as an optional
        parameter because the body referenced ``count_A1`` without it being
        defined in this function.
    :return: The merged, PValue-sorted data frame.
    """
    logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

    def mapper_single_snp_2K_given_chrom(test_chr):
        logging.info("Working on chr={0}".format(test_chr))
        # Column 0 of .pos is compared against the chromosome value.
        test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
        G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
        chrom_index = chrom_list.index(test_chr)
        best_sid = chrom_index_to_best_sid[chrom_index]

        K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
        result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1, pheno=pheno,
                            covar=covar, leave_out_one_chrom=False,
                            GB_goal=GB_goal, force_full_rank=force_full_rank,
                            force_low_rank=force_low_rank, mixing=mixing, h2=h2,
                            count_A1=count_A1)
        return result

    def reducer_closure(frame_sequence):  #!!!very similar code in single_snp
        frame = pd.concat(frame_sequence)
        frame.sort_values(by="PValue", inplace=True)
        # Renumber the rows 0..len-1 in sorted order.
        frame.index = np.arange(len(frame))
        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)
        logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
        logging.info("SampleSize\t{0}".format(G.iid_count))
        logging.info("SNPCount\t{0}".format(G.sid_count))
        return frame

    frame = map_reduce(chrom_list,
                       mapper=mapper_single_snp_2K_given_chrom,
                       reducer=reducer_closure,
                       input_files=input_files,
                       name="single_snp with two K's for all chroms",
                       runner=runner)
    return frame
def _big_files_slow_down_internal(container, big_size, n, azure_path_list, runner, storage):
    """
    Download files from ``container`` into ``storage`` ``n`` times and report
    the transfer rate of each download.

    Work item ``i`` downloads ``azure_path_list[i % len(azure_path_list)]``
    into the storage file ``big<big_size>.<i>.txt``, replacing any existing
    file of that name.

    :param container: Object whose ``download(azure_path, file_name, updater=...)``
        performs the transfer.
    :param big_size: Size in bytes used for preallocation, progress reporting
        and the rate computation.
    :param n: Number of downloads.
    :param azure_path_list: Source paths, used round-robin.
    :param runner: Runner passed to ``map_reduce``.
    :param storage: File-cache-like object providing ``file_exists``,
        ``remove`` and ``open_write``.
    :return: The result of ``map_reduce`` (no reducer is supplied): the
        per-download values returned by ``_mbps``.
    """

    def mapper(i):
        azure_path = azure_path_list[i % len(azure_path_list)]
        short_name = "big{0}.{1}.txt".format(big_size, i)
        if storage.file_exists(short_name):
            storage.remove(short_name)
        with storage.open_write(short_name, size=big_size) as file_name:
            logging.info("Downloading {0}".format(azure_path))
            with _file_transfer_reporter("Downloading {0}".format(azure_path), big_size) as updater:
                t0 = time.time()
                container.download(azure_path, file_name, updater=updater)
                mbps0 = _mbps(big_size, time.time() - t0)
        return mbps0

    # range (not the Python-2-only xrange) keeps this runnable on Python 3.
    mbps_list = map_reduce(range(n),
                           mapper=mapper,
                           name="big_files_slow_down",
                           runner=runner)
    return mbps_list
def mapper_closure(chrom):
    """
    Write the SNPs of one chromosome to ``storage`` in
    ``piece_per_chrom_count`` pieces and return the result of the per-piece
    ``map_reduce`` (the ``.bed`` piece names).

    ``snpreader``, ``piece_per_chrom_count``, ``storage``, ``count_A1`` and
    ``updater2`` are taken from the enclosing scope.
    """
    chrom_reader = snpreader[:, snpreader.pos[:, 0] == chrom]

    def upload_piece(piece_per_chrom_index):
        sid_total = chrom_reader.sid_count
        start = sid_total * piece_per_chrom_index // piece_per_chrom_count
        stop = sid_total * (piece_per_chrom_index + 1) // piece_per_chrom_count
        piece_reader = chrom_reader[:, start:stop]

        piece_names = []
        for suffix in ['bim', 'fam', 'bed']:
            piece_names.append(
                "chrom{0}.piece{1}of{2}.{3}".format(int(chrom), piece_per_chrom_index,
                                                    piece_per_chrom_count, suffix))
        bed_name = piece_names[-1]
        present = [storage.file_exists(piece_name) for piece_name in piece_names]

        # All three of BIM/FAM/BED already stored: nothing to upload.
        if sum(present) >= 3:
            return bed_name

        # A partial set is stale; drop whatever is there before rewriting.
        for piece_name, is_present in zip(piece_names, present):
            if is_present:
                storage.remove(piece_name)
        _Distributed1Bed.write(bed_name, storage, piece_reader.read(),
                               count_A1=count_A1, updater=updater2)
        return bed_name

    return map_reduce(range(piece_per_chrom_count), mapper=upload_piece)
# Number of unordered pairs of parts, self-pairs included: P*(P+1)/2.
part_pair_count = (part_count*part_count+part_count)//2
# Index of the last pair finished so far; -1 means none yet.
part_pair_index = -1
print("part_pair_count={0:,}".format(part_pair_count))
K0 = SnpKernel(synbed,standardizer=Unit()).read() #Precompute the similarity

start_time = datetime.datetime.now()
for i,part_i in enumerate(part_list):
    def mapper1(j):
        # Row i covers the pairs (i, i), (i, i+1), ...; j starts at i, so the
        # offset within the row is (j - i). Using j alone overstated the
        # position by i for every row after the first.
        print("Looking at pair {0},{1} which is {2} of {3}".format(i,j,part_pair_index+(j-i)+1,part_pair_count))
        # NOTE(review): the per-pair work below is disabled, so this mapper
        # returns None and the reducer's pd.concat receives only None values -
        # confirm whether the disabled code should be restored.
        #from fastlmm.association import single_snp
        #from pysnptools.snpreader import Pairs
        #print('Z')
        #part_j = part_list[j]
        #print('A')
        #pairs = Pairs(part_i) if i==j else Pairs(part_i,part_j)
        #result_df_ij = single_snp(pairs, K0=K0, pheno=pheno_fn, covar=cov_fn, leave_out_one_chrom=False, count_A1=True)
        #print(result_df_ij[:1])
        #return result_df_ij

    result_df_i = map_reduce(range(i,part_count),
                             mapper=mapper1,
                             reducer=lambda result_j_list:pd.concat(result_j_list),
                             runner=runner,
                             name='js')
    # Row i contributed (part_count - i) pairs.
    part_pair_index+=(part_count-i)
    # Linear extrapolation of the total run time from the pairs done so far.
    time_so_far = datetime.datetime.now()-start_time
    total_time_estimate = time_so_far*part_pair_count/(part_pair_index+1)
    print(total_time_estimate)
def mapper_find_best_given_chrom(test_chr):
    """
    For one chromosome, choose how many top SNPs to keep and return them.

    Runs a map_reduce over the items produced by
    ``_kfold(..., end_with_all=True)``. Each item runs ``single_snp`` on its
    training rows and, unless its index equals ``n_folds``, scores a FastLMM
    model for every candidate k. The reducer sums the scores per k, picks the
    k with the smallest sum and returns that many of the top SNPs.

    Reads ``G``, ``k_list``, ``max_k``, ``n_folds``, ``seed``, ``do_plot``,
    ``pheno``, ``covar``, ``mixing``, ``h2``, ``GB_goal``, ``count_A1``,
    ``force_full_rank`` and ``force_low_rank`` from the enclosing scope.

    :param test_chr: Chromosome value passed to ``_K_per_chrom``.
    :return: ``top_snps_all[:best_k]`` as computed by the reducer.
    """
    G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader

    def mapper_gather_lots(i_fold_and_pair):
        """Process one fold item; returns ``(k_list_in, top_snps, k_index_to_nLL)``, any of which may be None."""
        i_fold, (train_idx, test_idx) = i_fold_and_pair
        logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

        G_train = G_for_chrom[train_idx, :]

        #Precompute whole x whole standardized on train
        from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
        min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
        block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
        K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom, train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

        assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
        K_train = K_whole_unittrain[train_idx]

        single_snp_result = single_snp(
            test_snps=G_train,
            K0=K_train,
            pheno=pheno,  #iid intersection means when can give the whole covariate and pheno
            covar=covar,
            leave_out_one_chrom=False,
            GB_goal=GB_goal,
            force_full_rank=force_full_rank,
            force_low_rank=force_low_rank,
            mixing=mixing,
            h2=h2,
            count_A1=count_A1)

        # presumably the item with index n_folds is the trailing "all" item
        # requested via end_with_all=True - TODO confirm against _kfold
        is_all = (i_fold == n_folds) if n_folds > 1 else True

        # Candidate k values: 0 plus every requested k smaller than the number of results.
        k_list_in = [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

        if is_all:
            top_snps = list(single_snp_result.SNP[:max_k])
        else:
            top_snps = None

        if i_fold == n_folds:
            k_index_to_nLL = None
        else:
            k_index_to_nLL = []
            for k in k_list_in:
                # The first k SNPs of the single_snp result form the second kernel.
                top_k = G_for_chrom[:, G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

                top_k_train = top_k[train_idx, :] if k > 0 else None
                fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank, GB_goal=GB_goal)
                fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno, mixing=mixing, h2raw=h2)  #iid intersection means when can give the whole covariate and pheno

                top_k_test = top_k[test_idx, :] if k > 0 else None
                K0_whole_test = K_whole_unittrain[:, test_idx]
                nLL = fastlmm.score(K0_whole_test=K0_whole_test, K1_whole_test=top_k_test, X=covar, y=pheno)  #iid intersection means when can give the whole covariate and pheno
                k_index_to_nLL.append(nLL)

        # Only item 0 reports the k list; the reducer asserts it sees it once.
        if i_fold > 0:
            k_list_in = None

        return k_list_in, top_snps, k_index_to_nLL

    def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
        """Sum the per-item scores for each k and return the top SNPs for the best k."""
        #Starts fold_index+all -> k_index -> nll
        #Need: k_index -> sum(fold_index -> nll)

        k_index_to_sum_nll = None
        top_snps_all = None
        k_list_in_all = None
        for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
            if k_list_in is not None:
                assert k_list_in_all is None, "real assert"
                k_list_in_all = k_list_in
                k_index_to_sum_nll = np.zeros(len(k_list_in))

            if top_snps is not None:
                assert top_snps_all is None, "real assert"
                top_snps_all = top_snps

            if k_index_to_nLL is not None:
                assert i_fold < n_folds or n_folds == 1, "real assert"
                # NOTE(review): relies on the item carrying k_list_in arriving
                # before any scores, so that the accumulator already exists.
                for k_index, nLL in enumerate(k_index_to_nLL):
                    k_index_to_sum_nll[k_index] += nLL

        #find best # top_snps
        best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
        logging.info("For chrom={0}, best_k={1}".format(test_chr, best_k))
        if do_plot:
            _nll_plot(k_list_in_all, k_index_to_sum_nll)

        #Return the top snps from all
        result = top_snps_all[:best_k]
        return result

    i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
        _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
        mapper=mapper_gather_lots,
        reducer=reducer_find_best)
    return i_fold_index_to_top_snps_and_k_index_to_nLL
def single_snp_select(
        test_snps,
        pheno,
        G=None,
        covar=None,
        k_list=None,
        n_folds=10,  #1 is special and means test on train
        just_return_selected_snps=False,
        seed=0,
        output_file_name=None,
        GB_goal=None,
        force_full_rank=False,
        force_low_rank=False,
        h2=None,
        runner=None,
        count_A1=None):
    """
    Function performing single SNP GWAS based on covariates (often PCs) and a similarity matrix constructed of the top *k* SNPs where
    SNPs are ordered via the PValue from :meth:`.single_snp_linreg` and *k* is determined via out-of-sample prediction.
    Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example,
           `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example,
           `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param k_list: Values of *k* (in addition to 0) to test. Default to [1,2,4,8,...8192].
           Values larger than the number of SNPs in G are ignored.
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Default to 10.
           The value 1 is special and means test on train.
    :type n_folds: number

    :param just_return_selected_snps: Instead of returning the results of GWAS, return the top *k* SNPs selected.
    :type just_return_selected_snps: bool

    :param seed: (optional) Random seed passed to the cross-validation fold generator.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
            If not given will search for best value.
    :type h2: number

    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue".
        (If just_return_selected_snps is true, the list of selected SNP names is returned instead.)

    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import example_file # Download and return local file name
    >>> from fastlmm.util import compute_auto_pcs
    >>> bed_fn = example_file("tests/datasets/synth/all.bed")
    >>> phen_fn = example_file("tests/datasets/synth/pheno_10_causals.txt")
    >>> covar = compute_auto_pcs(bed_fn,count_A1=False)
    >>> results_dataframe = single_snp_select(test_snps=bed_fn, G=bed_fn, pheno=phen_fn, covar=covar, GB_goal=2, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe))
    snp495_m0_.01m1_.04 0.0 5000

    """
    # ARRAY_MODULE is forced to 'numpy' in os.environ for the duration of the call.
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        #!!!code similar to single_snp and feature_selection
        if force_full_rank and force_low_rank:
            raise Exception("Can't force both full rank and low rank")

        assert test_snps is not None, "test_snps must be given as input"

        if k_list is None:
            # Powers of two from 2**0 to 2**13.
            k_list = np.logspace(start=0, stop=13, num=14, base=2)

        test_snps, G, pheno, covar = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
        common_input_files = [test_snps, G, pheno, covar]

        # Candidate k values: 0 plus every requested k that does not exceed the SNP count of G.
        k_list_in = [0] + [int(k) for k in k_list if 0 < k <= G.sid_count]

        def top_snps_for_each_fold_nested(kfold_item):
            """Rank SNPs by linear regression on the training rows of one fold item."""
            fold_index, (train_idx, test_idx) = kfold_item
            # _fixup is called again inside the work item (presumably so that it
            # also works when the item runs in another process - TODO confirm).
            _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
            # Only the max(k_list_in) best SNPs are ever needed.
            nested = single_snp_linreg(G_in[train_idx, :],
                                       pheno_in[train_idx, :],
                                       covar_in[train_idx, :],
                                       GB_goal=GB_goal,
                                       max_output_len=max(k_list_in),
                                       count_A1=count_A1)
            return nested

        def top_snps_for_each_fold_reducer(dataframe_list):
            """Keep just the ordered SNP names from each fold's result frame."""
            result = [list(dataframe['SNP']) for dataframe in dataframe_list]
            return result

        #Find top snps for each fold
        # end_with_all=True: the fold sequence ends with an extra item whose
        # SNP ranking is read below via index -1.
        fold_index_to_top_snps = map_reduce(
            _kfold(G.iid_count, n_folds, seed, end_with_all=True, iid_to_index=G.iid_to_index),
            nested=top_snps_for_each_fold_nested,
            reducer=top_snps_for_each_fold_reducer,
            name="top_snps_for_each_fold",
            input_files=common_input_files,
            runner=runner)

        #=================================================
        # Start of definition of inner functions
        #=================================================
        def k_index_to_nLL_mapper(k):
            """Return the score from ``fastlmm.score`` summed over the folds when the top ``k`` SNPs of each fold form the kernel."""
            _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
            nll_sum = 0
            mse_sum = 0
            n_folds_in = 0
            for fold_index, (train_idx, test_idx) in _kfold(G.iid_count, n_folds, seed, end_with_all=False, iid_to_index=G.iid_to_index):
                assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
                top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
                sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
                G_train = G_in[train_idx, sid_idx_in_fold] if k > 0 else None
                fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank, GB_goal=GB_goal)
                fastlmm.fit(K0_train=G_train, X=covar_in[train_idx, :], y=pheno_in[train_idx, :], h2raw=h2)  #iid intersection means when can give the whole covariate and pheno

                G_test = G_in[test_idx, sid_idx_in_fold] if k > 0 else KernelIdentity(G_in.iid, G_in.iid[test_idx])  #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
                nll, mse = fastlmm.score(K0_whole_test=G_test, X=covar_in[test_idx, :], y=pheno_in[test_idx, :], return_mse_too=True)  #iid intersection means when can give the whole covariate and pheno
                nll_sum += nll
                mse_sum += mse
                n_folds_in += 1
            logging.info("k={0},nLL={1},average mse={2}".format(k, nll_sum, mse_sum / n_folds_in))
            return nll_sum
        #=================================================
        # End of definition of inner functions
        #=================================================

        #find best # of top SNPs
        k_index_to_nLL = map_reduce(k_list_in,
                                    mapper=k_index_to_nLL_mapper,
                                    input_files=common_input_files,
                                    name="k_index_to_nLL",
                                    runner=runner)
        # The k with the smallest summed score wins.
        best_k = k_list_in[np.argmin(k_index_to_nLL)]

        # The last entry comes from the trailing "all" item of the fold sequence.
        top_snps = fold_index_to_top_snps[-1][:best_k]
        if just_return_selected_snps:
            return top_snps

        sid_idx = G.sid_to_index(top_snps)
        G_top = G[:, sid_idx]

        # Run GWAS with leave-one-chrom out
        single_snp_result = single_snp(test_snps=test_snps, K0=G_top, pheno=pheno,
                                       covar=covar, leave_out_one_chrom=True,
                                       GB_goal=GB_goal,
                                       force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                       h2=h2,
                                       output_file_name=output_file_name, runner=runner, count_A1=count_A1)

        return single_snp_result
def single_snp_linreg(test_snps, pheno, covar=None, max_output_len=None, output_file_name=None, GB_goal=None, runner=None, count_A1=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example,
           `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example,
           `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"

    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import example_file # Download and return local file name
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = example_file("fastlmm/feature_selection/examples/toydata.phe")
    >>> test_snps = example_file("fastlmm/feature_selection/examples/toydata.5chrom.*","*.bed")
    >>> results_dataframe = single_snp_linreg(test_snps=test_snps, pheno=pheno_fn, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe))
    null_576 1e-07 10000

    """
    # ARRAY_MODULE is forced to 'numpy' in os.environ for the duration of the call.
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        assert test_snps is not None, "test_snps must be given as input"
        test_snps = _snps_fixup(test_snps, count_A1=count_A1)
        pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
        assert pheno.sid_count == 1, "Expect pheno to be just one variable"
        # NaN is the only value not equal to itself, so this keeps exactly the
        # rows whose phenotype value is present.
        pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
        covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
        test_snps, pheno, covar = pstutil.intersect_apply([test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(test_snps.iid_count))

        if GB_goal is not None:
            # 8 bytes per value (presumably float64) for one SNP column.
            bytes_per_sid = test_snps.iid_count * 8
            sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
            block_size = max(1, int(sid_per_GB_goal + .5))
            # Ceiling division: the number of blocks the range() below produces.
            block_count = (test_snps.sid_count + block_size - 1) // block_size
        else:
            block_count = 1
            block_size = test_snps.sid_count
        logging.debug("block_count={0}, block_size={1}".format(block_count, block_size))

        #!!!what about missing data in covar, in test_snps, in y
        # A column of ones is appended to the covariates (the intercept term).
        covar = np.c_[covar.read(view_ok=True, order='A').val,
                      np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocation new memory
        y = pheno.read(view_ok=True, order='A').val  #view_ok because this code already did a fresh read to look for any missing values

        def mapper(start):
            """Compute p-values for the block of SNPs beginning at column ``start``."""
            logging.info("single_snp_linereg reading start={0},block_size={1}".format(start, block_size))
            snp_index = np.arange(start, min(start + block_size, test_snps.sid_count))
            x = test_snps[:, start:start + block_size].read().standardize().val
            logging.info("single_snp_linereg linreg")
            _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar)
            logging.info("single_snp_linereg done")
            pval_in = pval_in.reshape(-1)

            if max_output_len is None:
                return pval_in, snp_index
            else:  #We only need to return the top max_output_len results
                sort_index = np.argsort(pval_in)[:max_output_len]
                return pval_in[sort_index], snp_index[sort_index]

        def reducer(pval_and_snp_index_sequence):
            """Merge the per-block results into one frame sorted by p-value."""
            pval_list = []
            snp_index_list = []
            for pval, snp_index in pval_and_snp_index_sequence:
                pval_list.append(pval)
                snp_index_list.append(snp_index)
            pval = np.concatenate(pval_list)
            snp_index = np.concatenate(snp_index_list)
            sort_index = np.argsort(pval)
            if max_output_len is not None:
                sort_index = sort_index[:max_output_len]
            index = snp_index[sort_index]

            dataframe = pd.DataFrame(index=np.arange(len(index)),
                                     columns=('sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue'))
            #!!Is this the only way to set types in a dataframe?
            # The builtin float replaces np.float, an alias for it that has
            # been removed from NumPy.
            dataframe['sid_index'] = dataframe['sid_index'].astype(float)
            dataframe['Chr'] = dataframe['Chr'].astype(float)
            dataframe['GenDist'] = dataframe['GenDist'].astype(float)
            dataframe['ChrPos'] = dataframe['ChrPos'].astype(float)
            dataframe['PValue'] = dataframe['PValue'].astype(float)

            dataframe['sid_index'] = index
            dataframe['SNP'] = np.array(test_snps.sid[index], dtype='str')  #This will be ascii on Python2 and unicode on Python3
            dataframe['Chr'] = test_snps.pos[index, 0]
            dataframe['GenDist'] = test_snps.pos[index, 1]
            dataframe['ChrPos'] = test_snps.pos[index, 2]
            dataframe['PValue'] = pval[sort_index]

            if output_file_name is not None:
                dataframe.to_csv(output_file_name, sep="\t", index=False)

            return dataframe

        dataframe = map_reduce(range(0, test_snps.sid_count, block_size),
                               mapper=mapper,
                               reducer=reducer,
                               input_files=[test_snps, pheno, covar],
                               output_files=[output_file_name],
                               name="single_snp_linreg",
                               runner=runner)
        return dataframe
def download(self, azure_path, local_path, do_sync_date=True, as_needed=True, updater=None):
    #!!!perhaps should download to a tmp file and then rename after everything works.
    """
    Download a file from the container.

    The blobs stored under ``azure_path`` are written, one work item per
    blob, into their byte ranges of a preallocated temporary file
    (``local_path + ".temp"``), which is renamed to ``local_path`` at the end.

    :param azure_path: Path of the file in the container.
    :param local_path: Destination path on the local file system. Missing
        parent directories are created.
    :param do_sync_date: If true, call ``self._sync_date`` on the temporary
        file before it is renamed.
    :param as_needed: If true, return without downloading when
        ``self._download_needed_and_ready(local_path, azure_path)`` is false.
    :param updater: Optional progress callback forwarded to
        ``_file_transfer_reporter``, a context manager that is initialized
        with a size and yields an updater to be called with a byte count as
        the download progresses.
    """
    #self._run_once()
    if as_needed and not self._download_needed_and_ready(local_path, azure_path):
        return

    blob_list = self._find_blobs_and_check(azure_path)
    piece_count = len(blob_list)

    # Consecutive byte ranges, one per blob, in blob_list order.
    start_stop_pairs = []
    start = 0
    for _, _, blob in blob_list:
        stop = start + blob.properties.content_length
        start_stop_pairs.append((start, stop))
        start = stop
    size = start_stop_pairs[-1][1]  # The size is the last stop value

    pstutil.create_directory_if_necessary(local_path, isfile=True)
    local_path_temp = local_path + ".temp"  #!!! give it a unique name to ensure that it can't collide with a user's name.
    with open(local_path_temp, "wb") as fp:
        # Preallocate the local file to its full size: seek to the last byte
        # and write one byte (bytes, because the file is opened in binary mode).
        if size > 0:
            fp.seek(size - 1)
            fp.write(b"\0")

    # The yielded callback gets its own name so it does not shadow the
    # 'updater' parameter.
    with _file_transfer_reporter("download", size, updater=updater) as updater2:

        def mapper_closure(piece_index):
            blobetc = blob_list[piece_index]
            start, stop = start_stop_pairs[piece_index]
            logging.debug("\tDownloading {0}/{4} {1}-{2} in '{3}'".format(
                piece_index, start, stop, local_path, piece_count))
            self._get_blobetc_to_stream(blobetc, local_path_temp, start, stop)
            updater2(stop - start)

        # Timestamp plus a random number, presumably to keep the names of
        # concurrent downloads distinct.
        name = "download." + os.path.basename(local_path) + datetime.datetime.utcnow().strftime(
            "%Y%m%d-%H%M%S") + str(random.random())
        map_reduce(
            range(piece_count),
            mapper=mapper_closure,
            name=name,
            runner=self._get_runner(),
        )

    if do_sync_date:
        self._sync_date(azure_path, local_path_temp)
    self._rename_no_matter_what(local_path_temp, local_path)
def single_snp(test_snps, pheno, K0=None,
               K1=None, mixing=None,
               covar=None, covar_by_chrom=None, leave_out_one_chrom=True,
               output_file_name=None, h2=None, log_delta=None,
               cache_file = None, GB_goal=None, interact_with_snp=None,
               force_full_rank=False, force_low_rank=False,
               G0=None, G1=None, runner=None, count_A1=None):
    r"""
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example,
           `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formated file name.)
    :type K0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formated file name.)
    :type K1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param covar_by_chrom: optional mapping whose values are listed as input files and which is handed,
           together with covar and the chromosome, to the per-chromosome covariate helper.
           (Presumably keyed by chromosome number -- confirm against that helper.)
           Must be None when leave_out_one_chrom is False.
    :type covar_by_chrom: a dictionary-like object or None

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with.
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param runner: a `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"

    :Example:

    >>> import logging
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000

    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    # NaN != NaN, so comparing val with itself keeps only the rows with a non-missing phenotype.
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        # One model for all test SNPs: no cross validation over chromosomes.
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K0, K1, test_snps, pheno, covar = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame = _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                 covar=covar, K1=K1,
                                 mixing=mixing, h2=h2, log_delta=log_delta,
                                 cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                 output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                 runner=runner)
        # Sanity check: every test SNP index must appear in the result.
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(chrom_list).any(), "chrom list should not contain NaN"
        # list(...) is required because on Python 3 dict.values() is a view,
        # and a list cannot be concatenated with a view.
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else list(covar_by_chrom.values()))

        def nested_closure(chrom):
            # Test the SNPs of this chromosome against a model built by the per-chromosome kernel helper.
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            # The per-chromosome work runs with a Local() runner and writes no file;
            # the outer map_reduce uses the caller's runner and the reducer writes the output.
            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                             covar=covar_chrom, K1=K1_chrom,
                                             mixing=mixing, h2=h2, log_delta=log_delta, cache_file=cache_file_chrom,
                                             force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                             output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                             runner=Local())

            return distributable

        def reducer_closure(frame_sequence):
            # Merge the per-chromosome frames into one frame sorted by p-value.
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                           mapper = nested_closure,
                           reducer = reducer_closure,
                           input_files = input_files,
                           output_files = [output_file_name],
                           name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                           runner = runner)

    return frame
def _internal_single(K0, test_snps, pheno, covar, K1,
                     mixing, h2, log_delta,
                     cache_file, force_full_rank, force_low_rank,
                     output_file_name, block_size, interact_with_snp, runner):
    """
    Fit (or load from cache) one LMM and test every SNP in test_snps against it.

    The test SNPs are processed in ``work_count`` blocks via ``map_reduce``;
    each block yields a dataframe and the reducer concatenates them.

    :param K0: first kernel; must not be None.
    :param test_snps: reader of the SNPs to test (sliced by column and read per block).
    :param pheno: phenotype reader; its ``val`` is used as y.
    :param covar: covariate reader; a column of ones is appended to its ``val``.
    :param K1: second kernel; must not be None.
    :param mixing: None, or a number in [0.0, 1.0]. Overwritten by the cached value when a cache file is read.
    :param h2: None, or the h2 to use. May not be given together with log_delta.
    :param log_delta: None, or a value converted to h2 as 1.0/(exp(log_delta)+1).
    :param cache_file: None, or the name of an np.savez file holding U, S and [h2, mixing].
        It is read if it exists, and written otherwise.
    :param force_full_rank: forwarded to the kernel-combining step. Cannot be True together with force_low_rank.
    :param force_low_rank: forwarded to the kernel-combining step. Cannot be True together with force_full_rank.
    :param output_file_name: None, or the name of a tab-delimited file to write the result to.
    :param block_size: divisor used to derive the number of work items from the SNP count (rounding up).
    :param interact_with_snp: None, or the index of a covariate column (the appended ones column is excluded)
        that is mean-centered, scaled by its standard deviation, and multiplied into each SNP before testing.
    :param runner: runner handed to ``map_reduce``.
    :returns: pandas dataframe, sorted by "PValue" and re-indexed 0..n-1.
    :raises Exception: if both force_full_rank and force_low_rank are true.
    """
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory

    y = pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values

    if cache_file is not None and os.path.exists(cache_file):
        # Reuse the cached decomposition together with the h2 and mixing it was computed with.
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            #print(covar.sum(),y.sum(),K.val.sum(),covar[0],y[0],K.val[0,0])
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        # The last column of covar is the appended column of ones, so it is not a valid choice.
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -=interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up)

    # We define three closures, that is, functions defined inside a function so that the inner function has access to the local variables of the outer function.
    def debatch_closure(work_index):
        # First SNP index of work item `work_index`; work_count maps to sid_count.
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1:
            logging.info("single_snp: Working on snp block {0} of {1}".format(work_index,work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']

        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-(lmm.linreg.D+1))[:,0]#note that G.shape is the number of individuals#

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start,end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2]
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            # Same helper, reached the same way, as for the cache file above.
            pstutil.create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    # range (not the Python-2-only xrange) so the work list can be built on Python 2 and 3 alike.
    frame = map_reduce(range(work_count),
                       mapper=mapper_closure,reducer=reducer_closure,
                       input_files=[test_snps],output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
def write(storage, snpreader, piece_per_chrom_count=1, updater=None, runner=None):
    #!!! might want to set pieces_per_chrom such that it is a certain size
    '''
    Copies any :class:`.Bed`-like data into cluster storage, in pieces, so that it can be retrieved efficiently later.
    Pieces whose files are already all present in storage are not uploaded again. (Clear the storage to force a full upload.)

    :param storage: Where the SNP data goes. A string is taken to be the path of a local directory to use for storage.
            (That directory is **not** erased automatically; the user must manage it.)
            A :class:`.FileCache` instance may be given instead, which offers a way to name cluster-distributed storage.
            (A :class:`.FileCache` is likewise **not** erased automatically and must be user managed.)
            If `None`, an automatically-erasing temporary directory is used. (When the TEMP environment variable is set,
            Python places the temp directory under it.)
    :type storage: string or :class:`.FileCache` or None.

    :param snpreader: A :class:`.Bed` or other :class:`.SnpReader` with values of 0,1,2, or missing.
            (Unlike most other `write` methods, this one does not take a :class:`.SnpData`.)
    :type snpreader: :class:`.SnpReader`

    :param piece_per_chrom_count: How many pieces each chromosome's data is split into. The split is across SNPs.
            For example, with `piece_per_chrom_count` set to 100 and 22 chromosomes uploaded, the data is stored in 2200 pieces.
            When data is requested later, only the pieces needed for that request are downloaded to local storage.
    :type piece_per_chrom_count: A number

    :param updater: A single-argument function that receives logging messages, for example, the function
            created by :func:`.log_in_place`.
    :type updater: A function or lambda

    :param runner: a :class:`.Runner`, optional: Tells how to run.
            (Note that :class:`.Local` and :class:`.LocalMultiProc` are good options.)
            If not given, the function is run locally.
    :type runner: :class:`.Runner`

    :rtype: DistributedBed

    >>> from pysnptools.snpreader import DistributedBed, Bed
    >>> import shutil
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> directory = 'tempdir/toydataSkip10.distributedbed'
    >>> if os.path.exists(directory):
    ...     shutil.rmtree(directory)
    >>> bedfile = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
    >>> snpreader = Bed(bedfile,count_A1=False)[:,::10]  # Read every 10 snps from Bed format
    >>> DistributedBed.write(directory,snpreader,piece_per_chrom_count=5) # Write data in DistributedBed format
    DistributedBed(LocalCache('tempdir/toydataSkip10.distributedbed'))

    '''
    from pysnptools.util import _file_transfer_reporter
    from pysnptools.util.filecache import FileCache

    # Use one allele-counting convention for both reading and writing so that nothing changes on a round trip.
    count_A1 = True
    snpreader = _snps_fixup(snpreader, count_A1=count_A1)
    storage = FileCache._fixup(storage)

    chrom_set = sorted(set(snpreader.pos[:, 0]))
    for chrom in chrom_set:
        # chrom == chrom is false only for NaN; the second test rejects fractional values.
        assert chrom == chrom and chrom == int(chrom), \
            "DistributedBed.write expects all chromosomes to be integers (not '{0}')".format(chrom)

    with _file_transfer_reporter("DistributedBed.write", size=0, updater=updater) as updater2:

        def mapper_closure(chrom):
            # All pieces of one chromosome; returns the list of their .bed names.
            chrom_reader = snpreader[:, snpreader.pos[:, 0] == chrom]

            def nested_closure(piece_per_chrom_index):
                sid_count = chrom_reader.sid_count
                start = sid_count * piece_per_chrom_index // piece_per_chrom_count
                stop = sid_count * (piece_per_chrom_index + 1) // piece_per_chrom_count
                piece_reader = chrom_reader[:, start:stop]

                piece_names = []
                for suffix in ['bim', 'fam', 'bed']:
                    piece_names.append(
                        "chrom{0}.piece{1}of{2}.{3}".format(
                            int(chrom), piece_per_chrom_index, piece_per_chrom_count, suffix))
                bed_name = piece_names[-1]
                present = [storage.file_exists(piece_name) for piece_name in piece_names]

                # When all three of the BIM/FAM/BED files are already there, skip the upload.
                if sum(present) >= 3:
                    return bed_name

                # Only one or two of BIM/FAM/BED are there: remove the leftovers, then upload afresh.
                for piece_name, is_present in zip(piece_names, present):
                    if is_present:
                        storage.remove(piece_name)
                _Distributed1Bed.write(bed_name, storage, piece_reader.read(),
                                       count_A1=count_A1, updater=updater2)
                return bed_name

            return map_reduce(
                range(piece_per_chrom_count),
                mapper=nested_closure,
            )

        list_list_pair = map_reduce(
            chrom_set,
            nested=mapper_closure,
            runner=runner,
        )

    # Flatten the per-chromosome name lists, keeping their order, and make one reader per piece.
    reader_name_list = [piece_name
                        for chrom_result in list_list_pair
                        for piece_name in chrom_result]
    reader_list = [_Distributed1Bed(piece_name, storage) for piece_name in reader_name_list]

    _metadatanpz = "metadata.npz"
    _reader_name_listnpz = "reader_name_list.npz"
    with storage.open_write(_metadatanpz) as local_metadatanpz:
        with storage.open_write(_reader_name_listnpz) as local_reader_name_listnpz:
            np.savez(local_reader_name_listnpz,
                     reader_name_list=np.array(reader_name_list, dtype='S'))
            # Remove any existing local metadata file before _MergeSIDs is given it as its cache file.
            if os.path.exists(local_metadatanpz):
                os.remove(local_metadatanpz)
            _MergeSIDs(reader_list, cache_file=local_metadatanpz, skip_check=True)

    return DistributedBed(storage)