# NOTE: this function depends on several module-level names from the
# surrounding file: Redis (a redis.Redis() connection), Bench (a timing
# context manager), chunks (the chunking utility), LMM, and
# human_association. Only the stdlib/numpy imports it uses directly are
# added here.
import datetime
import gzip
import pickle
import uuid
import zlib
from pprint import pformat as pf

import numpy as np


def run_human(pheno_vector,
              covariate_matrix,
              plink_input_file,
              kinship_matrix,
              refit=False,
              loading_progress=None):

    # Mask of individuals with missing phenotypes; keep the rest
    v = np.isnan(pheno_vector)
    keep = ~v  # was "True - v", which modern numpy rejects for bool arrays
    keep = keep.reshape((len(keep),))

    identifier = str(uuid.uuid4())

    print("pheno_vector: ", pf(pheno_vector))
    print("kinship_matrix: ", pf(kinship_matrix))
    print("kinship_matrix.shape: ", pf(kinship_matrix.shape))

    # Stash the model inputs in Redis (with a one-hour expiry) so that
    # worker processes can fetch them by identifier
    lmm_vars = pickle.dumps(dict(
        pheno_vector=pheno_vector,
        covariate_matrix=covariate_matrix,
        kinship_matrix=kinship_matrix
    ))
    Redis.hset(identifier, "lmm_vars", lmm_vars)
    Redis.expire(identifier, 60 * 60)

    # Drop individuals with missing phenotypes from every input
    if v.sum():
        pheno_vector = pheno_vector[keep]
        covariate_matrix = covariate_matrix[keep, :]
        kinship_matrix = kinship_matrix[keep, :][:, keep]

    n = kinship_matrix.shape[0]

    lmm_ob = LMM(pheno_vector,
                 kinship_matrix,
                 covariate_matrix)
    lmm_ob.fit()

    # Buffers for p-values and t-stats
    p_values = []
    t_stats = []

    with Bench("Opening and loading pickle file"):
        with gzip.open(plink_input_file, "rb") as input_file:
            data = pickle.load(input_file)

    plink_input = data['plink_input']

    with Bench("Calculating numSNPs"):
        total_snps = data['numSNPs']

    with Bench("snp iterator loop"):
        count = 0

        with Bench("Create list of inputs"):
            inputs = list(plink_input)
            print("len(genotypes): ", len(inputs))

        with Bench("Divide into chunks"):
            results = chunks.divide_into_chunks(inputs, 64)

        result_store = []

        key = "plink_inputs"

        # Todo: Delete below line when done testing
        Redis.delete(key)

        timestamp = datetime.datetime.utcnow().isoformat()

        # Queue each chunk as a zlib-compressed pickle for the workers
        for part, result in enumerate(results):
            holder = pickle.dumps(dict(
                identifier=identifier,
                part=part,
                timestamp=timestamp,
                result=result
            ), pickle.HIGHEST_PROTOCOL)
            Redis.rpush(key, zlib.compress(holder))

        # Iterate over the materialized list; the plink_input iterator
        # was already exhausted by list(plink_input) above
        for snp, this_id in inputs:
            count += 1
            percent_complete = (float(count) / total_snps) * 100
            if loading_progress:  # default is None, so guard the call
                loading_progress.store("percent_complete", percent_complete)

            ps, ts = human_association(snp,
                                       n,
                                       keep,
                                       lmm_ob,
                                       pheno_vector,
                                       covariate_matrix,
                                       kinship_matrix,
                                       refit)

            p_values.append(ps)
            t_stats.append(ts)

    return p_values, t_stats
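# A minimal worker-side sketch, not part of the original file: run_human
# pushes zlib-compressed pickles onto the "plink_inputs" Redis list, so a
# consumer might pop and decode one chunk like this. The function name
# fetch_one_chunk and the redis_conn parameter are hypothetical.
def fetch_one_chunk(redis_conn, key="plink_inputs"):
    """Pop one chunk pushed by run_human and return its decoded payload."""
    raw = redis_conn.lpop(key)  # FIFO counterpart of the rpush above
    if raw is None:
        return None  # queue drained
    payload = pickle.loads(zlib.decompress(raw))
    # payload holds: identifier, part, timestamp, and result (a SNP chunk)
    return payload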
# NOTE: this method relies on module-level helpers from the surrounding
# file: g (the Flask database context), escape, mescape,
# create_in_clause, and chunks. The stdlib imports it needs are added here.
import collections
import math


def get_trait_data(self, sample_list=None):
    if sample_list:
        self.samplelist = sample_list
    else:
        self.samplelist = self.group.samplelist

    # Add parent and F1 samples that aren't already in the sample list.
    # (The original test "(parlist + f1list) in samplelist" compared the
    # concatenated list against individual elements and was never true.)
    for sample in self.group.parlist + self.group.f1list:
        if sample not in self.samplelist:
            self.samplelist.append(sample)

    query = """
        SELECT Strain.Name, Strain.Id FROM Strain, Species
        WHERE Strain.Name IN {}
        and Strain.SpeciesId=Species.Id
        and Species.name = '{}'
        """.format(create_in_clause(self.samplelist),
                   *mescape(self.group.species))
    results = dict(g.db.execute(query).fetchall())
    sample_ids = [results[item] for item in self.samplelist]

    # MySQL limits the number of tables that can be used in a join to 61,
    # so we break the sample ids into smaller chunks
    # Postgres doesn't have that limit, so we can get rid of this after we transition
    chunk_size = 50
    number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
    trait_sample_data = []
    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
        # XZ, 09/24/2008: build one temporary table that only contains
        # the records associated with the input GeneId
        #tempTable = None
        #if GeneId and db.type == "ProbeSet":
        #    if method == "3":
        #        tempTable = self.getTempLiteratureTable(species=species,
        #                                                input_species_geneid=GeneId,
        #                                                returnNumber=returnNumber)
        #
        #    if method == "4" or method == "5":
        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
        #                                                TissueProbeSetFreezeId=tissueProbeSetFreezeId,
        #                                                method=method,
        #                                                returnNumber=returnNumber)

        if self.type == "Publish":
            dataset_type = "Phenotype"
        else:
            dataset_type = self.type
        temp = ['T%s.value' % item for item in sample_ids_step]
        if self.type == "Publish":
            query = "SELECT {}XRef.Id,".format(escape(self.type))
        else:
            query = "SELECT {}.Name,".format(escape(dataset_type))
        data_start_pos = 1
        query += ', '.join(temp)  # was string.join(temp, ', '); removed in Python 3
        query += ' FROM ({}, {}XRef, {}Freeze) '.format(
            *mescape(dataset_type, self.type, self.type))

        for item in sample_ids_step:
            query += """
                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
                    and T{}.StrainId={}\n
                    """.format(*mescape(self.type, item, item,
                                        self.type, item, item))

        if self.type == "Publish":
            query += """
                    WHERE {}XRef.PublicationId = {}Freeze.Id
                    and {}Freeze.Name = '{}'
                    and {}.Id = {}XRef.{}Id
                    order by {}.Id
                    """.format(*mescape(self.type, self.type, self.type,
                                        self.name, dataset_type, self.type,
                                        dataset_type, dataset_type))
        else:
            query += """
                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
                    and {}Freeze.Name = '{}'
                    and {}.Id = {}XRef.{}Id
                    order by {}.Id
                    """.format(*mescape(self.type, self.type, self.type,
                                        self.type, self.name, dataset_type,
                                        self.type, self.type, dataset_type))

        results = g.db.execute(query).fetchall()
        trait_sample_data.append(results)

    trait_count = len(trait_sample_data[0])
    self.trait_data = collections.defaultdict(list)

    # put all of the separate data together into a dictionary where the
    # keys are trait names and values are lists of sample values
    for trait_counter in range(trait_count):
        trait_name = trait_sample_data[0][trait_counter][0]
        for chunk_counter in range(number_chunks):
            self.trait_data[trait_name] += (
                trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
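# A self-contained sketch, not part of the original file, of the merge
# step at the end of get_trait_data: chunk k holds rows of the form
# (trait_name, value, value, ...) for the k-th slice of sample ids, and
# the value slices (from data_start_pos onward) are concatenated per
# trait. merge_chunked_rows is a hypothetical name for illustration.
def merge_chunked_rows(chunked_rows, data_start_pos=1):
    """Rebuild {trait_name: [values...]} from per-chunk result rows."""
    merged = collections.defaultdict(list)
    for chunk in chunked_rows:
        for row in chunk:
            merged[row[0]] += list(row[data_start_pos:])
    return merged

# Example: two chunks covering four samples of two traits gives
# {"traitA": [1, 2, 3, 4], "traitB": [5, 6, 7, 8]}:
#   merge_chunked_rows([[("traitA", 1, 2), ("traitB", 5, 6)],
#                       [("traitA", 3, 4), ("traitB", 7, 8)]])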