def do_correlation(start_vars):
    """Run a correlation for one trait and return the leading results.

    ``start_vars`` is a mapping that must contain ``'db'`` (dataset of the
    trait), ``'target_db'`` (dataset to correlate against) and ``'trait_id'``.
    It is also handed to ``init_corr_params`` to build the correlation
    parameters.

    Returns a list of dicts, one per trait, for the first
    ``corr_params['return_count']`` keys of the mapping returned by
    ``calculate_results``. The keys of each dict depend on
    ``corr_params['type']``:

    * ``"tissue"``: trait, sample_r, #_strains, p_value, symbol
    * ``"literature"`` / ``"lit"``: trait, sample_r, gene_id
    * anything else: trait, sample_r, #_strains, p_value

    Raises ``AssertionError`` when a required key is missing.
    """
    # Kept as asserts so callers keep seeing AssertionError for a missing key.
    assert ('db' in start_vars)
    assert ('target_db' in start_vars)
    assert ('trait_id' in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = GeneralTrait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)

    corr_params = init_corr_params(start_vars)
    corr_results = calculate_results(this_trait, this_dataset,
                                     target_dataset, corr_params)

    # Results are reported in the order calculate_results yields them; no
    # re-sorting happens here.
    # dict.keys() returns a view on Python 3 and a view cannot be sliced, so
    # the keys are copied into a list before taking the leading entries.
    selected_traits = list(corr_results.keys())[:corr_params['return_count']]

    final_results = []
    for trait in selected_traits:
        if corr_params['type'] == "tissue":
            [sample_r, num_overlap, sample_p, symbol] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] == "literature" or corr_params[
                'type'] == "lit":
            [gene_id, sample_r] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            [sample_r, sample_p, num_overlap] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(result_dict)

    return final_results
def fetch_sample_data(start_vars, this_trait, this_dataset, target_dataset):
    """Collect the primary trait's sample data and the target dataset's values.

    The samples of ``this_dataset``'s group are run through
    ``process_samples`` together with ``start_vars``; the keys of that result
    are the samples requested from ``target_dataset.get_trait_data``.

    Returns a 2-tuple:

    * a dict with ``"trait_sample_data"`` (the processed samples) and
      ``"trait_id"`` (``start_vars["trait_id"]``);
    * the value of ``map_shared_keys_to_values`` applied to the target
      dataset's ``samplelist`` and ``trait_data``.
    """
    ordered_samples = this_dataset.group.all_samples_ordered()
    processed_samples = process_samples(start_vars, ordered_samples)

    requested_sample_names = list(processed_samples.keys())
    target_dataset.get_trait_data(requested_sample_names)

    # The return value is not used below; the call is kept at this position
    # for whatever it does to the trait it is given.
    retrieve_sample_data(this_trait, this_dataset)

    primary_trait_info = {
        "trait_sample_data": processed_samples,
        "trait_id": start_vars["trait_id"],
    }
    shared_values = map_shared_keys_to_values(
        target_dataset.samplelist,
        target_dataset.trait_data,
    )
    return primary_trait_info, shared_values
def do_correlation(start_vars):
    """Correlate one trait against a target dataset and list the top hits.

    ``start_vars`` must provide ``'db'``, ``'target_db'`` and ``'trait_id'``
    (checked with asserts) and is also passed to ``init_corr_params``.

    Returns a list with one dict per trait, limited to the first
    ``corr_params['return_count']`` keys of the ``calculate_results`` output.
    Tissue results carry trait, sample_r, #_strains, p_value and symbol;
    literature results carry trait, sample_r and gene_id; every other type
    carries trait, sample_r, #_strains and p_value.
    """
    for required_key in ('db', 'target_db', 'trait_id'):
        assert required_key in start_vars

    source_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])

    source_trait = retrieve_sample_data(
        create_trait(dataset=source_dataset, name=start_vars['trait_id']),
        source_dataset)

    corr_params = init_corr_params(start_vars)
    corr_results = calculate_results(
        source_trait, source_dataset, target_dataset, corr_params)

    leading_traits = list(corr_results.keys())[:corr_params['return_count']]

    final_results = []
    for trait in leading_traits:
        corr_type = corr_params['type']
        values = corr_results[trait]

        if corr_type == "tissue":
            sample_r, num_overlap, sample_p, symbol = values
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol,
            }
        elif corr_type in ("literature", "lit"):
            gene_id, sample_r = values
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id,
            }
        else:
            sample_r, sample_p, num_overlap = values
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
            }

        final_results.append(entry)

    return final_results
def run_analysis(self, requestform):
    """Run a CTL scan for the traits named in ``requestform`` and store the output.

    Reads from ``requestform``: ``trait_list`` (comma separated entries whose
    first two ``:``-separated fields are used as trait name and dataset
    name), ``strategy``, ``nperm``, ``parametric`` and ``significance``.

    Sets on ``self``:
      * ``trait_db_list`` - the non-empty, stripped entries of ``trait_list``
      * ``results`` - dict with ``imgurlN`` / ``imglocN`` entries for the
        generated PNG files, ``ctlresult`` (the significant interactions) and
        ``requestform``
      * ``elements`` - JSON dump of ``self.nodes_list + self.edges_list``

    Progress and debug information is written with ``print``.
    """
    print("Starting CTL analysis on dataset")
    self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')]
    # Drop empty entries (e.g. produced by a trailing comma).
    self.trait_db_list = [x for x in self.trait_db_list if x]
    print("strategy:", requestform.get("strategy"))
    strategy = requestform.get("strategy")
    print("nperm:", requestform.get("nperm"))
    # int()/float() below raise if the corresponding form field is missing.
    nperm = int(requestform.get("nperm"))
    print("parametric:", requestform.get("parametric"))
    # NOTE(review): bool() of any non-empty string is True, so a form value
    # such as "False" would enable parametric mode - confirm what the form sends.
    parametric = bool(requestform.get("parametric"))
    print("significance:", requestform.get("significance"))
    significance = float(requestform.get("significance"))

    # Get the name of the .geno file belonging to the first phenotype
    datasetname = self.trait_db_list[0].split(":")[1]
    dataset = data_set.create_dataset(datasetname)
    genofilelocation = locate(dataset.group.name + ".geno", "genotype")
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    print(dataset.group)

    # Create a genotype matrix
    individuals = parser.individuals
    markers = []
    markernames = []
    for marker in parser.markers:
        markernames.append(marker["name"])
        markers.append(marker["genotypes"])

    # Flatten the per-marker genotype lists into one list, marker after marker.
    genotypes = list(itertools.chain(*markers))
    # Sanity output: genotype values per individual vs. number of markers.
    print(len(genotypes) / len(individuals), "==", len(parser.markers))

    # The matrix is filled row by row (one row per marker) and then passed
    # through r_t (presumably R's transpose - TODO confirm).
    rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True))

    # Create a phenotype matrix
    traits = []
    for trait in self.trait_db_list:
        print("retrieving data for", trait)
        # Always true here: empty entries were already filtered out above.
        if trait != "":
            ts = trait.split(':')
            gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1])
            gt = TRAIT.retrieve_sample_data(gt, dataset, individuals)
            for ind in individuals:
                if ind in gt.data.keys():
                    traits.append(gt.data[ind].value)
                else:
                    # Placeholder for individuals without a value for this
                    # trait (presumably the missing-value code - TODO confirm).
                    traits.append("-999")

    rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True))
    print(rPheno)

    # Use a data frame to store the objects
    rPheno = r_data_frame(rPheno, check_names = False)
    rGeno = r_data_frame(rGeno, check_names = False)

    # Debug: Print the genotype and phenotype files to disk
    #r_write_table(rGeno, "~/outputGN/geno.csv")
    #r_write_table(rPheno, "~/outputGN/pheno.csv")

    # Perform the CTL scan
    # ncores is hard-coded to 6 here.
    res = self.r_CTLscan(rGeno, rPheno, strategy = strategy, nperm = nperm, parametric = parametric, ncores = 6)

    # Get significant interactions
    significant = self.r_CTLsignificant(res, significance = significance)

    # Create an image for output
    self.results = {}
    self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
    self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

    self.results['ctlresult'] = significant
    self.results['requestform'] = requestform # Store the user specified parameters for the output page

    # Create the lineplot
    # Open the PNG device, draw, then close the device again.
    r_png(self.results['imgloc1'], width=1000, height=600, type='cairo-png')
    self.r_lineplot(res, significance = significance)
    r_dev_off()

    # Image 1 is the line plot above, so the per-trait images are numbered
    # from 2; (n-1) is the index handed to r_plotCTLobject for each trait.
    n = 2 # We start from 2, since R starts from 1 :)
    for trait in self.trait_db_list:
        # Create the QTL like CTL plots
        self.results['imgurl' + str(n)] = webqtlUtil.genRandStr("CTL_") + ".png"
        self.results['imgloc' + str(n)] = GENERATED_IMAGE_DIR + self.results['imgurl' + str(n)]
        r_png(self.results['imgloc' + str(n)], width=1000, height=600, type='cairo-png')
        self.r_plotCTLobject(res, (n-1), significance = significance, main='Phenotype ' + trait)
        r_dev_off()
        n = n + 1

    # Flush any output from R
    sys.stdout.flush()

    # Create the interactive graph for cytoscape visualization (Nodes and Edges)
    print(type(significant))
    # Skipped when the significance call returned an R NULL.
    if not type(significant) == ri.RNULLType:
        # Columns 0 and 2 of `significant` hold the "name:dataset" strings of
        # the source and target trait of each row.
        for x in range(len(significant[0])):
            print(significant[0][x], significant[1][x], significant[2][x]) # Debug to console
            tsS = significant[0][x].split(':') # Source
            tsT = significant[2][x].split(':') # Target
            gtS = TRAIT.GeneralTrait(name = tsS[0], dataset_name = tsS[1]) # Retrieve Source info from the DB
            gtT = TRAIT.GeneralTrait(name = tsT[0], dataset_name = tsT[1]) # Retrieve Target info from the DB
            self.addNode(gtS)
            self.addNode(gtT)
            self.addEdge(gtS, gtT, significant, x)

            significant[0][x] = gtS.symbol + " (" + gtS.name + ")" # Update the trait name for the displayed table
            significant[2][x] = gtT.symbol + " (" + gtT.name + ")" # Update the trait name for the displayed table

    self.elements = json.dumps(self.nodes_list + self.edges_list)
def do_mapping_for_api(start_vars):
    """Run a mapping (GEMMA or R/qtl) for one trait and return the markers.

    ``start_vars`` must contain ``'db'`` and ``'trait_id'``; it is also passed
    to ``initialize_parameters`` to obtain the mapping parameters.

    Returns a 2-tuple ``(payload, format)``:

    * format ``"csv"``: ``payload`` is a list of rows whose first row is the
      header (``name, chr, Mb, lod_score, p_value`` for GEMMA,
      ``name, chr, cM, lod_score`` for R/qtl);
    * format ``"json"``: ``payload`` is the list of result markers;
    * any other format: ``payload`` is the list of result markers and the
      returned format is ``None``.

    Raises ``AssertionError`` when a required key is missing and
    ``ValueError`` when the mapping method is neither ``"gemma"`` nor
    ``"rqtl"`` (this case previously failed with an ``UnboundLocalError``).
    """
    # Kept as asserts so callers keep seeing AssertionError for a missing key.
    assert ('db' in start_vars)
    assert ('trait_id' in start_vars)

    dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    dataset.group.get_markers()
    this_trait = GeneralTrait(dataset=dataset, name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, dataset)

    # Index the trait data by sample name once instead of rescanning it for
    # every sample. setdefault keeps the first entry per name, matching the
    # first-match-then-break behaviour of a linear scan.
    key_by_sample_name = {}
    for key in this_trait.data:
        key_by_sample_name.setdefault(this_trait.data[key].name, key)

    samples = []
    vals = []
    for sample in dataset.group.samplelist:
        if sample in key_by_sample_name:
            key = key_by_sample_name[sample]
            samples.append(key)
            vals.append(str(this_trait.data[key].value))
        else:
            # Samples without trait data are marked with "x" in the values.
            vals.append("x")

    mapping_params = initialize_parameters(start_vars, dataset, this_trait)

    covariates = ""  #ZS: It seems to take an empty string as default. This should probably be changed.

    mapping_method = mapping_params['mapping_method']
    if mapping_method == "gemma":
        header_row = ["name", "chr", "Mb", "lod_score", "p_value"]
        gemma_output = gemma_mapping.run_gemma(
            this_trait, dataset, samples, vals, covariates,
            mapping_params['use_loco'], mapping_params['maf'])
        if mapping_params['use_loco'] == "True":
            #ZS: gemma_mapping returns both results and the filename for LOCO, so need to only grab the former for api
            result_markers = gemma_output[0]
        else:
            result_markers = gemma_output
    elif mapping_method == "rqtl":
        header_row = ["name", "chr", "cM", "lod_score"]
        # Evaluated before the run so a bad num_perm fails before any work.
        with_permutations = mapping_params['num_perm'] > 0
        rqtl_output = rqtl_mapping.run_rqtl_geno(
            vals, dataset, mapping_params['rqtl_method'],
            mapping_params['rqtl_model'], mapping_params['perm_check'],
            mapping_params['num_perm'], mapping_params['do_control'],
            mapping_params['control_marker'],
            mapping_params['manhattan_plot'], mapping_params['pair_scan'])
        if with_permutations:
            # With permutations the call returns four values; only the
            # markers (the last one) are needed here.
            _perm_output, _suggestive, _significant, result_markers = rqtl_output
        else:
            result_markers = rqtl_output
    else:
        raise ValueError(
            "Unsupported mapping method: %r" % (mapping_method,))

    if mapping_params['limit_to']:
        result_markers = result_markers[:mapping_params['limit_to']]

    if mapping_params['format'] == "csv":
        output_rows = [header_row]
        output_rows.extend(
            [marker[header] for header in header_row]
            for marker in result_markers)
        return output_rows, mapping_params['format']
    elif mapping_params['format'] == "json":
        return result_markers, mapping_params['format']
    else:
        return result_markers, None
def __init__(self, params):
    """Load two traits and precompute the data for their scatter plot.

    ``params`` must contain ``dataset_1``, ``dataset_2``, ``trait_1``,
    ``trait_2`` and ``method``.

    Sets on ``self``: ``dataset_1``, ``dataset_2``, ``trait_1``, ``trait_2``,
    ``method``, ``data`` (the two value lists), ``indIDs`` (sample keys of
    the first trait), ``rdata`` (the two rank lists), ``collections_exist``
    (the string ``"True"`` or ``"False"``) and ``js_data`` / ``jsdata`` (one
    dict holding the values, ranges, regression results and intercept
    coordinates).
    """
    def _load_dataset(dataset_name):
        # Names containing "Temp" are created as a "Temp" dataset whose group
        # name is the second "_"-separated field of the name.
        if "Temp" in dataset_name:
            return data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = dataset_name.split("_")[1])
        return data_set.create_dataset(dataset_name)

    def _format_slope(value):
        # NOTE(review): every value below 0.001 - including all negative
        # slopes - is printed in scientific notation; confirm whether the
        # magnitude was meant to be compared instead.
        if value < 0.001:
            return '%.3E' % value
        return '%.3f' % value

    self.dataset_1 = _load_dataset(params['dataset_1'])
    self.dataset_2 = _load_dataset(params['dataset_2'])

    # A third dataset/trait (dataset_3 / trait_3) is not supported here.
    self.trait_1 = create_trait(name=params['trait_1'], dataset=self.dataset_1)
    self.trait_2 = create_trait(name=params['trait_2'], dataset=self.dataset_2)

    self.method = params['method']

    # Work on a copy: extending group.samplelist itself with += would add the
    # parent/F1 samples to the dataset group's own list every time an
    # instance is created.
    group = self.dataset_1.group
    primary_samples = list(group.samplelist)
    if group.parlist is not None:
        primary_samples += group.parlist
    if group.f1list is not None:
        primary_samples += group.f1list

    self.trait_1 = retrieve_sample_data(self.trait_1, self.dataset_1, primary_samples)
    self.trait_2 = retrieve_sample_data(self.trait_2, self.dataset_2, primary_samples)

    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data)

    self.indIDs = list(samples_1.keys())
    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]
    self.data = [vals_1, vals_2]

    # Linear regression on the raw values.
    slope, intercept, r_value, p_value, _std_err = stats.linregress(vals_1, vals_2)
    slope_string = _format_slope(slope)

    # Pad each axis by 10% of its data span.
    x_buffer = (max(vals_1) - min(vals_1))*0.1
    y_buffer = (max(vals_2) - min(vals_2))*0.1
    x_range = [min(vals_1) - x_buffer, max(vals_1) + x_buffer]
    y_range = [min(vals_2) - y_buffer, max(vals_2) + y_buffer]

    intercept_coords = get_intercept_coords(slope, intercept, x_range, y_range)

    # Same regression on the rank-transformed values.
    rx = stats.rankdata(vals_1)
    ry = stats.rankdata(vals_2)
    self.rdata = [rx.tolist(), ry.tolist()]
    srslope, srintercept, srr_value, srp_value, _srstd_err = stats.linregress(rx, ry)
    srslope_string = _format_slope(srslope)

    # A single range, derived from the ranks of the first trait, is used for
    # both axes of the rank plot.
    rank_buffer = (max(rx) - min(rx))*0.1
    sr_range = [min(rx) - rank_buffer, max(rx) + rank_buffer]

    sr_intercept_coords = get_intercept_coords(srslope, srintercept, sr_range, sr_range)

    # Stored as a string rather than a bool.
    self.collections_exist = "False"
    if g.user_session.num_collections > 0:
        self.collections_exist = "True"

    self.js_data = dict(
        data = self.data,
        rdata = self.rdata,
        indIDs = self.indIDs,
        trait_1 = self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2 = self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1 = samples_1,
        samples_2 = samples_2,
        num_overlap = num_overlap,
        vals_1 = vals_1,
        vals_2 = vals_2,
        x_range = x_range,
        y_range = y_range,
        sr_range = sr_range,
        intercept_coords = intercept_coords,
        sr_intercept_coords = sr_intercept_coords,
        slope = slope,
        slope_string = slope_string,
        intercept = intercept,
        r_value = r_value,
        p_value = p_value,
        srslope = srslope,
        srslope_string = srslope_string,
        srintercept = srintercept,
        srr_value = srr_value,
        srp_value = srp_value,
    )
    # Second name for the same dict.
    self.jsdata = self.js_data