def create_trait(**kw):
    """Create a trait object, checking resource permissions first.

    Exactly one of `dataset` (object) or `dataset_name` must be given,
    plus a trait `name`.  Returns the trait, or None when the current
    user has no data access to the resource.
    """
    assert bool(kw.get('dataset')) != bool(
        kw.get('dataset_name')), "Needs dataset ob. or name"
    assert bool(kw.get('name')), "Needs trait name"

    if bool(kw.get('dataset')):
        dataset = kw.get('dataset')
    else:
        if kw.get('dataset_name') != "Temp":
            dataset = create_dataset(kw.get('dataset_name'))
        else:
            # Temp datasets need the group name to be resolvable.
            dataset = create_dataset("Temp", group_name=kw.get('group_name'))

    # Publish (phenotype) resources are checked per-trait; others per-dataset.
    if dataset.type == 'Publish':
        permissions = check_resource_availability(dataset, kw.get('name'))
    else:
        permissions = check_resource_availability(dataset)

    if permissions['data'] != "no-access":
        the_trait = GeneralTrait(**kw)
        if the_trait.dataset.type != "Temp":
            # Temp traits have no stored metadata to retrieve.
            the_trait = retrieve_trait_info(
                the_trait,
                the_trait.dataset,
                get_qtl_info=kw.get('get_qtl_info'))
        return the_trait
    else:
        return None
def view_collection():
    """Show the traits in a logged-in user's collection.

    Reads 'uc_id' from the query string, builds trait objects for every
    collection member, and renders collections/view.html (or returns
    JSON when a 'json' parameter is present).
    """
    params = request.args
    uc_id = params['uc_id']
    # Bug fix: generators have no .next() method in Python 3 — use the
    # next() builtin instead.
    uc = next(collection for collection in g.user_session.user_collections
              if collection["id"] == uc_id)
    traits = uc["members"]

    trait_obs = []
    json_version = []
    for atrait in traits:
        name, dataset_name = atrait.split(':')
        if dataset_name == "Temp":
            # Temp traits encode their group name in the trait id.
            group = name.split("_")[2]
            dataset = create_dataset(dataset_name, dataset_type="Temp",
                                     group_name=group)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
        else:
            dataset = create_dataset(dataset_name)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
            trait_ob = trait.retrieve_trait_info(trait_ob, dataset,
                                                 get_qtl_info=True)
        trait_obs.append(trait_ob)
        json_version.append(trait.jsonable(trait_ob))

    collection_info = dict(trait_obs=trait_obs, uc=uc)
    if "json" in params:
        return json.dumps(json_version)
    else:
        return render_template("collections/view.html", **collection_info)
def loading_page():
    """Render the loading page shown while an analysis is computed.

    Filters the submitted form down to the fields named in
    'wanted_inputs', computes the usable sample count and a hash/diff of
    the sample values, then renders loading.html with those vars.
    """
    # logger.info(request.url)
    initial_start_vars = request.form
    start_vars_container = {}
    n_samples = 0  # ZS: So it can be displayed on loading page
    if 'wanted_inputs' in initial_start_vars:
        wanted = initial_start_vars['wanted_inputs'].split(",")
        start_vars = {}
        # Keep only the form fields the loading page actually needs.
        for key, value in list(initial_start_vars.items()):
            if key in wanted:
                start_vars[key] = value

        sample_vals_dict = json.loads(start_vars['sample_vals'])
        if 'n_samples' in start_vars:
            n_samples = int(start_vars['n_samples'])
        else:
            # Derive the sample list from the dataset/group instead.
            if 'group' in start_vars:
                dataset = create_dataset(start_vars['dataset'],
                                         group_name=start_vars['group'])
            else:
                dataset = create_dataset(start_vars['dataset'])
            samples = dataset.group.samplelist
            if 'genofile' in start_vars:
                if start_vars['genofile'] != "":
                    # A non-default genotype file can restrict the samples.
                    genofile_string = start_vars['genofile']
                    dataset.group.genofile = genofile_string.split(":")[0]
                    genofile_samples = run_mapping.get_genofile_samplelist(
                        dataset)
                    if len(genofile_samples) > 1:
                        samples = genofile_samples
            # Count samples with a real (non-"x") submitted value.
            for sample in samples:
                if sample in sample_vals_dict:
                    if sample_vals_dict[sample] != "x":
                        n_samples += 1
            start_vars['n_samples'] = n_samples

        start_vars['vals_hash'] = generate_hash_of_string(
            str(sample_vals_dict))
        if start_vars[
                'dataset'] != "Temp":  # Currently can't get diff for temp traits
            start_vars['vals_diff'] = get_diff_of_vals(
                sample_vals_dict,
                str(start_vars['trait_id'] + ":" + str(start_vars['dataset'])))

        start_vars['wanted_inputs'] = initial_start_vars['wanted_inputs']
        start_vars_container['start_vars'] = start_vars
    else:
        start_vars_container['start_vars'] = initial_start_vars

    rendered_template = render_template("loading.html", **start_vars_container)
    return rendered_template
def do_correlation(start_vars):
    """Run a correlation between one trait and a target dataset.

    Requires 'db', 'target_db' and 'trait_id' in start_vars.  Returns a
    list of per-trait result dicts whose shape depends on the
    correlation type (sample / tissue / literature).
    """
    assert ('db' in start_vars)
    assert ('target_db' in start_vars)
    assert ('trait_id' in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = GeneralTrait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)
    corr_params = init_corr_params(start_vars)

    corr_results = calculate_results(this_trait, this_dataset,
                                     target_dataset, corr_params)

    final_results = []
    # Bug fix: in Python 3, dict.keys() returns a view object which
    # cannot be sliced — wrap it in list() before taking the top N.
    for _trait_counter, trait in enumerate(
            list(corr_results.keys())[:corr_params['return_count']]):
        if corr_params['type'] == "tissue":
            [sample_r, num_overlap, sample_p, symbol] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] == "literature" or corr_params[
                'type'] == "lit":
            [gene_id, sample_r] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            [sample_r, sample_p, num_overlap] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(result_dict)
    return final_results
def __init__(self, get_qtl_info=False, get_sample_info=True, **kw):
    """Construct a trait from either a dataset object or a dataset name.

    Exactly one of kw['dataset'] / kw['dataset_name'] must be given
    (xor).  Optionally parses kw['fullname'] ("dataset::name[::cellid]")
    and retrieves sample data unless get_sample_info is False.
    """
    # xor assertion
    assert bool(kw.get('dataset')) != bool(
        kw.get('dataset_name')), "Needs dataset ob. or name"
    # Trait ID, ProbeSet ID, Published ID, etc.
    self.name = kw.get('name')
    if kw.get('dataset_name'):
        if kw.get('dataset_name') == "Temp":
            # Temp traits encode their group name in the trait id.
            temp_group = self.name.split("_")[2]
            self.dataset = create_dataset(dataset_name="Temp",
                                          dataset_type="Temp",
                                          group_name=temp_group)
        else:
            self.dataset = create_dataset(kw.get('dataset_name'))
    else:
        self.dataset = kw.get('dataset')
    self.cellid = kw.get('cellid')
    self.identification = kw.get('identification', 'un-named trait')
    self.haveinfo = kw.get('haveinfo', False)
    # Blat sequence, available for ProbeSet
    self.sequence = kw.get('sequence')
    self.data = kw.get('data', {})
    self.view = True

    # Sets defaults
    self.locus = None
    self.lrs = None
    self.pvalue = None
    self.mean = None
    self.additive = None
    self.num_overlap = None
    self.strand_probe = None
    self.symbol = None
    self.display_name = self.name

    self.LRS_score_repr = "N/A"
    self.LRS_location_repr = "N/A"

    if kw.get('fullname'):
        # Bug fix: this previously split an undefined name `value`
        # (NameError at runtime); the fullname keyword is what is parsed.
        name2 = kw['fullname'].split("::")
        if len(name2) == 2:
            self.dataset, self.name = name2
            # self.cellid is set to None above
        elif len(name2) == 3:
            self.dataset, self.name, self.cellid = name2

    # Todo: These two lines are necessary most of the time, but
    # perhaps not all of the time So we could add a simple if
    # statement to short-circuit this if necessary
    if get_sample_info is not False:
        self = retrieve_sample_data(self, self.dataset)
def view_collection():
    """Display a trait collection for a logged-in or anonymous user.

    Looks the collection up by 'uc_id' (DB-backed user collections) or
    'collection_id' (Redis-backed anonymous collections), builds trait
    objects for its members, and renders collections/view.html (or
    returns JSON when requested).
    """
    params = request.args
    print("PARAMS in view collection:", params)

    if "uc_id" in params:
        # Logged-in user: collection lives in the relational DB.
        uc_id = params['uc_id']
        uc = model.UserCollection.query.get(uc_id)
        traits = json.loads(uc.members)
    else:
        # Anonymous user: collections are stored as JSON in Redis.
        user_collections = json.loads(Redis.get(user_manager.AnonUser().key))
        this_collection = {}
        for collection in user_collections:
            if collection['id'] == params['collection_id']:
                this_collection = collection
                break
        #this_collection = user_collections[params['collection_id']]
        traits = this_collection['members']

    print("in view_collection traits are:", traits)

    trait_obs = []
    json_version = []
    for atrait in traits:
        name, dataset_name = atrait.split(':')
        if dataset_name == "Temp":
            # Temp traits encode their group name in the trait id.
            group = name.split("_")[2]
            dataset = create_dataset(dataset_name, dataset_type = "Temp", group_name = group)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
        else:
            dataset = create_dataset(dataset_name)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
            # Temp traits have no stored metadata, so only non-Temp
            # traits get retrieve_trait_info.
            trait_ob = trait.retrieve_trait_info(trait_ob, dataset, get_qtl_info=True)
        trait_obs.append(trait_ob)
        json_version.append(trait.jsonable(trait_ob))

    if "uc_id" in params:
        collection_info = dict(trait_obs=trait_obs, uc = uc)
    else:
        collection_info = dict(trait_obs=trait_obs,
                               collection_name=this_collection['name'])
    if "json" in params:
        print("json_version:", json_version)
        return json.dumps(json_version)
    else:
        return render_template("collections/view.html", **collection_info )
def __init__(self, get_qtl_info=False, **kw):
    """Construct a trait from either a dataset object or a dataset name.

    Exactly one of kw['dataset'] / kw['dataset_name'] must be given
    (xor).  Optionally parses kw['fullname'] ("dataset::name[::cellid]"),
    then retrieves trait info and sample data.
    """
    # xor assertion
    assert bool(kw.get("dataset")) != bool(kw.get("dataset_name")), "Needs dataset ob. or name"
    if kw.get("dataset_name"):
        self.dataset = create_dataset(kw.get("dataset_name"))
        print(" in GeneralTrait created dataset:", self.dataset)
    else:
        self.dataset = kw.get("dataset")
    self.name = kw.get("name")  # Trait ID, ProbeSet ID, Published ID, etc.
    self.cellid = kw.get("cellid")
    self.identification = kw.get("identification", "un-named trait")
    self.haveinfo = kw.get("haveinfo", False)
    self.sequence = kw.get("sequence")  # Blat sequence, available for ProbeSet
    self.data = kw.get("data", {})

    # Sets defaults
    self.locus = None
    self.lrs = None
    self.pvalue = None
    self.mean = None
    self.num_overlap = None

    if kw.get("fullname"):
        # Bug fix: this previously split an undefined name `value`
        # (NameError at runtime); the fullname keyword is what is parsed.
        name2 = kw["fullname"].split("::")
        if len(name2) == 2:
            self.dataset, self.name = name2
            # self.cellid is set to None above
        elif len(name2) == 3:
            self.dataset, self.name, self.cellid = name2

    # Todo: These two lines are necessary most of the time, but perhaps not all of the time
    # So we could add a simple if statement to short-circuit this if necessary
    self.retrieve_info(get_qtl_info=get_qtl_info)
    self.retrieve_sample_data()
def gen_covariates_file(this_dataset, covariates):
    """Write a covariates file for mapping, one column per covariate.

    `covariates` is a comma-separated list of "trait:dataset" pairs.
    Sample values missing from a covariate trait are written as "-9"
    (the missing-value marker used by the mapping tools).
    """
    covariate_list = covariates.split(",")
    covariate_data_object = []
    for covariate in covariate_list:
        this_covariate_data = []
        trait_name = covariate.split(":")[0]
        dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = GeneralTrait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        #trait_samples = this_dataset.group.all_samples_ordered()
        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        logger.debug("SAMPLES:", trait_samples)
        trait_sample_data = trait_ob.data
        logger.debug("SAMPLE DATA:", trait_sample_data)
        # Idiom fix: the enumerate() index was never used.
        for sample in trait_samples:
            if sample in trait_sample_data:
                sample_value = trait_sample_data[sample].value
                this_covariate_data.append(sample_value)
            else:
                this_covariate_data.append("-9")
        covariate_data_object.append(this_covariate_data)

    # Transposed write-out: each output row is one sample, columns are
    # the covariates, tab-separated.
    with open("{}/{}_covariates.txt".format(flat_files('mapping'),
                                            this_dataset.group.name),
              "w") as outfile:
        for i in range(len(covariate_data_object[0])):
            for this_covariate in covariate_data_object:
                outfile.write(str(this_covariate[i]) + "\t")
            outfile.write("\n")
def create_target_this_trait(start_vars):
    """Build the trait and the two datasets needed for a correlation run.

    Returns a tuple (this_dataset, this_trait, target_dataset,
    sample_data) where sample_data is an empty placeholder tuple.
    """
    primary_name = start_vars['dataset']
    if primary_name == "Temp":
        # Temp datasets need the group name to be resolvable.
        this_dataset = data_set.create_dataset(
            dataset_name="Temp",
            dataset_type="Temp",
            group_name=start_vars['group'])
    else:
        this_dataset = data_set.create_dataset(dataset_name=primary_name)

    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['corr_dataset'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    sample_data = ()
    return (this_dataset, this_trait, target_dataset, sample_data)
def __init__(self, start_vars):
    """Set up the two traits being compared and map sample coordinates."""
    self.dataset1 = data_set.create_dataset(start_vars["dataset1"])
    # NOTE(review): the dataset *name* is passed as `dataset=`, while
    # sibling code passes the dataset object — confirm this is intended.
    self.trait1 = GeneralTrait(dataset=self.dataset1.name,
                               name=start_vars["trait1"])

    self.dataset2 = data_set.create_dataset(start_vars["dataset2"])
    self.trait2 = GeneralTrait(dataset=self.dataset2.name,
                               name=start_vars["trait2"])

    names_one = self.get_sample_names(self.dataset1)
    names_two = self.get_sample_names(self.dataset2)
    self.samples_1 = self.get_samples(self.dataset1, names_one, self.trait1)
    self.samples_2 = self.get_samples(self.dataset2, names_two, self.trait2)

    # Map sample name -> value for the first trait's samples.
    coords = {sample.name: sample.val for sample in self.samples_1}
def gen_covariates_file(this_dataset, covariates, samples):
    """Write a covariates file for mapping, restricted to `samples`.

    `covariates` is a comma-separated list of "trait:dataset" pairs.
    Only samples present in `samples` are written; values missing from a
    covariate trait are written as "-9" (missing-value marker).
    """
    covariate_list = covariates.split(",")
    covariate_data_object = []
    for covariate in covariate_list:
        this_covariate_data = []
        trait_name = covariate.split(":")[0]
        dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        trait_sample_data = trait_ob.data
        # Idiom fix: the enumerate() index was never used.
        for sample in trait_samples:
            if sample in samples:
                if sample in trait_sample_data:
                    sample_value = trait_sample_data[sample].value
                    this_covariate_data.append(sample_value)
                else:
                    this_covariate_data.append("-9")
        covariate_data_object.append(this_covariate_data)

    # Transposed write-out: rows are samples, columns are covariates.
    with open((f"{flat_files('mapping')}/"
               f"{this_dataset.group.name}_covariates.txt"),
              "w") as outfile:
        for i in range(len(covariate_data_object[0])):
            for this_covariate in covariate_data_object:
                outfile.write(str(this_covariate[i]) + "\t")
            outfile.write("\n")
def gen_covariates_file(this_dataset, covariates):
    """Write a covariates file for mapping, one column per covariate.

    `covariates` is a comma-separated list of "trait:dataset" pairs.
    Sample values missing from a covariate trait are written as "-9".
    """
    covariate_list = covariates.split(",")
    covariate_data_object = []
    for covariate in covariate_list:
        this_covariate_data = []
        trait_name = covariate.split(":")[0]
        dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = GeneralTrait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        #trait_samples = this_dataset.group.all_samples_ordered()
        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        logger.debug("SAMPLES:", trait_samples)
        trait_sample_data = trait_ob.data
        logger.debug("SAMPLE DATA:", trait_sample_data)
        # Idiom fix: the enumerate() index was never used.
        for sample in trait_samples:
            if sample in trait_sample_data:
                sample_value = trait_sample_data[sample].value
                this_covariate_data.append(sample_value)
            else:
                this_covariate_data.append("-9")
        covariate_data_object.append(this_covariate_data)

    # Transposed write-out: rows are samples, columns are covariates.
    with open(
            "{}/{}_covariates.txt".format(flat_files('mapping'),
                                          this_dataset.group.name),
            "w") as outfile:
        for i in range(len(covariate_data_object[0])):
            for this_covariate in covariate_data_object:
                outfile.write(str(this_covariate[i]) + "\t")
            outfile.write("\n")
def run_analysis(self, requestform):
    """Initialize an ePheWAS analysis run for the requested trait/dataset.

    Loads the BXD genotype file, builds the SNP aligner matrix in R, and
    creates the phenotype aligner.
    """
    print("Starting ePheWAS analysis on dataset")
    # Get the location of the BXD genotypes and the Tissue_color_aligner.
    genofilelocation = locate("BXD.geno", "genotype")
    tissuealignerloc = locate("Tissue_color_aligner.csv", "auwerx")

    # Get user parameters, trait_id and dataset, and store/update them in self
    self.trait_id = requestform["trait_id"]
    self.datasetname = requestform["dataset"]
    self.dataset = data_set.create_dataset(self.datasetname)

    # Bug fix: converted Python 2 `print "..."` statements (a SyntaxError
    # under Python 3) to the print() function used elsewhere in this method.
    print("self.trait_id:" + self.trait_id + "\n")
    print("self.datasetname:" + self.datasetname + "\n")
    print("self.dataset.type:" + self.dataset.type + "\n")

    # Load in the genotypes file *sigh* to make the markermap
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    snpinfo = []
    for marker in parser.markers:
        snpinfo.append(marker["name"])
        snpinfo.append(marker["chr"])
        snpinfo.append(marker["Mb"])

    rnames = r_seq(1, len(parser.markers))
    # Create the snp aligner object out of the BXD genotypes
    snpaligner = ro.r.matrix(snpinfo,
                             nrow=len(parser.markers),
                             dimnames=r_list(rnames, r_c("SNP", "Chr", "Pos")),
                             ncol=3,
                             byrow=True)
    # Create the phenotype aligner object using R
    phenoaligner = self.r_create_Pheno_aligner()
    print("Initialization of ePheWAS done !")
def loading_page():
    """Render the loading page shown while an analysis is computed.

    (Older variant.)  Filters the submitted form down to the fields
    named in 'wanted_inputs', computes the usable sample count, then
    renders loading.html with those vars.
    """
    logger.info(request.url)
    initial_start_vars = request.form
    start_vars_container = {}
    n_samples = 0  #ZS: So it can be displayed on loading page
    if 'wanted_inputs' in initial_start_vars:
        wanted = initial_start_vars['wanted_inputs'].split(",")
        start_vars = {}
        # Keep only the form fields the loading page actually needs.
        for key, value in list(initial_start_vars.items()):
            if key in wanted:
                start_vars[key] = value
        if 'n_samples' in start_vars:
            n_samples = int(start_vars['n_samples'])
        else:
            sample_vals_dict = json.loads(start_vars['sample_vals'])
            if 'group' in start_vars:
                dataset = create_dataset(start_vars['dataset'],
                                         group_name=start_vars['group'])
            else:
                dataset = create_dataset(start_vars['dataset'])
            # NOTE(review): assigned but never used in this function.
            genofile_samplelist = []
            samples = start_vars['primary_samples'].split(",")
            if 'genofile' in start_vars:
                if start_vars['genofile'] != "":
                    # A non-default genotype file can restrict the samples.
                    genofile_string = start_vars['genofile']
                    dataset.group.genofile = genofile_string.split(":")[0]
                    genofile_samples = run_mapping.get_genofile_samplelist(
                        dataset)
                    if len(genofile_samples) > 1:
                        samples = genofile_samples
            # Count samples with a real (non-"x") submitted value.
            for sample in samples:
                if sample in sample_vals_dict:
                    if sample_vals_dict[sample] != "x":
                        n_samples += 1
            start_vars['n_samples'] = n_samples
        start_vars['wanted_inputs'] = initial_start_vars['wanted_inputs']
        start_vars_container['start_vars'] = start_vars
    else:
        start_vars_container['start_vars'] = initial_start_vars

    rendered_template = render_template("loading.html", **start_vars_container)
    return rendered_template
def __init__(self, kw):
    """
    This class gets invoked after hitting submit on the main menu (in
    views.py).  `kw` carries the search terms, dataset name and trait
    type from the form.
    """
    ###########################################
    # Names and IDs of group / F2 set
    ###########################################
    self.uc_id = uuid.uuid4()
    self.go_term = None
    logger.debug("uc_id:", self.uc_id)  # contains a unique id
    logger.debug("kw is:", kw)  # dict containing search terms

    if kw['search_terms_or']:
        self.and_or = "or"
        self.search_terms = kw['search_terms_or']
    else:
        self.and_or = "and"
        self.search_terms = kw['search_terms_and']
    search = self.search_terms
    self.original_search_string = self.search_terms

    # check for dodgy search terms
    rx = re.compile(r'.*\W(href|http|sql|select|update)\W.*', re.IGNORECASE)
    if rx.match(search):
        logger.info("Regex failed search")
        self.search_term_exists = False
        return
    else:
        self.search_term_exists = True

    self.results = []
    # Idiom fix: renamed local `type` so it no longer shadows the builtin.
    search_type = kw.get('type')
    if search_type == "Phenotypes":  # split datatype on type field
        dataset_type = "Publish"
    elif search_type == "Genotypes":
        dataset_type = "Geno"
    else:
        dataset_type = "ProbeSet"  # ProbeSet is default

    assert (is_str(kw.get('dataset')))
    self.dataset = create_dataset(kw['dataset'], dataset_type)
    logger.debug("search_terms:", self.search_terms)

    # ZS: I don't like using try/except, but it seems like the easiest
    # way to account for all possible bad searches here
    try:
        self.search()
    except Exception:
        # Robustness fix: narrowed from a bare `except:` so that
        # SystemExit/KeyboardInterrupt are no longer swallowed.
        self.search_term_exists = False

    self.too_many_results = False
    if self.search_term_exists:
        if len(self.results) > 50000:
            self.trait_list = []
            self.too_many_results = True
        else:
            self.gen_search_result()
def view_collection():
    """Show a collection for a logged-in (session) or anonymous (Redis) user."""
    params = request.args
    if g.user_session.logged_in and "uc_id" in params:
        uc_id = params['uc_id']
        # Bug fix: generators have no .next() method in Python 3 — use
        # the next() builtin instead.
        uc = next(collection
                  for collection in g.user_session.user_collections
                  if collection["id"] == uc_id)
        traits = uc["members"]
    else:
        # Anonymous user: collections are stored as JSON in Redis.
        user_collections = json.loads(Redis.get(user_manager.AnonUser().key))
        this_collection = {}
        for collection in user_collections:
            if collection['id'] == params['collection_id']:
                this_collection = collection
                break
        traits = this_collection['members']

    trait_obs = []
    json_version = []
    for atrait in traits:
        name, dataset_name = atrait.split(':')
        if dataset_name == "Temp":
            # Temp traits encode their group name in the trait id.
            group = name.split("_")[2]
            dataset = create_dataset(dataset_name, dataset_type="Temp",
                                     group_name=group)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
        else:
            dataset = create_dataset(dataset_name)
            trait_ob = trait.GeneralTrait(name=name, dataset=dataset)
            trait_ob = trait.retrieve_trait_info(trait_ob, dataset,
                                                 get_qtl_info=True)
        trait_obs.append(trait_ob)
        json_version.append(trait.jsonable(trait_ob))

    if "uc_id" in params:
        collection_info = dict(trait_obs=trait_obs, uc=uc)
    else:
        collection_info = dict(trait_obs=trait_obs,
                               collection_name=this_collection['name'])
    if "json" in params:
        return json.dumps(json_version)
    else:
        return render_template("collections/view.html", **collection_info)
def geno_db_exists(this_dataset):
    """Return "True" if a genotype dataset exists for this group, else "False".

    Returns strings rather than booleans, matching how callers compare
    the result.
    """
    geno_db_name = this_dataset.group.name + "Geno"
    try:
        # Only success/failure matters; the created object is discarded
        # (the previous unused local binding was removed).
        data_set.create_dataset(dataset_name=geno_db_name,
                                get_samplelist=False)
        return "True"
    except Exception:
        # Robustness fix: narrowed from a bare `except:` so that
        # SystemExit/KeyboardInterrupt are not swallowed.
        return "False"
def jsonable_table_row(trait, dataset_name, index):
    """Return a list suitable for json and intended to be displayed in a table

    Actual turning into json doesn't happen here though.  The row shape
    depends on the dataset type (ProbeSet / Publish / Geno); unknown
    types yield an empty dict.
    """
    dataset = create_dataset(dataset_name)

    if dataset.type == "ProbeSet":
        # Empty mean/additive values are shown as "N/A".
        if trait.mean == "":
            mean = "N/A"
        else:
            mean = "%.3f" % round(float(trait.mean), 2)
        if trait.additive == "":
            additive = "N/A"
        else:
            additive = "%.3f" % round(float(trait.additive), 2)
        return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + user_manager.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                index,
                '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                trait.symbol,
                trait.description_display,
                trait.location_repr,
                mean,
                trait.LRS_score_repr,
                trait.LRS_location_repr,
                additive]
    elif dataset.type == "Publish":
        if trait.additive == "":
            additive = "N/A"
        else:
            additive = "%.2f" % round(float(trait.additive), 2)
        if trait.pubmed_id:
            # NOTE(review): '</href>' is not a valid closing tag
            # (should presumably be '</a>') — confirm before changing.
            return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + user_manager.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                    index,
                    '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                    trait.description_display,
                    trait.authors,
                    '<a href="' + trait.pubmed_link + '">' + trait.pubmed_text + '</href>',
                    trait.LRS_score_repr,
                    trait.LRS_location_repr,
                    additive]
        else:
            return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + user_manager.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                    index,
                    '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                    trait.description_display,
                    trait.authors,
                    trait.pubmed_text,
                    trait.LRS_score_repr,
                    trait.LRS_location_repr,
                    additive]
    elif dataset.type == "Geno":
        return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' +
                user_manager.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                index,
                '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                trait.location_repr]
    else:
        return dict()
def jsonable_table_row(trait, dataset_name, index):
    """Return a list suitable for json and intended to be displayed in a table

    Actual turning into json doesn't happen here though.  The row shape
    depends on the dataset type (ProbeSet / Publish / Geno); unknown
    types yield an empty dict.  (Variant using the `hmac` helper module.)
    """
    dataset = create_dataset(dataset_name)

    if dataset.type == "ProbeSet":
        # Empty mean/additive values are shown as "N/A".
        if trait.mean == "":
            mean = "N/A"
        else:
            mean = "%.3f" % round(float(trait.mean), 2)
        if trait.additive == "":
            additive = "N/A"
        else:
            additive = "%.3f" % round(float(trait.additive), 2)
        return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + hmac.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                index,
                '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                trait.symbol,
                trait.description_display,
                trait.location_repr,
                mean,
                trait.LRS_score_repr,
                trait.LRS_location_repr,
                additive]
    elif dataset.type == "Publish":
        if trait.additive == "":
            additive = "N/A"
        else:
            additive = "%.2f" % round(float(trait.additive), 2)
        if trait.pubmed_id:
            # NOTE(review): '</href>' is not a valid closing tag
            # (should presumably be '</a>') — confirm before changing.
            return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + hmac.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                    index,
                    '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                    trait.description_display,
                    trait.authors,
                    '<a href="' + trait.pubmed_link + '">' + trait.pubmed_text + '</href>',
                    trait.LRS_score_repr,
                    trait.LRS_location_repr,
                    additive]
        else:
            return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' + hmac.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                    index,
                    '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                    trait.description_display,
                    trait.authors,
                    trait.pubmed_text,
                    trait.LRS_score_repr,
                    trait.LRS_location_repr,
                    additive]
    elif dataset.type == "Geno":
        return ['<input type="checkbox" name="searchResult" class="checkbox trait_checkbox" value="' +
                hmac.data_hmac('{}:{}'.format(str(trait.name), dataset.name)) + '">',
                index,
                '<a href="/show_trait?trait_id='+str(trait.name)+'&dataset='+dataset.name+'">'+str(trait.name)+'</a>',
                trait.location_repr]
    else:
        return dict()
def __init__(self, kw):
    """This class gets invoked after hitting submit on the main menu
    (in views.py).  `kw` carries the search terms, dataset name and
    trait type from the submitted form.
    """
    self.uc_id = uuid.uuid4()
    logger.debug("uc_id:", self.uc_id)  # unique id for this search
    logger.debug("kw is:", kw)  # dict containing search terms

    # The "or" variant of the terms takes precedence over "and".
    if kw['search_terms_or']:
        self.and_or, self.search_terms = "or", kw['search_terms_or']
    else:
        self.and_or, self.search_terms = "and", kw['search_terms_and']
    search = self.search_terms

    # check for dodgy search terms
    rx = re.compile(r'.*\W(href|http|sql|select|update)\W.*', re.IGNORECASE)
    if rx.match(search):
        logger.info("Regex failed search")
        self.search_term_exists = False
        return
    self.search_term_exists = True

    self.results = []
    # Map the menu's trait type onto a dataset type; ProbeSet is default.
    dataset_type = {
        "Phenotypes": "Publish",
        "Genotypes": "Geno",
    }.get(kw.get('type'), "ProbeSet")

    assert (is_str(kw.get('dataset')))
    self.dataset = create_dataset(kw['dataset'], dataset_type)
    logger.debug("search_terms:", self.search_terms)
    self.search()
    if self.search_term_exists:
        self.gen_search_result()
def do_correlation(start_vars):
    """Run a correlation between one trait and a target dataset.

    Requires 'db', 'target_db' and 'trait_id' in start_vars.  Returns a
    list of per-trait result dicts whose keys depend on the correlation
    type (sample / tissue / literature).
    """
    assert('db' in start_vars)
    assert('target_db' in start_vars)
    assert('trait_id' in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)

    corr_params = init_corr_params(start_vars)
    corr_results = calculate_results(
        this_trait, this_dataset, target_dataset, corr_params)

    # Keep only the top-N results, in the order calculate_results gave them.
    top_traits = list(corr_results.keys())[:corr_params['return_count']]

    final_results = []
    for found_trait in top_traits:
        values = corr_results[found_trait]
        if corr_params['type'] == "tissue":
            sample_r, num_overlap, sample_p, symbol = values
            result_dict = {
                "trait": found_trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] in ("literature", "lit"):
            gene_id, sample_r = values
            result_dict = {
                "trait": found_trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            sample_r, sample_p, num_overlap = values
            result_dict = {
                "trait": found_trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(result_dict)
    return final_results
def __init__(self, kw): """This class gets invoked after hitting submit on the main menu (in views.py). """ ########################################### # Names and IDs of group / F2 set ########################################### # All Phenotypes is a special case we'll deal with later #if kw['dataset'] == "All Phenotypes": # self.cursor.execute(""" # select PublishFreeze.Name, InbredSet.Name, InbredSet.Id from PublishFreeze, # InbredSet where PublishFreeze.Name not like 'BXD300%' and InbredSet.Id = # PublishFreeze.InbredSetId""") # results = self.cursor.fetchall() # self.dataset = map(lambda x: DataSet(x[0], self.cursor), results) # self.dataset_groups = map(lambda x: x[1], results) # self.dataset_group_ids = map(lambda x: x[2], results) #else: self.uc_id = uuid.uuid4() logger.debug("uc_id:", self.uc_id) # contains a unique id logger.debug("kw is:", kw) # dict containing search terms if kw['search_terms_or']: self.and_or = "or" self.search_terms = kw['search_terms_or'] else: self.and_or = "and" self.search_terms = kw['search_terms_and'] search = self.search_terms # check for dodgy search terms rx = re.compile(r'.*\W(href|http|sql|select|update)\W.*',re.IGNORECASE) if rx.match(search): logger.info("Regex failed search") self.search_term_exists = False return else: self.search_term_exists = True self.results = [] type = kw.get('type') if type == "Phenotypes": # split datatype on type field dataset_type = "Publish" elif type == "Genotypes": dataset_type = "Geno" else: dataset_type = "ProbeSet" # ProbeSet is default assert(is_str(kw.get('dataset'))) self.dataset = create_dataset(kw['dataset'], dataset_type) logger.debug("search_terms:", self.search_terms) self.search() if self.search_term_exists: self.gen_search_result()
def get_trait_db_obs(self, trait_db_list):
    """Build (trait, dataset) pairs from a list of "trait:dataset" strings.

    Populates self.trait_list with one (GeneralTrait, dataset) tuple per
    entry, excluding the final element of trait_db_list.
    """
    self.trait_list = []
    for i, trait_db in enumerate(trait_db_list):
        # NOTE(review): the last entry is deliberately skipped —
        # presumably a trailing empty element from a trailing
        # separator; confirm against the callers.
        if i == (len(trait_db_list) - 1):
            break
        trait_name, dataset_name = trait_db.split(":")
        dataset_ob = data_set.create_dataset(dataset_name)
        trait_ob = GeneralTrait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        self.trait_list.append((trait_ob, dataset_ob))
def set_template_vars(start_vars, correlation_data):
    """Fill correlation_data with everything the results template needs.

    Serializes the primary trait/dataset, the target dataset, and the
    pre-rendered table JSON, plus display metadata (filter columns,
    headers, formatted correlation type).
    """
    corr_type = start_vars['corr_type']
    corr_method = start_vars['corr_sample_method']

    if start_vars['dataset'] == "Temp":
        # Temp datasets need the group name to be resolvable.
        this_dataset_ob = create_dataset(dataset_name="Temp",
                                         dataset_type="Temp",
                                         group_name=start_vars['group'])
    else:
        this_dataset_ob = create_dataset(dataset_name=start_vars['dataset'])
    this_trait = create_trait(dataset=this_dataset_ob,
                              name=start_vars['trait_id'])

    correlation_data['this_trait'] = jsonable(this_trait, this_dataset_ob)
    correlation_data['this_dataset'] = this_dataset_ob.as_dict()

    target_dataset_ob = create_dataset(correlation_data['target_dataset'])
    correlation_data['target_dataset'] = target_dataset_ob.as_dict()

    correlation_data['table_json'] = correlation_json_for_table(
        correlation_data,
        correlation_data['this_trait'],
        correlation_data['this_dataset'],
        target_dataset_ob)

    # Which table columns are filterable depends on the dataset type.
    filter_cols = {
        "ProbeSet": [7, 6],
        "Publish": [6, 0],
    }.get(target_dataset_ob.type, [4, 0])

    correlation_data['corr_method'] = corr_method
    correlation_data['filter_cols'] = filter_cols
    correlation_data['header_fields'] = get_header_fields(
        target_dataset_ob.type, correlation_data['corr_method'])
    correlation_data['formatted_corr_type'] = get_formatted_corr_type(
        corr_type, corr_method)
    return correlation_data
def jsonable(trait):
    """Return a dict suitable for using as json

    Actual turning into json doesn't happen here though.  The dict's
    keys depend on the dataset type (ProbeSet / Publish / Geno);
    unknown types yield an empty dict.
    """
    # Rebuild the dataset from the trait's own dataset attributes.
    dataset = create_dataset(dataset_name = trait.dataset.name,
                             dataset_type = trait.dataset.type,
                             group_name = trait.dataset.group.name)

    if dataset.type == "ProbeSet":
        return dict(name=trait.name,
                    symbol=trait.symbol,
                    dataset=dataset.name,
                    dataset_name = dataset.shortname,
                    description=trait.description_display,
                    mean=trait.mean,
                    location=trait.location_repr,
                    lrs_score=trait.LRS_score_repr,
                    lrs_location=trait.LRS_location_repr,
                    additive=trait.additive
                    )
    elif dataset.type == "Publish":
        # A pubmed_link key is included only when a pubmed id exists.
        if trait.pubmed_id:
            return dict(name=trait.name,
                        dataset=dataset.name,
                        dataset_name = dataset.shortname,
                        description=trait.description_display,
                        abbreviation=trait.abbreviation,
                        authors=trait.authors,
                        pubmed_text=trait.pubmed_text,
                        pubmed_link=trait.pubmed_link,
                        lrs_score=trait.LRS_score_repr,
                        lrs_location=trait.LRS_location_repr,
                        additive=trait.additive
                        )
        else:
            return dict(name=trait.name,
                        dataset=dataset.name,
                        dataset_name = dataset.shortname,
                        description=trait.description_display,
                        abbreviation=trait.abbreviation,
                        authors=trait.authors,
                        pubmed_text=trait.pubmed_text,
                        lrs_score=trait.LRS_score_repr,
                        lrs_location=trait.LRS_location_repr,
                        additive=trait.additive
                        )
    elif dataset.type == "Geno":
        return dict(name=trait.name,
                    dataset=dataset.name,
                    dataset_name = dataset.shortname,
                    location=trait.location_repr
                    )
    else:
        return dict()
def view_collection():
    """Show a user collection's traits as HTML (or JSON when requested)."""
    params = request.args
    uc_id = params['uc_id']
    uc = next(
        (collection for collection in g.user_session.user_collections
         if collection["id"] == uc_id))

    trait_obs = []
    json_version = []
    for member in uc["members"]:
        # Entries must look like "name:dataset"; skip anything malformed.
        if ':' not in member:
            continue
        name, dataset_name = member.split(':')
        if dataset_name == "Temp":
            # Temp traits carry their group name inside the trait id.
            group = name.split("_")[2]
            dataset = create_dataset(
                dataset_name, dataset_type="Temp", group_name=group)
            trait_ob = create_trait(name=name, dataset=dataset)
        else:
            dataset = create_dataset(dataset_name)
            trait_ob = create_trait(name=name, dataset=dataset)
            trait_ob = retrieve_trait_info(
                trait_ob, dataset, get_qtl_info=True)
        trait_obs.append(trait_ob)
        json_version.append(jsonable(trait_ob))

    if "json" in params:
        return json.dumps(json_version)
    collection_info = dict(
        trait_obs=trait_obs,
        uc=uc,
        heatmap_data_url=f"{GN_SERVER_URL}heatmaps/clustered")
    return render_template("collections/view.html", **collection_info)
def gen_covariates_file(this_dataset, covariates, samples):
    """Write a tab-delimited covariate file for the mapping tools.

    ``covariates`` is a comma-separated list of "trait:dataset" pairs.  One
    column is written per covariate, one row per sample in ``samples`` that
    belongs to ``this_dataset``'s group.  Returns the file's base name
    (without the .txt extension).
    """
    covariate_list = covariates.split(",")
    covariate_data_object = []
    for covariate in covariate_list:
        this_covariate_data = []
        trait_name = covariate.split(":")[0]
        dataset_name = covariate.split(":")[1]
        if dataset_name == "Temp":
            # Temp trait names embed their group name as the third "_" field.
            temp_group = trait_name.split("_")[2]
            dataset_ob = create_dataset(dataset_name="Temp",
                                        dataset_type="Temp",
                                        group_name=temp_group)
        else:
            dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        trait_sample_data = trait_ob.data
        for sample in trait_samples:
            if sample in samples:
                if sample in trait_sample_data:
                    this_covariate_data.append(
                        trait_sample_data[sample].value)
                else:
                    # -9 is the conventional missing-value marker for the
                    # downstream mapping tools.
                    this_covariate_data.append("-9")
        covariate_data_object.append(this_covariate_data)

    filename = "COVAR_" + generate_hash_of_string(
        this_dataset.name + str(covariate_data_object)).replace("/", "_")
    # BUG FIX: the output file must be named after the hash-based `filename`
    # returned to the caller; previously a literal placeholder name was
    # written, so callers could never locate the file they were told about.
    with open(f"{flat_files('mapping')}/{filename}.txt", "w") as outfile:
        for i in range(len(covariate_data_object[0])):
            for this_covariate in covariate_data_object:
                outfile.write(str(this_covariate[i]) + "\t")
            outfile.write("\n")
    return filename
def __init__(self, kw):
    """Run a search.

    ``kw`` is a dict of request parameters: either a quick search
    (``q`` plus ``trait_type``) or a full search (``search_terms_or`` /
    ``search_terms_and`` plus ``type`` and ``dataset``).
    """
    ###########################################
    # Names and IDs of group / F2 set
    ###########################################

    # All Phenotypes is a special case we'll deal with later
    #if kw['dataset'] == "All Phenotypes":
    #    self.cursor.execute("""
    #        select PublishFreeze.Name, InbredSet.Name, InbredSet.Id from PublishFreeze,
    #        InbredSet where PublishFreeze.Name not like 'BXD300%' and InbredSet.Id =
    #        PublishFreeze.InbredSetId""")
    #    results = self.cursor.fetchall()
    #    self.dataset = map(lambda x: DataSet(x[0], self.cursor), results)
    #    self.dataset_groups = map(lambda x: x[1], results)
    #    self.dataset_group_ids = map(lambda x: x[2], results)
    #else:
    self.quick = False
    self.uc_id = uuid.uuid4()  # unique id for this search
    print("uc_id:", self.uc_id)
    if 'q' in kw:
        # Quick search: free-text query over one trait type.
        self.results = {}
        self.quick = True
        self.search_terms = kw['q']
        print("self.search_terms is: ", self.search_terms)
        self.trait_type = kw['trait_type']
        self.quick_search()
    else:
        print("kw is:", kw)
        # "or" terms take precedence over "and" terms.
        if kw['search_terms_or']:
            self.and_or = "or"
            self.search_terms = kw['search_terms_or']
        else:
            self.and_or = "and"
            self.search_terms = kw['search_terms_and']
        self.search_term_exists = True
        self.results = []
        # Map the UI trait type onto the backend dataset type.
        if kw['type'] == "Phenotypes":
            dataset_type = "Publish"
        elif kw['type'] == "Genotypes":
            dataset_type = "Geno"
        else:
            dataset_type = "ProbeSet"
        self.dataset = create_dataset(kw['dataset'], dataset_type)
        print("KEYWORD:", self.search_terms)
        self.search()
        if self.search_term_exists:
            self.gen_search_result()
def __init__(self, get_qtl_info=False, get_sample_info=True, **kw):
    """Build a trait object.

    Exactly one of ``dataset`` (a dataset object) or ``dataset_name``
    must be supplied in ``kw``, along with the trait ``name``.  Unless
    the dataset is Temp, trait info (and optionally sample data) is
    retrieved immediately.
    """
    # xor assertion: exactly one of the two dataset keywords must be given
    assert bool(kw.get('dataset')) != bool(
        kw.get('dataset_name')), "Needs dataset ob. or name"
    if kw.get('dataset_name'):
        self.dataset = create_dataset(kw.get('dataset_name'))
    else:
        self.dataset = kw.get('dataset')
    self.name = kw.get('name')  # Trait ID, ProbeSet ID, Published ID, etc.
    self.cellid = kw.get('cellid')
    self.identification = kw.get('identification', 'un-named trait')
    self.haveinfo = kw.get('haveinfo', False)
    self.sequence = kw.get(
        'sequence')  # Blat sequence, available for ProbeSet
    self.data = kw.get('data', {})

    # Sets defaults
    self.locus = None
    self.lrs = None
    self.pvalue = None
    self.mean = None
    self.additive = None
    self.num_overlap = None
    self.strand_probe = None
    self.symbol = None

    self.LRS_score_repr = "N/A"
    self.LRS_score_value = 0
    self.LRS_location_repr = "N/A"
    self.LRS_location_value = 1000000

    if kw.get('fullname'):
        # BUG FIX: this previously split an undefined name `value`,
        # raising NameError whenever a fullname was supplied.
        name2 = kw['fullname'].split("::")
        if len(name2) == 2:
            self.dataset, self.name = name2
            # self.cellid is set to None above
        elif len(name2) == 3:
            self.dataset, self.name, self.cellid = name2

    # Todo: These two lines are necessary most of the time, but perhaps not all of the time
    # So we could add a simple if statement to short-circuit this if necessary
    if self.dataset.type != "Temp":
        self = retrieve_trait_info(
            self, self.dataset, get_qtl_info=get_qtl_info)
        if get_sample_info != False:
            self = retrieve_sample_data(self, self.dataset)
def run(dataset_name, vals, temp_uuid):
    """Generates p-values for each marker.

    ``vals`` are the trait's sample values; ``temp_uuid`` keys the
    TempData store used to report progress/results.
    """
    tempdata = temp_data.TempData(temp_uuid)

    dataset = data_set.create_dataset(dataset_name)
    # Kept despite the unused binding in case the constructor has side
    # effects on the dataset — TODO confirm and drop if not.
    species = TheSpecies(dataset=dataset)

    # BUG FIX (cleanup): removed a no-op `vals = vals` and a `samples`
    # list that was built from dataset.group.samplelist but never used.
    gen_data(dataset, vals, tempdata)
def calculate_pca(self, cols, corr_eigen_value, corr_eigen_vectors):
    """Run PCA (via R's princomp) on the pairwise correlation matrix and
    store each principal-component trait in Redis as a Temp trait.

    ``cols`` gives the matrix dimension; ``corr_eigen_vectors`` projects
    the z-scored trait data onto the components.  Returns the R princomp
    result object.
    """
    base = importr('base')
    stats = importr('stats')

    # Flatten the correlation matrix into an R matrix of size len(cols).
    corr_results_to_list = robjects.FloatVector(
        [item for sublist in self.pca_corr_results for item in sublist])

    m = robjects.r.matrix(corr_results_to_list, nrow=len(cols))
    eigen = base.eigen(m)
    pca = stats.princomp(m, cor="TRUE")
    self.loadings = pca.rx('loadings')
    self.scores = pca.rx('scores')
    self.scale = pca.rx('scale')

    # Project the z-scored trait data onto the eigenvectors; each row of
    # the product is one PCA trait.  Sign is flipped (* -1.0) for display.
    trait_array = zScore(self.trait_data_array)
    trait_array_vectors = np.dot(corr_eigen_vectors, trait_array)

    pca_traits = []
    for i, vector in enumerate(trait_array_vectors):
        #ZS: Check if below check is necessary
        #if corr_eigen_value[i-1] > 100.0/len(self.trait_list):
        pca_traits.append((vector * -1.0).tolist())

    this_group_name = self.trait_list[0][1].group.name
    temp_dataset = data_set.create_dataset(dataset_name="Temp",
                                           dataset_type="Temp",
                                           group_name=this_group_name)
    temp_dataset.group.get_samplelist()
    for i, pca_trait in enumerate(pca_traits):
        # Trait id encodes component number, species, group and timestamp.
        trait_id = "PCA" + str(
            i + 1
        ) + "_" + temp_dataset.group.species + "_" + this_group_name + "_" + datetime.datetime.now(
        ).strftime("%m%d%H%M%S")
        this_vals_string = ""
        position = 0
        # Values are serialized in the group's canonical sample order;
        # samples outside the shared list get the "x" missing marker.
        for sample in temp_dataset.group.all_samples_ordered():
            if sample in self.shared_samples_list:
                this_vals_string += str(pca_trait[position])
                this_vals_string += " "
                position += 1
            else:
                this_vals_string += "x "
        this_vals_string = this_vals_string[:-1]  # drop trailing space

        # Temp traits live in Redis for thirty days.
        Redis.set(trait_id, this_vals_string, ex=THIRTY_DAYS)
        self.pca_trait_ids.append(trait_id)

    return pca
def jsonable(trait):
    """Return a dict suitable for using as json

    Actual turning into json doesn't happen here though"""
    # Rebuild the dataset object from the identifying names stored on the
    # trait; the serialized fields below depend on the dataset's type.
    dataset = create_dataset(dataset_name = trait.dataset.name,
                             dataset_type = trait.dataset.type,
                             group_name = trait.dataset.group.name)
    if dataset.type == "ProbeSet":
        # mRNA expression trait: gene symbol, mean expression and
        # QTL (LRS) summary fields.
        return dict(name=trait.name,
                    symbol=trait.symbol,
                    dataset=dataset.name,
                    description=trait.description_display,
                    mean=trait.mean,
                    location=trait.location_repr,
                    lrs_score=trait.LRS_score_repr,
                    lrs_location=trait.LRS_location_repr,
                    additive=trait.additive
                    )
    elif dataset.type == "Publish":
        # Phenotype trait: publication fields.  The pubmed_link key is only
        # included when the trait actually has a PubMed id.
        if trait.pubmed_id:
            return dict(name=trait.name,
                        dataset=dataset.name,
                        description=trait.description_display,
                        authors=trait.authors,
                        pubmed_text=trait.pubmed_text,
                        pubmed_link=trait.pubmed_link,
                        lrs_score=trait.LRS_score_repr,
                        lrs_location=trait.LRS_location_repr,
                        additive=trait.additive
                        )
        else:
            return dict(name=trait.name,
                        dataset=dataset.name,
                        description=trait.description_display,
                        authors=trait.authors,
                        pubmed_text=trait.pubmed_text,
                        lrs_score=trait.LRS_score_repr,
                        lrs_location=trait.LRS_location_repr,
                        additive=trait.additive
                        )
    elif dataset.type == "Geno":
        # Genotype (marker) trait: only name and genomic location apply.
        return dict(name=trait.name,
                    dataset=dataset.name,
                    location=trait.location_repr
                    )
    else:
        # Unknown dataset type: nothing sensible to serialize.
        return dict()
def getSequence(trait, dataset_name):
    """Return the BLAT (probe) sequence for *trait* in *dataset_name*.

    Only ProbeSet datasets carry sequences; for any other dataset type
    the function implicitly returns None.
    """
    dataset = create_dataset(dataset_name)

    if dataset.type == 'ProbeSet':
        # BUG FIX: the WHERE clause was missing an AND between the
        # ProbeSet.Name and ProbeSetFreeze.Name conditions (SQL syntax
        # error), and referenced the misspelled column `ProbSetFreezeId`
        # (other queries in this file join on `ProbeSetFreezeId`).
        results = g.db.execute('''
                    SELECT ProbeSet.BlatSeq
                    FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef
                    WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId AND
                          ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId AND
                          ProbeSet.Name = %s AND
                          ProbeSetFreeze.Name = %s
                    ''', trait.name, dataset.name).fetchone()

        return results[0]
def add_cofactors(cross, this_dataset, covariates, samples):
    """Attach each covariate trait to the R/qtl cross as a phenotype column.

    ``covariates`` is a comma-separated list of "trait:dataset" pairs.
    Returns the (possibly replaced) cross object plus an R object holding
    the covariate columns, pulled out under the name "trait_covars".
    """
    ro.numpy2ri.activate()

    covariate_list = covariates.split(",")
    column_names = []
    for i, covariate in enumerate(covariate_list):
        trait_name = covariate.split(":")[0]
        dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = GeneralTrait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        trait_sample_data = trait_ob.data

        # Collect the covariate's value for every requested sample,
        # substituting R's NA for samples without data.
        values = []
        for sample in trait_samples:
            if sample in samples:
                if sample in trait_sample_data:
                    values.append(trait_sample_data[sample].value)
                else:
                    values.append("NA")

        # Serialize as an R vector literal, e.g. c(1.2,3.4,NA).
        covar_as_string = "c(" + ",".join(str(v) for v in values) + ")"

        col_name = "covar_" + str(i)
        cross = add_phenotype(cross, covar_as_string, col_name)
        column_names.append(col_name)

    # e.g. c("covar_0", "covar_1")
    covar_name_string = (
        "c(" + ", ".join('"' + n + '"' for n in column_names) + ")")
    covars_ob = pull_var("trait_covars", cross, covar_name_string)
    return cross, covars_ob
def __init__(self, kw): """This class gets invoked after hitting submit on the main menu (in views.py). """ ########################################### # Names and IDs of group / F2 set ########################################### # All Phenotypes is a special case we'll deal with later #if kw['dataset'] == "All Phenotypes": # self.cursor.execute(""" # select PublishFreeze.Name, InbredSet.Name, InbredSet.Id from PublishFreeze, # InbredSet where PublishFreeze.Name not like 'BXD300%' and InbredSet.Id = # PublishFreeze.InbredSetId""") # results = self.cursor.fetchall() # self.dataset = map(lambda x: DataSet(x[0], self.cursor), results) # self.dataset_groups = map(lambda x: x[1], results) # self.dataset_group_ids = map(lambda x: x[2], results) #else: self.uc_id = uuid.uuid4() logger.debug("uc_id:", self.uc_id) # contains a unique id logger.debug("kw is:", kw) # dict containing search terms if kw['search_terms_or']: self.and_or = "or" self.search_terms = kw['search_terms_or'] else: self.and_or = "and" self.search_terms = kw['search_terms_and'] self.search_term_exists = True self.results = [] if kw['type'] == "Phenotypes": # split datatype on type field dataset_type = "Publish" elif kw['type'] == "Genotypes": dataset_type = "Geno" else: dataset_type = "ProbeSet" # ProbeSet is default self.dataset = create_dataset(kw['dataset'], dataset_type) logger.debug("search_terms:", self.search_terms) self.search() if self.search_term_exists: self.gen_search_result()
def __init__(self, get_qtl_info=False, get_sample_info=True, **kw):
    """Build a trait object.

    Exactly one of ``dataset`` (a dataset object) or ``dataset_name``
    must be supplied in ``kw``, along with the trait ``name``.  Unless
    the dataset is Temp, trait info (and optionally sample data) is
    retrieved immediately.
    """
    # xor assertion: exactly one of the two dataset keywords must be given
    assert bool(kw.get('dataset')) != bool(
        kw.get('dataset_name')), "Needs dataset ob. or name"
    if kw.get('dataset_name'):
        self.dataset = create_dataset(kw.get('dataset_name'))
    else:
        self.dataset = kw.get('dataset')
    self.name = kw.get('name')  # Trait ID, ProbeSet ID, Published ID, etc.
    self.cellid = kw.get('cellid')
    self.identification = kw.get('identification', 'un-named trait')
    self.haveinfo = kw.get('haveinfo', False)
    self.sequence = kw.get('sequence')  # Blat sequence, available for ProbeSet
    self.data = kw.get('data', {})

    # Sets defaults
    self.locus = None
    self.lrs = None
    self.pvalue = None
    self.mean = None
    self.additive = None
    self.num_overlap = None
    self.strand_probe = None
    self.symbol = None

    self.LRS_score_repr = "N/A"
    self.LRS_score_value = 0
    self.LRS_location_repr = "N/A"
    self.LRS_location_value = 1000000

    if kw.get('fullname'):
        # BUG FIX: this previously split an undefined name `value`,
        # raising NameError whenever a fullname was supplied.
        name2 = kw['fullname'].split("::")
        if len(name2) == 2:
            self.dataset, self.name = name2
            # self.cellid is set to None above
        elif len(name2) == 3:
            self.dataset, self.name, self.cellid = name2

    # Todo: These two lines are necessary most of the time, but perhaps not all of the time
    # So we could add a simple if statement to short-circuit this if necessary
    if self.dataset.type != "Temp":
        self = retrieve_trait_info(
            self, self.dataset, get_qtl_info=get_qtl_info)
        if get_sample_info != False:
            self = retrieve_sample_data(self, self.dataset)
def get_export_metadata(trait_id, dataset_name):
    """Build the header rows for an exported trait file.

    Returns a list of single-element row lists (suitable for a CSV
    writer): publication details for Publish traits, record/symbol
    details otherwise, terminated by one empty row.
    """
    dataset = data_set.create_dataset(dataset_name)
    this_trait = create_trait(dataset=dataset,
                              name=trait_id,
                              cellid=None,
                              get_qtl_info=False)

    metadata = []
    if dataset.type == "Publish":
        metadata.append([f"Phenotype ID: {trait_id}"])
        metadata.append([
            f"Phenotype URL: http://genenetwork.org/show_trait?trait_id={trait_id}&dataset={dataset_name}"
        ])
        metadata.append([f"Group: {dataset.group.name}"])
        # Embedded commas are escaped so the description stays one CSV cell.
        metadata.append([
            "Phenotype: "
            + this_trait.description_display.replace(",", "\",\"")
        ])
        metadata.append(["Authors: " + (this_trait.authors or "N/A")])
        metadata.append(["Title: " + (this_trait.title or "N/A")])
        metadata.append(["Journal: " + (this_trait.journal or "N/A")])
        metadata.append([
            "Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName="
            + dataset.name
        ])
    else:
        metadata.append([f"Record ID: {trait_id}"])
        metadata.append([
            f"Trait URL: http://genenetwork.org/show_trait?trait_id={trait_id}&dataset={dataset_name}"
        ])
        if this_trait.symbol:
            metadata.append([f"Symbol: {this_trait.symbol}"])
        metadata.append([f"Dataset: {dataset.name}"])
        metadata.append([f"Group: {dataset.group.name}"])

    metadata.append([])
    return metadata
def getSequence(trait, dataset_name):
    """Return the BLAT (probe) sequence for *trait* in *dataset_name*.

    Only ProbeSet datasets carry sequences; for any other dataset type
    the function implicitly returns None.
    """
    dataset = create_dataset(dataset_name)

    if dataset.type == 'ProbeSet':
        # BUG FIX: the WHERE clause was missing an AND between the
        # ProbeSet.Name and ProbeSetFreeze.Name conditions (SQL syntax
        # error), and referenced the misspelled column `ProbSetFreezeId`
        # (other queries in this file join on `ProbeSetFreezeId`).
        results = g.db.execute(
            '''
                    SELECT ProbeSet.BlatSeq
                    FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef
                    WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId AND
                          ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId AND
                          ProbeSet.Name = %s AND
                          ProbeSetFreeze.Name = %s
            ''', trait.name, dataset.name).fetchone()

        return results[0]
def check_access_permissions():
    """Flask before-request hook: verify that the requester may view the
    dataset named in the query string.

    Redirects to the no-access page when the resolved permissions do not
    include "view"; otherwise returns None and the request proceeds.
    """
    logger.debug("@app.before_request check_access_permissions")
    available = True  # NOTE(review): never read again — looks vestigial
    if 'dataset' in request.args:
        # Default to full privileges; tighten below for real datasets.
        permissions = DEFAULT_PRIVILEGES
        if request.args['dataset'] != "Temp":
            dataset = create_dataset(request.args['dataset'])
            if dataset.type == "Temp":
                # Temp datasets are always viewable.
                permissions = DEFAULT_PRIVILEGES
            elif 'trait_id' in request.args:
                # Check access for the specific trait when one is given.
                permissions = check_resource_availability(
                    dataset, request.args['trait_id'])
            elif dataset.type != "Publish":
                # NOTE(review): a Publish dataset without a trait_id falls
                # through and keeps DEFAULT_PRIVILEGES — confirm intended.
                permissions = check_resource_availability(dataset)
        if 'view' not in permissions['data']:
            return redirect(url_for("no_access_page"))
def run_analysis(self, requestform):
    """Prepare an ePheWAS run for the trait/dataset given in ``requestform``.

    Locates the BXD genotype and tissue-aligner files, loads the
    genotypes to build the SNP marker map, and creates the R-side SNP
    and phenotype aligner objects.
    """
    print("Starting ePheWAS analysis on dataset")
    genofilelocation = locate(
        "BXD.geno", "genotype")  # Get the location of the BXD genotypes
    tissuealignerloc = locate(
        "Tissue_color_aligner.csv",
        "auwerx")  # Get the location of the Tissue_color_aligner

    # Get user parameters, trait_id and dataset, and store/update them in self
    self.trait_id = requestform["trait_id"]
    self.datasetname = requestform["dataset"]
    self.dataset = data_set.create_dataset(self.datasetname)

    # Print some debug
    # BUG FIX: these were Python 2 print statements — a SyntaxError under
    # Python 3, which the rest of this function already targets.
    print("self.trait_id:" + self.trait_id + "\n")
    print("self.datasetname:" + self.datasetname + "\n")
    print("self.dataset.type:" + self.dataset.type + "\n")

    # Load in the genotypes file *sigh* to make the markermap
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    snpinfo = []
    for marker in parser.markers:
        snpinfo.append(marker["name"])
        snpinfo.append(marker["chr"])
        snpinfo.append(marker["Mb"])

    rnames = r_seq(1, len(parser.markers))
    # Create the snp aligner object out of the BXD genotypes
    snpaligner = ro.r.matrix(snpinfo,
                             nrow=len(parser.markers),
                             dimnames=r_list(rnames, r_c("SNP", "Chr", "Pos")),
                             ncol=3,
                             byrow=True)

    # Create the phenotype aligner object using R
    phenoaligner = self.r_create_Pheno_aligner()
    print("Initialization of ePheWAS done !")
def __init__(self, start_vars):
    """Run a correlation search.

    ``start_vars`` carries the form parameters: the base trait/dataset,
    the target dataset, the correlation type (sample/tissue/lit) and
    method, plus optional expression and location filters.  Results end
    up in self.correlation_results / self.json_results.

    NOTE(review): this body still uses Python-2-only idioms
    (dict.iteritems(), slicing dict.keys()) — it will not run unmodified
    on Python 3.
    """
    # get trait list from db (database name)
    # calculate correlation with Base vector and targets

    # Check parameters
    assert('corr_type' in start_vars)
    assert(is_str(start_vars['corr_type']))
    assert('dataset' in start_vars)
    # assert('group' in start_vars) permitted to be empty?
    assert('corr_sample_method' in start_vars)
    assert('corr_samples_group' in start_vars)
    assert('corr_dataset' in start_vars)
    assert('min_expr' in start_vars)
    assert('corr_return_results' in start_vars)
    if 'loc_chr' in start_vars:
        assert('min_loc_mb' in start_vars)
        assert('max_loc_mb' in start_vars)

    with Bench("Doing correlations"):
        # Temp traits carry no dataset record; build one on the fly.
        if start_vars['dataset'] == "Temp":
            self.dataset = data_set.create_dataset(dataset_name = "Temp",
                                                   dataset_type = "Temp",
                                                   group_name = start_vars['group'])
            self.trait_id = "Temp"
            self.this_trait = GeneralTrait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
        else:
            helper_functions.get_species_dataset_trait(self, start_vars)

        self.dataset.group.read_genotype_file()

        corr_samples_group = start_vars['corr_samples_group']

        self.sample_data = {}
        self.corr_type = start_vars['corr_type']
        self.corr_method = start_vars['corr_sample_method']
        self.min_expr = get_float(start_vars,'min_expr')
        # Correlation-coefficient range filter, defaulting to [-1, 1].
        self.p_range_lower = get_float(start_vars,'p_range_lower',-1.0)
        self.p_range_upper = get_float(start_vars,'p_range_upper',1.0)

        # Optional genomic-location filter.
        # NOTE(review): when 'loc_chr' is absent these attributes are never
        # set, yet self.location_chr etc. are read further down — confirm
        # callers always provide the trio or add an else branch.
        if ('loc_chr' in start_vars and
                'min_loc_mb' in start_vars and
                'max_loc_mb' in start_vars):
            self.location_chr = get_string(start_vars,'loc_chr')
            self.min_location_mb = get_int(start_vars,'min_loc_mb')
            self.max_location_mb = get_int(start_vars,'max_loc_mb')

        self.get_formatted_corr_type()
        self.return_number = int(start_vars['corr_return_results'])

        #The two if statements below append samples to the sample list based upon whether the user
        #rselected Primary Samples Only, Other Samples Only, or All Samples
        primary_samples = self.dataset.group.samplelist
        if self.dataset.group.parlist != None:
            primary_samples += self.dataset.group.parlist
        if self.dataset.group.f1list != None:
            primary_samples += self.dataset.group.f1list

        #If either BXD/whatever Only or All Samples, append all of that group's samplelist
        if corr_samples_group != 'samples_other':
            self.process_samples(start_vars, primary_samples)

        #If either Non-BXD/whatever or All Samples, get all samples from this_trait.data and
        #exclude the primary samples (because they would have been added in the previous
        #if statement if the user selected All Samples)
        if corr_samples_group != 'samples_primary':
            if corr_samples_group == 'samples_other':
                primary_samples = [x for x in primary_samples if x not in (
                    self.dataset.group.parlist + self.dataset.group.f1list)]
            self.process_samples(start_vars,
                                 self.this_trait.data.keys(),
                                 primary_samples)

        self.target_dataset = data_set.create_dataset(start_vars['corr_dataset'])
        self.target_dataset.get_trait_data(self.sample_data.keys())

        self.correlation_results = []

        self.correlation_data = {}

        # Compute the per-trait correlation scores, by correlation type.
        if self.corr_type == "tissue":
            self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")
            tissue_corr_data = self.do_tissue_correlation_for_all_traits()
            if tissue_corr_data != None:
                # Only compute sample r/p for the top tissue correlates.
                for trait in tissue_corr_data.keys()[:self.return_number]:
                    self.get_sample_r_and_p_values(
                        trait, self.target_dataset.trait_data[trait])
            else:
                for trait, values in self.target_dataset.trait_data.iteritems():
                    self.get_sample_r_and_p_values(trait, values)
        elif self.corr_type == "lit":
            self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
            lit_corr_data = self.do_lit_correlation_for_all_traits()
            for trait in lit_corr_data.keys()[:self.return_number]:
                self.get_sample_r_and_p_values(
                    trait, self.target_dataset.trait_data[trait])
        elif self.corr_type == "sample":
            for trait, values in self.target_dataset.trait_data.iteritems():
                self.get_sample_r_and_p_values(trait, values)

        # Sort by absolute correlation, strongest first.
        self.correlation_data = collections.OrderedDict(
            sorted(self.correlation_data.items(),
                   key=lambda t: -abs(t[1][0])))

        if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
            #ZS: Convert min/max chromosome to an int for the location range option
            range_chr_as_int = None
            for order_id, chr_info in self.dataset.species.chromosomes.chromosomes.iteritems():
                if chr_info.name == self.location_chr:
                    range_chr_as_int = order_id

        # Build full trait objects for the top correlates and apply the
        # expression / location filters.
        for _trait_counter, trait in enumerate(self.correlation_data.keys()[:self.return_number]):
            trait_object = GeneralTrait(dataset=self.target_dataset,
                                        name=trait,
                                        get_qtl_info=True,
                                        get_sample_info=False)

            if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
                #ZS: Convert trait chromosome to an int for the location range option
                chr_as_int = 0
                for order_id, chr_info in self.dataset.species.chromosomes.chromosomes.iteritems():
                    if chr_info.name == trait_object.chr:
                        chr_as_int = order_id

            if (float(self.correlation_data[trait][0]) >= self.p_range_lower and
                    float(self.correlation_data[trait][0]) <= self.p_range_upper):
                if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
                    # Skip traits outside the requested expression /
                    # location windows.
                    if (self.min_expr != None) and (float(trait_object.mean) < self.min_expr):
                        continue
                    elif range_chr_as_int != None and (chr_as_int != range_chr_as_int):
                        continue
                    elif (self.min_location_mb != None) and (float(trait_object.mb) < float(self.min_location_mb)):
                        continue
                    elif (self.max_location_mb != None) and (float(trait_object.mb) > float(self.max_location_mb)):
                        continue

                    (trait_object.sample_r,
                     trait_object.sample_p,
                     trait_object.num_overlap) = self.correlation_data[trait]

                    # Set some sane defaults
                    trait_object.tissue_corr = 0
                    trait_object.tissue_pvalue = 0
                    trait_object.lit_corr = 0
                    if self.corr_type == "tissue" and tissue_corr_data != None:
                        trait_object.tissue_corr = tissue_corr_data[trait][1]
                        trait_object.tissue_pvalue = tissue_corr_data[trait][2]
                    elif self.corr_type == "lit":
                        trait_object.lit_corr = lit_corr_data[trait][1]
                    self.correlation_results.append(trait_object)
                else:
                    (trait_object.sample_r,
                     trait_object.sample_p,
                     trait_object.num_overlap) = self.correlation_data[trait]

                    # Set some sane defaults
                    trait_object.tissue_corr = 0
                    trait_object.tissue_pvalue = 0
                    trait_object.lit_corr = 0
                    if self.corr_type == "tissue":
                        trait_object.tissue_corr = tissue_corr_data[trait][1]
                        trait_object.tissue_pvalue = tissue_corr_data[trait][2]
                    elif self.corr_type == "lit":
                        trait_object.lit_corr = lit_corr_data[trait][1]
                    self.correlation_results.append(trait_object)

        self.target_dataset.get_trait_info(self.correlation_results,
                                           self.target_dataset.group.species)

        # Fill in whichever of lit/tissue correlations was not the primary
        # type (ProbeSet-to-ProbeSet comparisons only).
        if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_lit_correlation_for_trait_list()

        if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_tissue_correlation_for_trait_list()

        self.json_results = generate_corr_json(self.correlation_results,
                                               self.this_trait,
                                               self.dataset,
                                               self.target_dataset)
def run_analysis(self, requestform):
    """Run the PheWAS analysis for the trait/dataset in ``requestform``.

    Loads the BXD genotypes and pre-computed EMMA p-values, builds the
    R-side SNP/phenotype aligner objects, and calls the R PheWAS
    Manhattan-plot routine; results (image names and tabular data) are
    stored in self.results.
    """
    logger.info("Starting PheWAS analysis on dataset")
    genofilelocation = locate("BXD.geno", "genotype")  # Get the location of the BXD genotypes
    precompfile = locate_phewas("PheWAS_pval_EMMA_norm.RData", "auwerx")  # Get the location of the pre-computed EMMA results

    # Get user parameters, trait_id and dataset, and store/update them in self
    self.trait_id = requestform["trait_id"]
    self.datasetname = requestform["dataset"]
    self.dataset = data_set.create_dataset(self.datasetname)
    self.region = int(requestform["num_region"])  # window half-width, Mb
    self.mtadjust = str(requestform["sel_mtadjust"])  # multiple-testing adjustment

    # Logger.Info some debug
    logger.info("self.trait_id:" + self.trait_id + "\n")
    logger.info("self.datasetname:" + self.datasetname + "\n")
    logger.info("self.dataset.type:" + self.dataset.type + "\n")

    # GN Magic ?
    self.this_trait = GeneralTrait(dataset=self.dataset, name = self.trait_id, get_qtl_info = False, get_sample_info=False)
    logger.info(vars(self.this_trait))

    # Set the values we need
    self.chr = str(self.this_trait.chr);
    self.mb = int(self.this_trait.mb);

    # logger.info some debug
    logger.info("location:" + self.chr + ":" + str(self.mb) + "+/-" + str(self.region) + "\n")

    # Load in the genotypes file *sigh* to make the markermap
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    snpinfo = []
    for marker in parser.markers:
        snpinfo.append(marker["name"]);
        snpinfo.append(marker["chr"]);
        snpinfo.append(marker["Mb"]);

    rnames = r_seq(1, len(parser.markers))
    # Create the snp aligner object out of the BXD genotypes
    snpaligner = ro.r.matrix(snpinfo, nrow=len(parser.markers), dimnames = r_list(rnames, r_c("SNP", "Chr", "Pos")), ncol = 3, byrow=True)

    # Create the phenotype aligner object using R
    phenoaligner = self.r_create_Pheno_aligner()

    self.results = {}
    self.results['imgurl1'] = webqtlUtil.genRandStr("phewas_") + ".png"
    self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']
    self.results['mtadjust'] = self.mtadjust
    # NOTE(review): stdlib logging would reject the extra positional arg
    # without a %s placeholder — presumably a project logger; confirm.
    logger.info("IMAGE AT:", self.results['imgurl1'] )
    logger.info("IMAGE AT:", self.results['imgloc1'] )

    # Create the PheWAS plot (The gene/probe name, chromosome and gene/probe positions should come from the user input)
    # TODO: generate the PDF in the temp folder, with a unique name
    assert(precompfile)
    assert(phenoaligner)
    assert(snpaligner)
    phewasres = self.r_PheWASManhattan("Test", precompfile, phenoaligner, snpaligner, "None", self.chr, self.mb, self.region, self.results['imgloc1'] , self.mtadjust)
    self.results['phewas1'] = phewasres[0]
    self.results['phewas2'] = phewasres[1]
    self.results['tabulardata'] = phewasres[2]
    self.results['R_debuglog'] = phewasres[3]

    #self.r_PheWASManhattan(allpvalues)
    #self.r_Stop()

    logger.info("Initialization of PheWAS done !")
def __init__(self, params):
    """Correlation scatterplot between two traits.

    Builds both traits, intersects their sample sets, fits raw
    (Pearson-style) and rank (Spearman-style) regressions, and stores
    everything the JS plot needs in self.js_data.
    """
    self.data_set_1 = data_set.create_dataset(params['dataset_1'])
    self.data_set_2 = data_set.create_dataset(params['dataset_2'])
    self.trait_1 = GeneralTrait(name=params['trait_1'],
                                dataset=self.data_set_1)
    self.trait_2 = GeneralTrait(name=params['trait_2'],
                                dataset=self.data_set_2)

    # Display options fall back to defaults when absent or malformed.
    # BUG FIX: the original bare `except:` clauses also swallowed
    # KeyboardInterrupt/SystemExit; catch only the expected failures.
    try:
        width = int(params['width'])
    except (KeyError, ValueError, TypeError):
        width = 800
    self.width = width

    try:
        height = int(params['height'])
    except (KeyError, ValueError, TypeError):
        height = 600
    self.height = height

    try:
        circle_color = params['circle_color']
    except KeyError:
        circle_color = '#3D85C6'
    self.circle_color = circle_color

    try:
        circle_radius = int(params['circle_radius'])
    except (KeyError, ValueError, TypeError):
        circle_radius = 5
    self.circle_radius = circle_radius

    try:
        line_color = params['line_color']
    except KeyError:
        line_color = '#FF0000'
    self.line_color = line_color

    try:
        line_width = int(params['line_width'])
    except (KeyError, ValueError, TypeError):
        line_width = 1
    self.line_width = line_width

    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(
        self.trait_1.data, self.trait_2.data)

    self.data = []
    # BUG FIX: materialize the keys; a dict_keys view is not
    # JSON-serializable when js_data is handed to the template (Python 3).
    self.indIDs = list(samples_1.keys())
    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    self.data.append(vals_1)
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]
    self.data.append(vals_2)

    # Regression on the raw values...
    x = np.array(vals_1)
    y = np.array(vals_2)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    # ...and on the ranks (Spearman-style).
    rx = stats.rankdata(x)
    ry = stats.rankdata(y)
    self.rdata = []
    self.rdata.append(rx.tolist())
    self.rdata.append(ry.tolist())
    srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(rx, ry)

    self.js_data = dict(
        data=self.data,
        rdata=self.rdata,
        indIDs=self.indIDs,
        trait_1=self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2=self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1=samples_1,
        samples_2=samples_2,
        num_overlap=num_overlap,
        vals_1=vals_1,
        vals_2=vals_2,
        slope=slope,
        intercept=intercept,
        r_value=r_value,
        p_value=p_value,
        srslope=srslope,
        srintercept=srintercept,
        srr_value=srr_value,
        srp_value=srp_value,
        width=width,
        height=height,
        circle_color=circle_color,
        circle_radius=circle_radius,
        line_color=line_color,
        line_width=line_width)
    self.jsdata = self.js_data
def __init__(self, kw):
    """API-style search over genes or phenotypes.

    ``kw['type']`` selects the query ("gene" or "phenotype");
    ``kw['terms']`` carries the user's search terms.  Matching rows are
    turned into trait objects on self.trait_list and serialized into
    self.results.

    NOTE(review)/SECURITY: ``self.terms`` is interpolated directly into
    the SQL with `%` formatting — this is a SQL-injection vector for
    user-supplied search terms; it should be passed as a bound parameter
    to g.db.execute instead.
    """
    self.type = kw['type']
    self.terms = kw['terms']
    #self.row_range = kw['row_range']

    if self.type == "gene":
        # Full-text search over ProbeSet metadata across all public
        # expression datasets, capped at 6000 rows.
        sql = """
            SELECT
            Species.`Name` AS species_name,
            InbredSet.`Name` AS inbredset_name,
            Tissue.`Name` AS tissue_name,
            ProbeSetFreeze.Name AS probesetfreeze_name,
            ProbeSet.Name AS probeset_name,
            ProbeSet.Symbol AS probeset_symbol,
            ProbeSet.`description` AS probeset_description,
            ProbeSet.Chr AS chr,
            ProbeSet.Mb AS mb,
            ProbeSetXRef.Mean AS mean,
            ProbeSetXRef.LRS AS lrs,
            ProbeSetXRef.`Locus` AS locus,
            ProbeSetXRef.`pValue` AS pvalue,
            ProbeSetXRef.`additive` AS additive
            FROM Species, InbredSet, ProbeSetXRef, ProbeSet, ProbeFreeze, ProbeSetFreeze, Tissue
            WHERE InbredSet.`SpeciesId`=Species.`Id`
            AND ProbeFreeze.InbredSetId=InbredSet.`Id`
            AND ProbeFreeze.`TissueId`=Tissue.`Id`
            AND ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
            AND ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,alias,GenbankId, UniGeneId, Probe_Target_Description) AGAINST ('%s' IN BOOLEAN MODE) )
            AND ProbeSet.Id = ProbeSetXRef.ProbeSetId
            AND ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
            AND ProbeSetFreeze.public > 0
            ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
            LIMIT 6000
            """ % (self.terms)
        with Bench("Running query"):
            logger.sql(sql)
            re = g.db.execute(sql).fetchall()
        self.trait_list = []
        with Bench("Creating trait objects"):
            for line in re:
                # line[3] = dataset name, line[4] = probeset/trait name
                dataset = create_dataset(line[3], "ProbeSet", get_samplelist=False)
                trait_id = line[4]
                #with Bench("Building trait object"):
                this_trait = GeneralTrait(dataset=dataset, name=trait_id, get_qtl_info=True, get_sample_info=False)
                self.trait_list.append(this_trait)
    elif self.type == "phenotype":
        # Word-boundary regex search over phenotype and publication
        # fields, capped at 6000 rows.
        sql = """
            SELECT
            Species.`Name`,
            InbredSet.`Name`,
            PublishFreeze.`Name`,
            PublishXRef.`Id`,
            Phenotype.`Post_publication_description`,
            Publication.`Authors`,
            Publication.`Year`,
            PublishXRef.`LRS`,
            PublishXRef.`Locus`,
            PublishXRef.`additive`
            FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
            WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
            AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
            AND InbredSet.`SpeciesId`=Species.`Id`
            AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
            AND PublishXRef.`PublicationId`=Publication.`Id`
            AND (Phenotype.Post_publication_description REGEXP "[[:<:]]%s[[:>:]]"
                OR Phenotype.Pre_publication_description REGEXP "[[:<:]]%s[[:>:]]"
                OR Phenotype.Pre_publication_abbreviation REGEXP "[[:<:]]%s[[:>:]]"
                OR Phenotype.Post_publication_abbreviation REGEXP "[[:<:]]%s[[:>:]]"
                OR Phenotype.Lab_code REGEXP "[[:<:]]%s[[:>:]]"
                OR Publication.PubMed_ID REGEXP "[[:<:]]%s[[:>:]]"
                OR Publication.Abstract REGEXP "[[:<:]]%s[[:>:]]"
                OR Publication.Title REGEXP "[[:<:]]%s[[:>:]]"
                OR Publication.Authors REGEXP "[[:<:]]%s[[:>:]]"
                OR PublishXRef.Id REGEXP "[[:<:]]%s[[:>:]]")
            ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
            LIMIT 6000
            """ % (self.terms, self.terms, self.terms, self.terms, self.terms,
                   self.terms, self.terms, self.terms, self.terms, self.terms)
        logger.sql(sql)
        re = g.db.execute(sql).fetchall()
        self.trait_list = []
        with Bench("Creating trait objects"):
            for line in re:
                # line[2] = publish dataset name, line[3] = record id
                dataset = create_dataset(line[2], "Publish")
                trait_id = line[3]
                this_trait = GeneralTrait(dataset=dataset, name=trait_id, get_qtl_info=True, get_sample_info=False)
                self.trait_list.append(this_trait)

    self.results = self.convert_to_json()
def __init__(self, params):
    """Correlation scatterplot between two traits (no display options).

    Builds both traits, intersects their sample sets, fits raw and
    rank-based regressions, and stores everything the JS plot needs
    in self.js_data.
    """
    self.data_set_1 = data_set.create_dataset(params['dataset_1'])
    self.data_set_2 = data_set.create_dataset(params['dataset_2'])
    self.trait_1 = GeneralTrait(name=params['trait_1'],
                                dataset=self.data_set_1)
    self.trait_2 = GeneralTrait(name=params['trait_2'],
                                dataset=self.data_set_2)

    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(
        self.trait_1.data, self.trait_2.data)

    self.data = []
    # BUG FIX: materialize the keys; a dict_keys view is not
    # JSON-serializable when js_data is handed to the template (Python 3).
    self.indIDs = list(samples_1.keys())
    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    self.data.append(vals_1)
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]
    self.data.append(vals_2)

    # Regression on the raw values...
    x = np.array(vals_1)
    y = np.array(vals_2)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    # ...and on the ranks (Spearman-style).
    rx = stats.rankdata(x)
    ry = stats.rankdata(y)
    self.rdata = []
    self.rdata.append(rx.tolist())
    self.rdata.append(ry.tolist())
    srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(rx, ry)

    self.js_data = dict(
        data=self.data,
        rdata=self.rdata,
        indIDs=self.indIDs,
        trait_1=self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2=self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1=samples_1,
        samples_2=samples_2,
        num_overlap=num_overlap,
        vals_1=vals_1,
        vals_2=vals_2,
        slope=slope,
        intercept=intercept,
        r_value=r_value,
        p_value=p_value,
        srslope=srslope,
        srintercept=srintercept,
        srr_value=srr_value,
        srp_value=srp_value)
    self.jsdata = self.js_data
def __init__(self, kw):
    """Quick-search backend: run one raw SQL query per search type and
    materialize the hits as GeneralTrait objects in self.trait_list.

    kw['type'] selects the branch ("gene" or "phenotype");
    kw['terms'] is the user-entered search string.

    NOTE(review): self.terms is interpolated straight into the SQL below
    via %-formatting with no escaping — it is user-controlled input, so
    this is SQL-injectable and should be parameterized.  Left unchanged
    here because the REGEXP/MATCH patterns embed the placeholder inside
    string literals; a safe rewrite needs careful verification.
    """
    self.type = kw['type']
    self.terms = kw['terms']
    if self.type == "gene":
        # Full-text search over probeset name/description/symbol/aliases
        sql = """
            SELECT
            Species.`Name` AS species_name,
            InbredSet.`Name` AS inbredset_name,
            Tissue.`Name` AS tissue_name,
            ProbeSetFreeze.Name AS probesetfreeze_name,
            ProbeSet.Name AS probeset_name,
            ProbeSet.Symbol AS probeset_symbol,
            ProbeSet.`description` AS probeset_description,
            ProbeSet.Chr AS chr,
            ProbeSet.Mb AS mb,
            ProbeSetXRef.Mean AS mean,
            ProbeSetXRef.LRS AS lrs,
            ProbeSetXRef.`Locus` AS locus,
            ProbeSetXRef.`pValue` AS pvalue,
            ProbeSetXRef.`additive` AS additive
            FROM Species, InbredSet, ProbeSetXRef, ProbeSet, ProbeFreeze, ProbeSetFreeze, Tissue
            WHERE InbredSet.`SpeciesId`=Species.`Id`
            AND ProbeFreeze.InbredSetId=InbredSet.`Id`
            AND ProbeFreeze.`TissueId`=Tissue.`Id`
            AND ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
            AND ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,alias,GenbankId, UniGeneId, Probe_Target_Description) AGAINST ('%s' IN BOOLEAN MODE) )
            AND ProbeSet.Id = ProbeSetXRef.ProbeSetId
            AND ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
            AND ProbeSetFreeze.public > 0
            ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
            LIMIT 5000
            """ % (self.terms)
        re = g.db.execute(sql).fetchall()  # NOTE: `re` shadows the stdlib re module
        self.trait_list = []
        for line in re:
            # line[3] = ProbeSetFreeze.Name (dataset), line[4] = ProbeSet.Name (trait id)
            dataset = create_dataset(line[3], "ProbeSet")
            trait_id = line[4]
            this_trait = GeneralTrait(dataset=dataset, name=trait_id, get_qtl_info=True)
            self.trait_list.append(this_trait)
            species = webqtlDatabaseFunction.retrieve_species(dataset.group.name)
            dataset.get_trait_info([this_trait], species)
    elif self.type == "phenotype":
        # Word-boundary REGEXP search across phenotype/publication fields;
        # the same term is substituted into all ten REGEXP patterns.
        sql = """
            SELECT
            Species.`Name`,
            InbredSet.`Name`,
            PublishFreeze.`Name`,
            PublishXRef.`Id`,
            Phenotype.`Post_publication_description`,
            Publication.`Authors`,
            Publication.`Year`,
            PublishXRef.`LRS`,
            PublishXRef.`Locus`,
            PublishXRef.`additive`
            FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
            WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
            AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
            AND InbredSet.`SpeciesId`=Species.`Id`
            AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
            AND PublishXRef.`PublicationId`=Publication.`Id`
            AND (Phenotype.Post_publication_description REGEXP "[[:<:]]%s[[:>:]]"
            OR Phenotype.Pre_publication_description REGEXP "[[:<:]]%s[[:>:]]"
            OR Phenotype.Pre_publication_abbreviation REGEXP "[[:<:]]%s[[:>:]]"
            OR Phenotype.Post_publication_abbreviation REGEXP "[[:<:]]%s[[:>:]]"
            OR Phenotype.Lab_code REGEXP "[[:<:]]%s[[:>:]]"
            OR Publication.PubMed_ID REGEXP "[[:<:]]%s[[:>:]]"
            OR Publication.Abstract REGEXP "[[:<:]]%s[[:>:]]"
            OR Publication.Title REGEXP "[[:<:]]%s[[:>:]]"
            OR Publication.Authors REGEXP "[[:<:]]%s[[:>:]]"
            OR PublishXRef.Id REGEXP "[[:<:]]%s[[:>:]]")
            ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
            LIMIT 5000
            """ % (self.terms, self.terms, self.terms, self.terms, self.terms,
                   self.terms, self.terms, self.terms, self.terms, self.terms)
        re = g.db.execute(sql).fetchall()  # NOTE: `re` shadows the stdlib re module
        self.trait_list = []
        for line in re:
            # line[2] = PublishFreeze.Name (dataset), line[3] = PublishXRef.Id (trait id)
            dataset = create_dataset(line[2], "Publish")
            trait_id = line[3]
            this_trait = GeneralTrait(dataset=dataset, name=trait_id, get_qtl_info=True)
            self.trait_list.append(this_trait)
            species = webqtlDatabaseFunction.retrieve_species(dataset.group.name)
            dataset.get_trait_info([this_trait], species)
def __init__(self, kw):
    """Set up the trait-display page: resolve the trait (DB-backed or
    temporary), build BLAT verification links, find the nearest marker
    for composite mapping, and assemble the hidden-form dict plus the
    js_data blob consumed by the Jinja2 template.
    """
    logger.debug("in ShowTrait, kw are:", kw)

    if 'trait_id' in kw and kw['dataset'] != "Temp":
        # Normal, database-backed trait
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    elif 'group' in kw:
        # Freshly pasted temp trait: synthesize a timestamped id
        self.temp_trait = True
        self.trait_id = "Temp_"+kw['species']+ "_" + kw['group'] + "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
        self.temp_species = kw['species']
        self.temp_group = kw['group']
        self.dataset = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset, name=self.trait_id, cellid=None)
        self.trait_vals = kw['trait_paste'].split()
        # Put values in Redis so they can be looked up later if added to a collection
        Redis.set(self.trait_id, kw['trait_paste'])
    else:
        # Existing temp trait: species/group are encoded in the id
        # ("Temp_<species>_<group>_<timestamp>"); values come from Redis.
        self.temp_trait = True
        self.trait_id = kw['trait_id']
        self.temp_species = self.trait_id.split("_")[1]
        self.temp_group = self.trait_id.split("_")[2]
        self.dataset = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset, name=self.trait_id, cellid=None)
        self.trait_vals = Redis.get(self.trait_id).split()

    #self.dataset.group.read_genotype_file()
    #if this_trait:
    #    if this_trait.dataset and this_trait.dataset.type and this_trait.dataset.type == 'ProbeSet':
    #        self.cursor.execute("SELECT h2 from ProbeSetXRef WHERE DataId = %d" %
    #                            this_trait.mysqlid)
    #        heritability = self.cursor.fetchone()

    #ZS: Get verify/rna-seq link URLs
    # Bare except below: any failure (no blatseq attribute, DB error, …)
    # simply disables the BLAT links rather than breaking the page.
    try:
        blatsequence = self.this_trait.blatseq
        if not blatsequence:
            #XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
            query1 = """SELECT Probe.Sequence, Probe.Name
                        FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef
                        WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
                              ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
                              ProbeSetFreeze.Name = '%s' AND
                              ProbeSet.Name = '%s' AND
                              Probe.ProbeSetId = ProbeSet.Id order by Probe.SerialOrder""" % (self.this_trait.dataset.name, self.this_trait.name)
            seqs = g.db.execute(query1).fetchall()
            if not seqs:
                raise ValueError
            else:
                blatsequence = ''
                for seqt in seqs:
                    # Only probes whose name ends in an odd digit
                    if int(seqt[1][-1]) % 2 == 1:
                        blatsequence += string.strip(seqt[0])  # NOTE: legacy Python-2 string module

        #--------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe
        # '%3E' / '%0A' are URL-encoded '>' and newline (FASTA format in a URL)
        blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A'
        #XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
        query2 = """SELECT Probe.Sequence, Probe.Name
                    FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef
                    WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
                          ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
                          ProbeSetFreeze.Name = '%s' AND
                          ProbeSet.Name = '%s' AND
                          Probe.ProbeSetId = ProbeSet.Id order by Probe.SerialOrder""" % (self.this_trait.dataset.name, self.this_trait.name)
        seqs = g.db.execute(query2).fetchall()
        for seqt in seqs:
            if int(seqt[1][-1]) %2 == 1:
                blatsequence += '%3EProbe_' + string.strip(seqt[1]) + '%0A' + string.strip(seqt[0]) + '%0A'

        # Per-species BLAT target genome assemblies
        if self.dataset.group.species == "rat":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('rat', 'rn3', blatsequence)
            self.UTHSC_BLAT_URL = ""
        elif self.dataset.group.species == "mouse":
            self.UCSC_BLAT_URL = webqtlConfig.UTHSC_BLAT2 % ('mouse', 'mm10', blatsequence)
            self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % ('mouse', 'mm10', blatsequence)
        elif self.dataset.group.species == "human":
            self.UCSC_BLAT_URL = webqtlConfig.UTHSC_BLAT2 % ('human', 'hg19', blatsequence)
            self.UTHSC_BLAT_URL = ""
        else:
            self.UCSC_BLAT_URL = ""
            self.UTHSC_BLAT_URL = ""
    except:
        self.UCSC_BLAT_URL = ""
        self.UTHSC_BLAT_URL = ""

    self.build_correlation_tools()

    #Get nearest marker for composite mapping
    if not self.temp_trait:
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            self.nearest_marker = get_nearest_marker(self.this_trait, self.dataset)
            #self.nearest_marker1 = get_nearest_marker(self.this_trait, self.dataset)[0]
            #self.nearest_marker2 = get_nearest_marker(self.this_trait, self.dataset)[1]
        else:
            self.nearest_marker = ""
            #self.nearest_marker1 = ""
            #self.nearest_marker2 = ""

    self.make_sample_lists()

    # Hidden form fields passed through to the mapping pages.
    # Todo: Add back in the ones we actually need from below, as we discover we need them
    hddn = OrderedDict()
    if self.dataset.group.allsamples:
        hddn['allsamples'] = string.join(self.dataset.group.allsamples, ' ')  # NOTE: legacy Python-2 string module
    hddn['trait_id'] = self.trait_id
    hddn['dataset'] = self.dataset.name
    hddn['temp_trait'] = False
    if self.temp_trait:
        hddn['temp_trait'] = True
        hddn['group'] = self.temp_group
        hddn['species'] = self.temp_species
    hddn['use_outliers'] = False
    hddn['method'] = "pylmm"
    hddn['mapping_display_all'] = True
    hddn['suggestive'] = 0
    hddn['num_perm'] = 0
    hddn['manhattan_plot'] = ""
    hddn['control_marker'] = ""
    if not self.temp_trait:
        # Same condition as the nearest-marker block above
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            hddn['control_marker'] = self.nearest_marker
            #hddn['control_marker'] = self.nearest_marker1+","+self.nearest_marker2
    hddn['do_control'] = False
    hddn['maf'] = 0.01
    hddn['compare_traits'] = []
    hddn['export_data'] = ""

    # We'll need access to this_trait and hddn in the Jinja2 Template, so we put it inside self
    self.hddn = hddn

    self.temp_uuid = uuid.uuid4()

    # Labels for each sample group shown on the page
    self.sample_group_types = OrderedDict()
    if len(self.sample_groups) > 1:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
        self.sample_group_types['samples_other'] = "Other"
        self.sample_group_types['samples_all'] = "All"
    else:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
    sample_lists = [group.sample_list for group in self.sample_groups]

    self.get_mapping_methods()

    self.stats_table_width, self.trait_table_width = get_table_widths(self.sample_groups)

    trait_symbol = None
    if not self.temp_trait:
        if self.this_trait.symbol:
            trait_symbol = self.this_trait.symbol

    # Serialized for client-side JavaScript by the template
    js_data = dict(trait_id = self.trait_id,
                   trait_symbol = trait_symbol,
                   dataset_type = self.dataset.type,
                   data_scale = self.dataset.data_scale,
                   sample_group_types = self.sample_group_types,
                   sample_lists = sample_lists,
                   attribute_names = self.sample_groups[0].attributes,
                   temp_uuid = self.temp_uuid)
    self.js_data = js_data
def __init__(self, kw):
    """Set up the trait-display page (variant without BLAT-link building):
    resolve the trait (DB-backed or temporary), find the nearest marker
    for composite mapping, and assemble the hidden-form dict plus the
    js_data blob consumed by the Jinja2 template.
    """
    logger.debug("in ShowTrait, kw are:", kw)

    if 'trait_id' in kw and kw['dataset'] != "Temp":
        # Normal, database-backed trait
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    elif 'group' in kw:
        # Freshly pasted temp trait: synthesize a timestamped id
        self.temp_trait = True
        self.trait_id = "Temp_"+kw['species']+ "_" + kw['group'] + "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
        self.temp_species = kw['species']
        self.temp_group = kw['group']
        self.dataset = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset, name=self.trait_id, cellid=None)
        self.trait_vals = kw['trait_paste'].split()
        # Put values in Redis so they can be looked up later if added to a collection
        Redis.set(self.trait_id, kw['trait_paste'])
    else:
        # Existing temp trait: species/group are encoded in the id
        # ("Temp_<species>_<group>_<timestamp>"); values come from Redis.
        self.temp_trait = True
        self.trait_id = kw['trait_id']
        self.temp_species = self.trait_id.split("_")[1]
        self.temp_group = self.trait_id.split("_")[2]
        self.dataset = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset, name=self.trait_id, cellid=None)
        self.trait_vals = Redis.get(self.trait_id).split()

    #self.dataset.group.read_genotype_file()
    #if this_trait:
    #    if this_trait.dataset and this_trait.dataset.type and this_trait.dataset.type == 'ProbeSet':
    #        self.cursor.execute("SELECT h2 from ProbeSetXRef WHERE DataId = %d" %
    #                            this_trait.mysqlid)
    #        heritability = self.cursor.fetchone()

    self.build_correlation_tools()

    #Get nearest marker for composite mapping
    if not self.temp_trait:
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            self.nearest_marker = get_nearest_marker(self.this_trait, self.dataset)
            #self.nearest_marker1 = get_nearest_marker(self.this_trait, self.dataset)[0]
            #self.nearest_marker2 = get_nearest_marker(self.this_trait, self.dataset)[1]
        else:
            self.nearest_marker = ""
            #self.nearest_marker1 = ""
            #self.nearest_marker2 = ""

    self.make_sample_lists()

    # Hidden form fields passed through to the mapping pages.
    # Todo: Add back in the ones we actually need from below, as we discover we need them
    hddn = OrderedDict()
    if self.dataset.group.allsamples:
        hddn['allsamples'] = string.join(self.dataset.group.allsamples, ' ')  # NOTE: legacy Python-2 string module
    hddn['trait_id'] = self.trait_id
    hddn['dataset'] = self.dataset.name
    hddn['temp_trait'] = False
    if self.temp_trait:
        hddn['temp_trait'] = True
        hddn['group'] = self.temp_group
        hddn['species'] = self.temp_species
    hddn['use_outliers'] = False
    hddn['method'] = "pylmm"
    hddn['mapping_display_all'] = True
    hddn['suggestive'] = 0
    hddn['num_perm'] = 0
    hddn['manhattan_plot'] = ""
    hddn['control_marker'] = ""
    if not self.temp_trait:
        # Same condition as the nearest-marker block above
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            hddn['control_marker'] = self.nearest_marker
            #hddn['control_marker'] = self.nearest_marker1+","+self.nearest_marker2
    hddn['do_control'] = False
    hddn['maf'] = 0.01
    hddn['compare_traits'] = []
    hddn['export_data'] = ""

    # We'll need access to this_trait and hddn in the Jinja2 Template, so we put it inside self
    self.hddn = hddn

    self.temp_uuid = uuid.uuid4()

    # Labels for each sample group shown on the page
    self.sample_group_types = OrderedDict()
    if len(self.sample_groups) > 1:
        self.sample_group_types['samples_primary'] = self.dataset.group.name + " Only"
        self.sample_group_types['samples_other'] = "Non-" + self.dataset.group.name
        self.sample_group_types['samples_all'] = "All Cases"
    else:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
    sample_lists = [group.sample_list for group in self.sample_groups]

    self.get_mapping_methods()

    self.trait_table_width = get_trait_table_width(self.sample_groups)

    trait_symbol = None
    if not self.temp_trait:
        if self.this_trait.symbol:
            trait_symbol = self.this_trait.symbol

    # Serialized for client-side JavaScript by the template
    js_data = dict(trait_id = self.trait_id,
                   trait_symbol = trait_symbol,
                   dataset_type = self.dataset.type,
                   data_scale = self.dataset.data_scale,
                   sample_group_types = self.sample_group_types,
                   sample_lists = sample_lists,
                   attribute_names = self.sample_groups[0].attributes,
                   temp_uuid = self.temp_uuid)
    self.js_data = js_data
def run_analysis(self, requestform):
    """Run a CTL (correlated trait locus) scan via R on the traits listed
    in the request form, render line/CTL plots to PNG files, and build the
    cytoscape node/edge graph of significant interactions.

    requestform keys used: trait_list (comma-separated "trait:dataset"
    pairs), strategy, nperm, parametric, significance.
    """
    print("Starting CTL analysis on dataset")
    # Parse the trait list and drop empty entries
    self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')]
    self.trait_db_list = [x for x in self.trait_db_list if x]

    print("strategy:", requestform.get("strategy"))
    strategy = requestform.get("strategy")
    print("nperm:", requestform.get("nperm"))
    nperm = int(requestform.get("nperm"))
    print("parametric:", requestform.get("parametric"))
    parametric = bool(requestform.get("parametric"))
    print("significance:", requestform.get("significance"))
    significance = float(requestform.get("significance"))

    # Get the name of the .geno file belonging to the first phenotype
    datasetname = self.trait_db_list[0].split(":")[1]
    dataset = data_set.create_dataset(datasetname)

    genofilelocation = locate(dataset.group.name + ".geno", "genotype")
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    print(dataset.group)

    # Create a genotype matrix (markers x individuals) for R
    individuals = parser.individuals
    markers = []
    markernames = []
    for marker in parser.markers:
        markernames.append(marker["name"])
        markers.append(marker["genotypes"])

    genotypes = list(itertools.chain(*markers))
    # Sanity check: genotype count per individual should equal marker count
    print(len(genotypes) / len(individuals), "==", len(parser.markers))

    rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True))

    # Create a phenotype matrix; missing samples get the "-999" sentinel
    traits = []
    for trait in self.trait_db_list:
        print("retrieving data for", trait)
        if trait != "":
            ts = trait.split(':')
            gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1])
            gt = TRAIT.retrieve_sample_data(gt, dataset, individuals)
            for ind in individuals:
                if ind in gt.data.keys():
                    traits.append(gt.data[ind].value)
                else:
                    traits.append("-999")

    rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True))
    print(rPheno)

    # Use a data frame to store the objects
    rPheno = r_data_frame(rPheno, check_names = False)
    rGeno = r_data_frame(rGeno, check_names = False)

    # Debug: Print the genotype and phenotype files to disk
    #r_write_table(rGeno, "~/outputGN/geno.csv")
    #r_write_table(rPheno, "~/outputGN/pheno.csv")

    # Perform the CTL scan
    res = self.r_CTLscan(rGeno, rPheno, strategy = strategy, nperm = nperm, parametric = parametric, ncores = 6)

    # Get significant interactions
    significant = self.r_CTLsignificant(res, significance = significance)

    # Create an image for output
    self.results = {}
    self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
    self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

    self.results['ctlresult'] = significant
    self.results['requestform'] = requestform  # Store the user specified parameters for the output page

    # Create the lineplot
    r_png(self.results['imgloc1'], width=1000, height=600, type='cairo-png')
    self.r_lineplot(res, significance = significance)
    r_dev_off()

    n = 2  # We start from 2, since R starts from 1 :)
    for trait in self.trait_db_list:
        # Create the QTL like CTL plots
        self.results['imgurl' + str(n)] = webqtlUtil.genRandStr("CTL_") + ".png"
        self.results['imgloc' + str(n)] = GENERATED_IMAGE_DIR + self.results['imgurl' + str(n)]
        r_png(self.results['imgloc' + str(n)], width=1000, height=600, type='cairo-png')
        self.r_plotCTLobject(res, (n-1), significance = significance, main='Phenotype ' + trait)
        r_dev_off()
        n = n + 1

    # Flush any output from R
    sys.stdout.flush()

    # Create the interactive graph for cytoscape visualization (Nodes and Edges)
    print(type(significant))
    if not type(significant) == ri.RNULLType:
        for x in range(len(significant[0])):
            print(significant[0][x], significant[1][x], significant[2][x])  # Debug to console
            tsS = significant[0][x].split(':')  # Source
            tsT = significant[2][x].split(':')  # Target
            gtS = TRAIT.GeneralTrait(name = tsS[0], dataset_name = tsS[1])  # Retrieve Source info from the DB
            gtT = TRAIT.GeneralTrait(name = tsT[0], dataset_name = tsT[1])  # Retrieve Target info from the DB
            self.addNode(gtS)
            self.addNode(gtT)
            self.addEdge(gtS, gtT, significant, x)
            significant[0][x] = gtS.symbol + " (" + gtS.name + ")"  # Update the trait name for the displayed table
            significant[2][x] = gtT.symbol + " (" + gtT.name + ")"  # Update the trait name for the displayed table

    self.elements = json.dumps(self.nodes_list + self.edges_list)
# Ad-hoc script: open a raw connection to the GeneNetwork MySQL database
# and build the "HC_M2_0606_P" dataset object for querying/debugging.
import sys

# Fix: MySQLdb was used below (MySQLdb.Connect) but never imported.
import MySQLdb

from base import webqtlConfig
from base.data_set import create_dataset
from base.templatePage import templatePage
from dbFunction import webqtlDatabaseFunction
from utility import webqtlUtil

# Credentials come from the central webqtlConfig module.
db_conn = MySQLdb.Connect(db=webqtlConfig.DB_NAME,
                          host=webqtlConfig.MYSQL_SERVER,
                          user=webqtlConfig.DB_USER,
                          passwd=webqtlConfig.DB_PASSWD)
cursor = db_conn.cursor()

dataset_name = "HC_M2_0606_P"
dataset = create_dataset(db_conn, dataset_name)

#cursor.execute("""
#        SELECT ProbeSet.Name as TNAME, 0 as thistable,
#        ProbeSetXRef.Mean as TMEAN, ProbeSetXRef.LRS as TLRS,
#        ProbeSetXRef.PVALUE as TPVALUE, ProbeSet.Chr_num as TCHR_NUM,
#        ProbeSet.Mb as TMB, ProbeSet.Symbol as TSYMBOL,
#        ProbeSet.name_num as TNAME_NUM
#        FROM ProbeSetXRef, ProbeSet, Geno
#        WHERE ProbeSetXRef.LRS > 99.0 and
#        ABS(ProbeSet.Mb-Geno.Mb) < 5 and
#        ProbeSetXRef.Locus = Geno.name and
#        Geno.SpeciesId = 1 and
#        ProbeSet.Chr = Geno.Chr and
#        ProbeSet.Id = ProbeSetXRef.ProbeSetId and
#        ProbeSetXRef.ProbeSetFreezeId = 112""")
def __init__(self, params):
    """Build scatter-plot data comparing two traits.

    Required params: 'dataset_1'/'dataset_2' (dataset names) and
    'trait_1'/'trait_2' (trait names).  Optional display params (with
    defaults): width=800, height=600, circle_color='steelblue',
    circle_radius=5, line_color='red', line_width=1.

    Populates self.js_data with the paired sample values and the
    Pearson linear-regression statistics for the client-side plot.
    """
    def _get(key, default, cast=None):
        # One helper replaces six copy-pasted bare `except:` blocks.
        # Missing or un-castable params fall back to their default,
        # exactly as the original per-param try/excepts did, but without
        # swallowing unrelated exceptions (KeyboardInterrupt etc.).
        try:
            value = params[key]
            return cast(value) if cast else value
        except (KeyError, ValueError, TypeError):
            return default

    self.data_set_1 = data_set.create_dataset(params['dataset_1'])
    self.data_set_2 = data_set.create_dataset(params['dataset_2'])
    self.trait_1 = GeneralTrait(name=params['trait_1'], dataset=self.data_set_1)
    self.trait_2 = GeneralTrait(name=params['trait_2'], dataset=self.data_set_2)

    # Optional display settings
    width = _get('width', 800, int)
    self.width = width
    height = _get('height', 600, int)
    self.height = height
    circle_color = _get('circle_color', 'steelblue')
    self.circle_color = circle_color
    circle_radius = _get('circle_radius', 5, int)
    self.circle_radius = circle_radius
    line_color = _get('line_color', 'red')
    self.line_color = line_color
    line_width = _get('line_width', 1, int)
    self.line_width = line_width

    # Keep only the samples the two traits have in common
    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data)

    # Paired value lists in sample-key iteration order
    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]

    # Pearson regression on the raw values
    x = np.array(vals_1)
    y = np.array(vals_2)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    # Everything the template serializes for the client-side plot
    self.js_data = dict(
        trait_1 = self.trait_1.dataset.name + ": " + self.trait_1.name,
        trait_2 = self.trait_2.dataset.name + ": " + self.trait_2.name,
        samples_1 = samples_1,
        samples_2 = samples_2,
        num_overlap = num_overlap,
        vals_1 = vals_1,
        vals_2 = vals_2,
        slope = slope,
        intercept = intercept,
        r_value = r_value,
        p_value = p_value,
        width = width,
        height = height,
        circle_color = circle_color,
        circle_radius = circle_radius,
        line_color = line_color,
        line_width = line_width
    )
def run_analysis(self, requestform):
    """Run a CTL (correlated trait locus) scan via R on the traits listed
    in the request form and render line/CTL plots to PNG files.

    requestform keys used: trait_list (comma-separated "trait:dataset"
    pairs), strategy, nperm, parametric, significance.
    """
    print("Starting CTL analysis on dataset")
    # Parse the trait list and drop empty entries
    self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')]
    self.trait_db_list = [x for x in self.trait_db_list if x]

    print("strategy:", requestform.get("strategy"))
    strategy = requestform.get("strategy")
    print("nperm:", requestform.get("nperm"))
    nperm = int(requestform.get("nperm"))
    print("parametric:", requestform.get("parametric"))
    parametric = bool(requestform.get("parametric"))
    print("significance:", requestform.get("significance"))
    significance = float(requestform.get("significance"))

    # Get the name of the .geno file belonging to the first phenotype
    datasetname = self.trait_db_list[0].split(":")[1]
    dataset = data_set.create_dataset(datasetname)

    genofilelocation = locate(dataset.group.name + ".geno", "genotype")
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()

    # Create a genotype matrix (markers x individuals) for R
    individuals = parser.individuals
    markers = []
    markernames = []
    for marker in parser.markers:
        markernames.append(marker["name"])
        markers.append(marker["genotypes"])

    genotypes = list(itertools.chain(*markers))
    # Sanity check: genotype count per individual should equal marker count
    print(len(genotypes) / len(individuals), "==", len(parser.markers))

    rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True))

    # Create a phenotype matrix; missing samples get the "-999" sentinel
    traits = []
    for trait in self.trait_db_list:
        print("retrieving data for", trait)
        if trait != "":
            ts = trait.split(':')
            gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1])
            gt.retrieve_sample_data(individuals)
            for ind in individuals:
                if ind in gt.data.keys():
                    traits.append(gt.data[ind].value)
                else:
                    traits.append("-999")

    rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True))

    # Use a data frame to store the objects
    rPheno = r_data_frame(rPheno)
    rGeno = r_data_frame(rGeno)

    # Debug: Print the genotype and phenotype files to disk
    #r_write_table(rGeno, "~/outputGN/geno.csv")
    #r_write_table(rPheno, "~/outputGN/pheno.csv")

    # Perform the CTL scan
    res = self.r_CTLscan(rGeno, rPheno, strategy = strategy, nperm = nperm, parametric = parametric, ncores = 6)

    # Get significant interactions
    significant = self.r_CTLsignificant(res, significance = significance)

    # Create an image for output
    self.results = {}
    self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
    self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

    self.results['ctlresult'] = significant
    self.results['requestform'] = requestform  # Store the user specified parameters for the output page

    # Create the lineplot
    r_png(self.results['imgloc1'], width=1000, height=600)
    self.r_lineplot(res, significance = significance)
    r_dev_off()

    n = 2  # Image numbering starts at 2 (1 is the lineplot above)
    for trait in self.trait_db_list:
        # Create the QTL like CTL plots
        self.results['imgurl' + str(n)] = webqtlUtil.genRandStr("CTL_") + ".png"
        self.results['imgloc' + str(n)] = GENERATED_IMAGE_DIR + self.results['imgurl' + str(n)]
        r_png(self.results['imgloc' + str(n)], width=1000, height=600)
        self.r_plotCTLobject(res, (n-1), significance = significance, main='Phenotype ' + trait)
        r_dev_off()
        n = n + 1

    # Flush any output from R
    sys.stdout.flush()