Example #1
def do_correlation(start_vars):
    """Run a correlation scan for one trait against a target dataset and return the top results."""
    assert ('db' in start_vars)
    assert ('target_db' in start_vars)
    assert ('trait_id' in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = GeneralTrait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)

    corr_params = init_corr_params(start_vars)

    corr_results = calculate_results(this_trait, this_dataset, target_dataset,
                                     corr_params)
    #corr_results = collections.OrderedDict(sorted(corr_results.items(), key=lambda t: -abs(t[1][0])))

    final_results = []
    for _trait_counter, trait in enumerate(
            list(corr_results.keys())[:corr_params['return_count']]):
        if corr_params['type'] == "tissue":
            [sample_r, num_overlap, sample_p, symbol] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] == "literature" or corr_params['type'] == "lit":
            [gene_id, sample_r] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            [sample_r, sample_p, num_overlap] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }

        final_results.append(result_dict)

    # json_corr_results = generate_corr_json(final_corr_results, this_trait, this_dataset, target_dataset, for_api = True)

    return final_results
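
A minimal call sketch for do_correlation, assuming nothing beyond the keys asserted above; every identifier below is a placeholder rather than a real dataset or trait name, and the extra keys are only a guess at what init_corr_params reads:

# Hypothetical usage sketch. The first three keys mirror the asserts in
# do_correlation; "type" and "return_count" are assumed to be read by
# init_corr_params, since corr_params exposes them later on.
start_vars = {
    "db": "SomeDataset",            # placeholder primary dataset name
    "target_db": "SomeTargetDB",    # placeholder target dataset name
    "trait_id": "some_trait_id",    # placeholder trait identifier
    "type": "sample",               # assumed correlation-type key
    "return_count": 500,            # assumed result-count key
}
results = do_correlation(start_vars)  # list of dicts, one per correlated trait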
Example #2
def fetch_sample_data(start_vars, this_trait, this_dataset, target_dataset):
    """Collect the submitted sample values for this_trait and the matching trait data for target_dataset."""

    sample_data = process_samples(start_vars,
                                  this_dataset.group.all_samples_ordered())

    target_dataset.get_trait_data(list(sample_data.keys()))
    this_trait = retrieve_sample_data(this_trait, this_dataset)
    this_trait_data = {
        "trait_sample_data": sample_data,
        "trait_id": start_vars["trait_id"]
    }
    results = map_shared_keys_to_values(target_dataset.samplelist,
                                        target_dataset.trait_data)

    return (this_trait_data, results)
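
A hedged wiring sketch for fetch_sample_data, reusing only the create_dataset, create_trait and fetch_sample_data calls shown on this page; the identifiers are placeholders, and start_vars is assumed to also carry the submitted per-sample values that process_samples reads:

# Hypothetical wiring; all names are placeholders, not real GeneNetwork IDs.
start_vars = {"db": "SomeDataset", "target_db": "SomeTargetDB",
              "trait_id": "some_trait_id"}  # plus the sample values expected by process_samples
this_dataset = data_set.create_dataset(dataset_name=start_vars["db"])
target_dataset = data_set.create_dataset(dataset_name=start_vars["target_db"])
this_trait = create_trait(dataset=this_dataset, name=start_vars["trait_id"])
this_trait_data, target_results = fetch_sample_data(
    start_vars, this_trait, this_dataset, target_dataset)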
Example #3
def do_correlation(start_vars):
    assert('db' in start_vars)
    assert('target_db' in start_vars)
    assert('trait_id' in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)

    corr_params = init_corr_params(start_vars)

    corr_results = calculate_results(
        this_trait, this_dataset, target_dataset, corr_params)

    final_results = []
    for _trait_counter, trait in enumerate(list(corr_results.keys())[:corr_params['return_count']]):
        if corr_params['type'] == "tissue":
            [sample_r, num_overlap, sample_p, symbol] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] == "literature" or corr_params['type'] == "lit":
            [gene_id, sample_r] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            [sample_r, sample_p, num_overlap] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(result_dict)
    return final_results
Example #4
    def run_analysis(self, requestform):
        print("Starting CTL analysis on dataset")
        self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')]
        self.trait_db_list = [x for x in self.trait_db_list if x]

        print("strategy:", requestform.get("strategy"))
        strategy = requestform.get("strategy")

        print("nperm:", requestform.get("nperm"))
        nperm = int(requestform.get("nperm"))

        print("parametric:", requestform.get("parametric"))
        parametric = bool(requestform.get("parametric"))

        print("significance:", requestform.get("significance"))
        significance = float(requestform.get("significance"))

        # Get the name of the .geno file belonging to the first phenotype
        datasetname = self.trait_db_list[0].split(":")[1]
        dataset = data_set.create_dataset(datasetname)

        genofilelocation = locate(dataset.group.name + ".geno", "genotype")
        parser = genofile_parser.ConvertGenoFile(genofilelocation)
        parser.process_csv()
        print(dataset.group)
        # Create a genotype matrix
        individuals = parser.individuals
        markers = []
        markernames = []
        for marker in parser.markers:
          markernames.append(marker["name"])
          markers.append(marker["genotypes"])

        genotypes = list(itertools.chain(*markers))
        print(len(genotypes) / len(individuals), "==", len(parser.markers))

        rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True))

        # Create a phenotype matrix
        traits = []
        for trait in self.trait_db_list:
          print("retrieving data for", trait)
          if trait != "":
            ts = trait.split(':')
            gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1])
            gt = TRAIT.retrieve_sample_data(gt, dataset, individuals)
            for ind in individuals:
              if ind in gt.data.keys():
                traits.append(gt.data[ind].value)
              else:
                traits.append("-999")

        rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True))

        print(rPheno)

        # Use a data frame to store the objects
        rPheno = r_data_frame(rPheno, check_names = False)
        rGeno = r_data_frame(rGeno, check_names = False)

        # Debug: Print the genotype and phenotype files to disk
        #r_write_table(rGeno, "~/outputGN/geno.csv")
        #r_write_table(rPheno, "~/outputGN/pheno.csv")

        # Perform the CTL scan
        res = self.r_CTLscan(rGeno, rPheno, strategy = strategy, nperm = nperm, parametric = parametric, ncores = 6)

        # Get significant interactions
        significant = self.r_CTLsignificant(res, significance = significance)

        # Create an image for output
        self.results = {}
        self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
        self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

        self.results['ctlresult'] = significant
        self.results['requestform'] = requestform             # Store the user specified parameters for the output page

        # Create the lineplot
        r_png(self.results['imgloc1'], width=1000, height=600, type='cairo-png')
        self.r_lineplot(res, significance = significance)
        r_dev_off()

        n = 2                                                 # We start from 2, since R starts from 1 :)
        for trait in self.trait_db_list:
          # Create the QTL like CTL plots
          self.results['imgurl' + str(n)] = webqtlUtil.genRandStr("CTL_") + ".png"
          self.results['imgloc' + str(n)] = GENERATED_IMAGE_DIR + self.results['imgurl' + str(n)]
          r_png(self.results['imgloc' + str(n)], width=1000, height=600, type='cairo-png')
          self.r_plotCTLobject(res, (n-1), significance = significance, main='Phenotype ' + trait)
          r_dev_off()
          n = n + 1

        # Flush any output from R
        sys.stdout.flush()

        # Create the interactive graph for cytoscape visualization (Nodes and Edges)
        print(type(significant))
        if not type(significant) == ri.RNULLType:
          for x in range(len(significant[0])):
            print(significant[0][x], significant[1][x], significant[2][x])            # Debug to console
            tsS = significant[0][x].split(':')                                        # Source
            tsT = significant[2][x].split(':')                                        # Target
            gtS = TRAIT.GeneralTrait(name = tsS[0], dataset_name = tsS[1])            # Retrieve Source info from the DB
            gtT = TRAIT.GeneralTrait(name = tsT[0], dataset_name = tsT[1])            # Retrieve Target info from the DB
            self.addNode(gtS)
            self.addNode(gtT)
            self.addEdge(gtS, gtT, significant, x)

            significant[0][x] = gtS.symbol + " (" + gtS.name + ")"                    # Update the trait name for the displayed table
            significant[2][x] = gtT.symbol + " (" + gtT.name + ")"                    # Update the trait name for the displayed table

        self.elements = json.dumps(self.nodes_list + self.edges_list)
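
For orientation, a hypothetical requestform for run_analysis, listing only the keys the method reads above; the values are guesses, and calling it requires an instance of the enclosing CTL analysis class, which is not shown in this excerpt:

# Hypothetical form contents; trait_list entries follow the
# "trait_name:dataset_name" convention that run_analysis splits on ':'.
requestform = {
    "trait_list": "trait_a:SomeDataset,trait_b:SomeDataset",  # placeholders
    "strategy": "Exact",      # assumed value, passed straight to r_CTLscan
    "nperm": "100",           # converted with int()
    "parametric": "",         # bool("") is False; any non-empty string means True
    "significance": "0.05",   # converted with float()
}
# ctl.run_analysis(requestform)   # 'ctl' would be an instance of the enclosing class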
Example #5
def do_mapping_for_api(start_vars):
    """Run a GEMMA or R/qtl mapping for one trait and return the result markers plus output format."""
    assert ('db' in start_vars)
    assert ('trait_id' in start_vars)

    dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    dataset.group.get_markers()
    this_trait = GeneralTrait(dataset=dataset, name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, dataset)

    samples = []
    vals = []

    for sample in dataset.group.samplelist:
        in_trait_data = False
        for item in this_trait.data:
            if this_trait.data[item].name == sample:
                value = str(this_trait.data[item].value)
                samples.append(item)
                vals.append(value)
                in_trait_data = True
                break
        if not in_trait_data:
            vals.append("x")

    mapping_params = initialize_parameters(start_vars, dataset, this_trait)

    covariates = ""  #ZS: It seems to take an empty string as default. This should probably be changed.

    if mapping_params['mapping_method'] == "gemma":
        header_row = ["name", "chr", "Mb", "lod_score", "p_value"]
        # ZS: gemma_mapping returns both results and the filename for LOCO,
        # so need to only grab the former for the API
        if mapping_params['use_loco'] == "True":
            result_markers = gemma_mapping.run_gemma(
                this_trait, dataset, samples, vals, covariates,
                mapping_params['use_loco'], mapping_params['maf'])[0]
        else:
            result_markers = gemma_mapping.run_gemma(
                this_trait, dataset, samples, vals, covariates,
                mapping_params['use_loco'], mapping_params['maf'])
    elif mapping_params['mapping_method'] == "rqtl":
        header_row = ["name", "chr", "cM", "lod_score"]
        if mapping_params['num_perm'] > 0:
            _sperm_output, _suggestive, _significant, result_markers = rqtl_mapping.run_rqtl_geno(
                vals, dataset, mapping_params['rqtl_method'],
                mapping_params['rqtl_model'], mapping_params['perm_check'],
                mapping_params['num_perm'], mapping_params['do_control'],
                mapping_params['control_marker'],
                mapping_params['manhattan_plot'], mapping_params['pair_scan'])
        else:
            result_markers = rqtl_mapping.run_rqtl_geno(
                vals, dataset, mapping_params['rqtl_method'],
                mapping_params['rqtl_model'], mapping_params['perm_check'],
                mapping_params['num_perm'], mapping_params['do_control'],
                mapping_params['control_marker'],
                mapping_params['manhattan_plot'], mapping_params['pair_scan'])

    if mapping_params['limit_to']:
        result_markers = result_markers[:mapping_params['limit_to']]

    if mapping_params['format'] == "csv":
        output_rows = []
        output_rows.append(header_row)
        for marker in result_markers:
            this_row = [marker[header] for header in header_row]
            output_rows.append(this_row)

        return output_rows, mapping_params['format']
    elif mapping_params['format'] == "json":
        return result_markers, mapping_params['format']
    else:
        return result_markers, None
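
A short, hypothetical call sketch for do_mapping_for_api; the keys mirror the asserts at the top of the function, the identifiers are placeholders, and initialize_parameters is assumed to pick up the remaining mapping options from start_vars:

# Hypothetical usage sketch; identifiers are placeholders.
start_vars = {
    "db": "SomeDataset",
    "trait_id": "some_trait_id",
    # further keys (mapping method, output format, limit_to, ...) are
    # assumed to be read by initialize_parameters
}
result_markers, out_format = do_mapping_for_api(start_vars)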
Example #6
    def __init__(self, params):
        if "Temp" in params['dataset_1']:
            self.dataset_1 = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = params['dataset_1'].split("_")[1])
        else:
            self.dataset_1 = data_set.create_dataset(params['dataset_1'])
        if "Temp" in params['dataset_2']:
            self.dataset_2 = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = params['dataset_2'].split("_")[1])
        else:
            self.dataset_2 = data_set.create_dataset(params['dataset_2'])

        #self.dataset_3 = data_set.create_dataset(params['dataset_3'])
        self.trait_1 = create_trait(name=params['trait_1'], dataset=self.dataset_1)
        self.trait_2 = create_trait(name=params['trait_2'], dataset=self.dataset_2)
        #self.trait_3 = create_trait(name=params['trait_3'], dataset=self.dataset_3)

        self.method = params['method']

        primary_samples = self.dataset_1.group.samplelist
        if self.dataset_1.group.parlist is not None:
            primary_samples += self.dataset_1.group.parlist
        if self.dataset_1.group.f1list is not None:
            primary_samples += self.dataset_1.group.f1list

        self.trait_1 = retrieve_sample_data(self.trait_1, self.dataset_1, primary_samples)
        self.trait_2 = retrieve_sample_data(self.trait_2, self.dataset_2, primary_samples)

        samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data)

        self.data = []
        self.indIDs = list(samples_1.keys())
        vals_1 = []
        for sample in list(samples_1.keys()):
            vals_1.append(samples_1[sample].value)
        self.data.append(vals_1)
        vals_2 = []
        for sample in list(samples_2.keys()):
            vals_2.append(samples_2[sample].value)
        self.data.append(vals_2)

        slope, intercept, r_value, p_value, std_err = stats.linregress(vals_1, vals_2)

        if slope < 0.001:
            slope_string = '%.3E' % slope
        else:
            slope_string = '%.3f' % slope
        
        x_buffer = (max(vals_1) - min(vals_1))*0.1
        y_buffer = (max(vals_2) - min(vals_2))*0.1

        x_range = [min(vals_1) - x_buffer, max(vals_1) + x_buffer]
        y_range = [min(vals_2) - y_buffer, max(vals_2) + y_buffer]

        intercept_coords = get_intercept_coords(slope, intercept, x_range, y_range)

        rx = stats.rankdata(vals_1)
        ry = stats.rankdata(vals_2)
        self.rdata = []
        self.rdata.append(rx.tolist())
        self.rdata.append(ry.tolist())        
        srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(rx, ry)

        if srslope < 0.001:
            srslope_string = '%.3E' % srslope
        else:
            srslope_string = '%.3f' % srslope

        x_buffer = (max(rx) - min(rx))*0.1
        y_buffer = (max(ry) - min(ry))*0.1

        sr_range = [min(rx) - x_buffer, max(rx) + x_buffer]

        sr_intercept_coords = get_intercept_coords(srslope, srintercept, sr_range, sr_range)

        self.collections_exist = "False"
        if g.user_session.num_collections > 0:
            self.collections_exist = "True"

        self.js_data = dict(
            data = self.data,
            rdata = self.rdata,
            indIDs = self.indIDs,
            trait_1 = self.trait_1.dataset.name + ": " + str(self.trait_1.name),
            trait_2 = self.trait_2.dataset.name + ": " + str(self.trait_2.name),
            samples_1 = samples_1,
            samples_2 = samples_2,
            num_overlap = num_overlap,
            vals_1 = vals_1,
            vals_2 = vals_2,
            x_range = x_range,
            y_range = y_range,
            sr_range = sr_range,
            intercept_coords = intercept_coords,
            sr_intercept_coords = sr_intercept_coords,

            slope = slope,
            slope_string = slope_string,
            intercept = intercept,
            r_value = r_value,
            p_value = p_value,

            srslope = srslope,
            srslope_string = srslope_string,
            srintercept = srintercept,
            srr_value = srr_value,
            srp_value = srp_value

            #trait3 = self.trait_3.data,
            #vals_3 = vals_3
        )
        self.jsdata = self.js_data
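
Finally, a hypothetical params dict for this constructor, restricted to the keys read above; the enclosing class name does not appear in this excerpt, and the code must run inside a Flask request context because it touches g.user_session:

# Hypothetical parameters; "SomeScatterPlotClass" stands in for the
# enclosing class, whose real name is not shown in this excerpt.
params = {
    "dataset_1": "SomeDataset",        # placeholder dataset name
    "dataset_2": "SomeOtherDataset",   # placeholder dataset name
    "trait_1": "some_trait_id",
    "trait_2": "another_trait_id",
    "method": "pearson",               # assumed value; stored unchanged as self.method
}
# plot = SomeScatterPlotClass(params)  # requires an active Flask request (g.user_session)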