Example #1
0
def send_positions_to_server(positions, chrom, client_config, env):
    client_name = client_config['name']

    data = pickle.dumps({'CHROM': chrom, 'POS': positions})

    networking.respond_to_server('api/tasks/INIT/POS', 'POST', data,
                                 client_name, env)
Example #2
0
 def run_covar_regression(self, warm_start=None, rho=250.0, alpha=1.0):
     # instead of making many function calls, I'll separate the covariate and chromosome regressions
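     # Covariate-only ("Small") model update: solve the penalized problem with
     # other_newton, form the relaxed estimate z_hat, accumulate the dual term
     # all_Us, and post z_hat + all_Us to the server. The z/U bookkeeping looks
     # like an ADMM-style consensus update with penalty rho and relaxation alpha.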
     model = "Small"
     ncov = self.covariates.shape[1]
     estimates = np.zeros((ncov - 1, 1))
     idx = [i for i in range(ncov) if i != 1]
     covariates = self.covariates[:, idx]
     if self.prev_cov_estimate is not None:
         z_hat = self.prev_cov_estimate
         all_Us = self.previous_Us[model] + z_hat - warm_start
     else:
         all_Us = 0
     if warm_start is None:
         estimates[:, 0] = other_newton(covariates, np.zeros((ncov - 1)),
                                        np.zeros((ncov - 1, )), rho,
                                        estimates[:, 0], ncov - 1)
         z_hat = estimates
     else:
         estimates[:, 0] = other_newton(covariates, all_Us[:, 0],
                                        warm_start[:, 0], rho, z_hat[:, 0],
                                        ncov - 1)
         z_hat = alpha * estimates + (1 - alpha) * warm_start
     self.prev_cov_estimate = estimates
     self.previous_Us[model] = all_Us
     est = z_hat + all_Us
     msg = pickle.dumps({"VALS": est, "Estimated": "Small"})
     networking.respond_to_server('api/tasks/ASSO/estimate', 'POST', msg,
                                  self.client_config['name'], self.env)
Example #3
0
 def cost(self, data):
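     # Evaluate the local likelihood at the server-supplied point x0 for the SNPs
     # selected by the convergence mask, and report it relative to the stored
     # baseline likelihood for this chromosome.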
     msg = pickle.loads(data)
     chrom = msg["Estimated"]
     mask = msg["conv"]
     x0 = msg["x0"]
     estimates = self.evaluate_estimate(chrom, mask, x0)
     estimates -= self.baseline_likelihood[chrom][mask[:, 0]]
     msg = pickle.dumps({'estimated': chrom, 'v': estimates})
     networking.respond_to_server('api/tasks/ASSO/valback', 'POST', msg,
                                  self.client_config['name'], self.env)
Example #4
0
    def send_likelihood(self, message):
        # TODO Important, if we are excluding missing values, we should recompute baseline every time
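        # For the covariates-only "Small" model, cache the fitted probabilities as the
        # baseline; for a chromosome, compute per SNP the log-loss of the SNP model
        # minus the log-loss of that baseline and post the differences to the server.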
        message = pickle.loads(message)
        include_mask = self.include_mask
        model = message["Estimated"]
        coef = message["Coef"]
        coef = coef.T
        covariates = self.covariates
        # n = int(np.sum(self.include_mask))
        y = self.Ys.copy()
        ell = None
        if self.flipped_covar:
            self.covariates *= -y
            self.flipped_covar = False
        y += 1
        y /= 2
        if model == "Small":
            indx = [i for i in range(covariates.shape[1]) if i != 1]
            y_model = 1.0 / (1 + np.exp(-covariates[:, indx].dot(coef.T)))
            self.base_y_pred = y_model
            # ell = log_loss((y+1)/2, y_model, normalize=False, labels=[0,1])
        else:
            group = self.store[model]
            af = group["MAF"].value
            tokeep = np.logical_and(af > self.threshold,
                                    1 - af > self.threshold)
            positions = group["QC_positions"].value
            ell = np.zeros((1, positions.shape[0]))
            for i, position in enumerate(positions):
                if not tokeep[i]:
                    ell[0, i] = np.nan
                else:
                    val = group[str(position)].value[include_mask]
                    ind = ~np.isnan(val)  # TODO impute or something?
                    covariates[:, 1] = val
                    y_model = 1.0 / (
                        1 + np.exp(-covariates[ind, :].dot(coef[:, i].T)))
                    ell[0, i] = log_loss(y[ind],
                                         y_model,
                                         normalize=False,
                                         labels=[0, 1])
                    ell[0, i] -= log_loss(y[ind],
                                          self.base_y_pred[ind],
                                          normalize=False,
                                          labels=[0, 1])

        msg = pickle.dumps({"Estimated": model, "estimate": ell})
        networking.respond_to_server('api/tasks/ASSO/pval', 'POST', msg,
                                     self.client_config['name'], self.env)
Example #5
0
def report_cov(client_config, env):
    def standardize_mat(mat, af, sd):
        af = 2 * af.reshape(af.shape[0], 1)
        mat -= af
        ind = sd > 0
        mat[ind, :] /= sd[ind].reshape(np.sum(ind), 1)
        mat[np.isnan(mat)] = 0
        return mat
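    # Stream the lower-triangular blocks of the SNP covariance matrix to the server:
    # for each chromosome pair (ch1, ch2) with ch2 <= ch1, standardize the PCA-masked
    # genotypes and send their matrix product; the final block is flagged with "E".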
    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'r') as store:
        n = store.attrs["n"]
        chroms = sorted([ch for ch in store if ch != "meta"], key=int)
        size = 0
        for chi, ch1 in enumerate(chroms):
            group = store[ch1]
            tokeep = group['PCA_mask'].value
            pos = group["positions"].value[tokeep]
            af1 = group["MAF"].value[tokeep]
            sd1 = np.sqrt(group["VAR"].value[tokeep])
            g1 = np.empty((len(pos), n))
            for i, snp1 in enumerate(pos):
                g1[i, :] = group[str(snp1)].value
            g1 = standardize_mat(g1, af1, sd1)
            size += len(pos)
            for j, ch2 in enumerate(chroms):
                if j > chi:
                    continue
                msg = {}
                group = store[ch2]
                tokeep = group['PCA_mask'].value
                af2 = group["MAF"].value[tokeep]
                sd2 = np.sqrt(group["VAR"].value[tokeep])
                pos = group["positions"].value[tokeep]
                g2 = np.empty((n, len(pos)))
                for i, snp2 in enumerate(pos):
                    g2[:, i] = group[str(snp2)].value
                g2 = standardize_mat(g2.transpose(), af2, sd2).transpose()
                msg["CH1"] = ch1
                msg["CH2"] = ch2
                logger.info(f"Reporting cov: {ch1}_{ch2}: {g1.shape} x {g2.shape}")
                msg["MAT"] = g1.dot(g2).astype(np.float32)
                if ch1 == chroms[-1] and ch2 == chroms[-1]:
                    msg["E"] = True
                msg = pickle.dumps(msg)
                networking.respond_to_server('api/tasks/PCA/COV', 'POST', msg, client_config['name'], env)
        logger.info(f"Final size will be {size}")
Example #6
0
def init_stats(msg_dict, client_config, env):
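    # Wait for the local tasks.init_store job to finish, then write the per-chromosome
    # statistics in msg_dict (missingness, allele frequency, HWE, variance) into the
    # plink HDF5 store and report completion to the server.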
    print(msg_dict.keys())
    # Wait on previous tasks to finish
    i = current_app.control.inspect()
    client_name = client_config['name']
    while i.active() is not None:
        active_tasks = i.active()[f'celery@{client_name}']
        dependent_tasks = list(
            filter(lambda x: x['name'] == 'tasks.init_store', active_tasks))
        if len(dependent_tasks) > 0:
            logger.info('Waiting on tasks.init_store to finish.')
            time.sleep(.1)
        else:
            break
    #message = pickle.loads(message)
    pfile = client_config['plinkfile']
    #chrom = message["CHROM"]
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, message in msg_dict.items():
            logger.info(f'Computing statistics for Chrom: {chrom}.')
            chrom_group = store[chrom]
            if "MISS" in message:
                vals = np.array(message["MISS"])
                task = "not_missing_per_snp"
                write_or_replace(chrom_group, task, val=1 - vals)
            if "AF" in message:
                vals = np.array(message["AF"])
                task = 'MAF'
                write_or_replace(chrom_group, task, val=vals)
            if "HWE" in message:
                vals = np.array(message["HWE"])
                task = "hwe"
                write_or_replace(chrom_group, task, val=vals)
            if "VAR" in message:
                vals = np.array(message["VAR"])
                task = "VAR"
                write_or_replace(chrom_group, task, val=vals)
        logger.info('Finished initializing QC statistics for all chromosomes.')

    client_name = client_config['name']
    status = 'Finished with init stats.'
    networking.respond_to_server(
        f'api/clients/{client_name}/report?status={status}', 'POST', env=env)
Example #7
0
 def send_summary_to_standardize(self):
     # This is a rough sketch. The assumption is that a quantitative covariate takes
     # more than 2 distinct values within each silo; the current version never
     # actually verifies this, so that is a good TODO for future implementations.
     quant_covars = [
         i for i in range(2, self.covariates.shape[1])
         if len(np.unique(self.covariates[:, i])) > 2
     ]
     sums = np.sum(self.covariates[:, quant_covars], axis=0)
     sumsq = np.sum(self.covariates[:, quant_covars]**2, axis=0)
     msg = {
         "Indx": quant_covars,
         "Sums": sums,
         "SS": sumsq,
         "N": self.covariates.shape[0]
     }
     msg = pickle.dumps(msg)
     networking.respond_to_server('api/tasks/ASSO/adjust', 'POST', msg,
                                  self.client_config['name'], self.env)
Example #8
0
 def update(self, data, client_config, env):
     data = pickle.loads(data)
     n = self.store.attrs['n']
     msg = {}
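     # LD-pruning round: r1 is the initial window of SNPs per chromosome, r2 the
     # stride, and r3 the current offset into PCA_positions. For the SNPs the server
     # still keeps (state), compute the local nancorr correlation block and send it
     # back; a state of "E" marks a finished chromosome.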
     if self.r3 == 0:  # fake start the first round
         for chrom in self.chroms:
             # if the length is less than r1, you deserve an error.
             # No apologies
             tags = self.store["{}/PCA_passed".format(chrom)]
             data[chrom] = tags[0:self.r1]
     for key, state in data.items():
         if key == "TASK" or key == "SUBTASK":
             continue
         chrom = key
         tags = self.store["{}/PCA_passed".format(chrom)]
         if state[0] == "E":  # Finished with this chrom
             if len(data) == 1:  # Done with everything
                 msg = pickle.dumps({})
                 networking.respond_to_server('api/tasks/PCA/PCAPOS', 'POST', msg,
                                              client_config['name'], env)
                 self.store.close()
                 logger.info("Done with LD pruning.")
                 return
             continue
         else:
             tokeep = state
             end = self.r3 + len(tokeep)
         pos = self.store["{}/PCA_positions".format(chrom)]
         positions = pos[self.r3: end]
         positions = positions[tokeep]
         genotypes = np.empty((n, len(positions)), dtype=np.float32)
         for i, snp in enumerate(positions):
             genotypes[:, i] = self.store["{}/{}".format(chrom, snp)].value
         corr = nancorr(genotypes)
         msg[chrom] = corr
     msg = pickle.dumps(msg)
     networking.respond_to_server('api/tasks/PCA/LD', 'POST', msg, client_config['name'], env)
     self.r3 += self.r2
     if self.r3 > self.print_int:
         logger.info(f"pruning at {self.r3}")
         self.print_int += 1000
Example #9
0
def send_counts_to_server(data, client_config, env):
    client_name = client_config['name']
    data = pickle.dumps(data)
    networking.respond_to_server('api/tasks/INIT/COUNT', 'POST', data,
                                 client_name, env)
Example #10
0
def echo(client_config, env):
    networking.respond_to_server('api/tasks/Echo/itr', 'POST', env=env)
Example #11
0
    def run_logistic_regression(self,
                                y,
                                chrom=None,
                                warm_start=None,
                                rho=250.0,
                                alpha=1.00):
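        # Per-SNP penalized logistic regression for one chromosome: for each SNP
        # passing the MAF threshold, run other_newton on the covariates plus the SNP
        # genotype, form the relaxed estimate z_hat, and send z_hat + all_Us (the
        # accumulated dual term) to the server; the z/U bookkeeping mirrors
        # run_covar_regression.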
        store = self.store
        include_mask = self.include_mask
        n = int(np.sum(self.include_mask))
        y = y.reshape(n)
        covariates = self.covariates.copy()
        group = store[chrom]
        positions = group["QC_positions"]
        ncov = self.covariates.shape[1]
        estimates = np.zeros((len(positions), ncov))
        if warm_start is None:
            est = np.zeros(ncov)
            covar_estimates = self.prev_cov_estimate
            # boundary condition for the loop
            est[1] = 0
            est[0] = covar_estimates[0]
            est[2:] = covar_estimates[1:].ravel()
        af = group["MAF"]
        if chrom in self.previous_estimates:
            z_hat = self.previous_estimates[chrom]
            all_Us = self.previous_Us[chrom] + z_hat - warm_start
        else:
            all_Us = np.zeros((ncov))
            all_Us[0] = self.previous_Us["Small"][0]
            all_Us[2:] = self.previous_Us["Small"][1:].ravel()
        count = 0
        t = time.time()
        for i, position in enumerate(positions):
            #  if i == 10: #TODO
            #      totalT = (time.time()-t)/500
            #      print(f"Average time is {totalT} for n={n}, {t2/500}")
            #      print(f"Count is {count}")
            #      break
            if not i % 100:
                logger.info(f"{time.time()-t}")
                t = time.time()
                # t2 = 0
                logger.info(f"{float(i/len(positions))}")
            if af[i] < self.threshold or (1 - af[i]) < self.threshold:
                estimates[i, :] = np.nan
                continue
            else:
                val = group[str(position)].value[include_mask]
                ind = ~np.isnan(val)
                covariates[ind, 1] = val[ind] * -y[ind]
                count += 1
                # t3 = time.time()
                if warm_start is None:
                    # est = np.ascontiguousarray(estimates[:,i])
                    # estimates[:,i] = minimize_lbfgs(covariates[ind, :],
                    #                                 np.zeros((ncov)), np.zeros((ncov,)),rho, est, ncov)
                    # estimates[i, :] = minimize_lbfgs(covariates[ind, :], np.zeros((ncov)), est, rho, est, ncov)
                    estimates[i, :] = other_newton(covariates[ind, :], all_Us,
                                                   est, rho, est, ncov)
                    # est *= .2
                    # est += .8*estimates[i, :]
                    # est[1] = 0
                    #                    print(f"{estimates[i, :]} from myc")
                    #                    estimates[i, :] = bfgs_more_gutted(covariates[ind, :], np.zeros((ncov)),
                    #                                  np.zeros((ncov,)), rho, est, ncov)
                    #                    print(f"{estimates[i, :]} from sci")
                    z_hat = alpha * estimates + (1 - alpha) * est
                else:
                    # estimates[i, :] = bfgs_more_gutted(covariates[ind, :], all_Us[i, :],
                    #    warm_start[i, :], rho, z_hat[i, :], ncov)
                    # estimates[i, :] = minimize_lbfgs(covariates[ind, :], all_Us[i, :],
                    #    warm_start[i, :], rho, z_hat[i, :], ncov)
                    estimates[i, :] = other_newton(covariates[ind, :],
                                                   all_Us[i, :],
                                                   warm_start[i, :], rho,
                                                   z_hat[i, :], ncov)

                    z_hat = alpha * estimates + (1 - alpha) * warm_start
            #  t2 += time.time()-t3
        self.previous_estimates[chrom] = estimates
        self.previous_Us[chrom] = all_Us
        msg = pickle.dumps({
            "Estimated": chrom,
            "VALS": z_hat + all_Us
        })  # , 'cov': covariates})
        networking.respond_to_server('api/tasks/ASSO/estimate', 'POST', msg,
                                     self.client_config['name'], self.env)
Example #12
0
    def run_newton_lr(self, y, chrom=None, warm_start=None, unconverged=None):
        store = self.store
        logger.info("starting with newtons")
        include_mask = self.include_mask
        n = np.sum(include_mask)
        y = y.reshape(n)
        covariates = self.covariates.copy()
        group = store[chrom]
        positions = group["QC_positions"]
        mask = group["QC_mask"].value
        af = group["MAF"].value[mask]
        ncov = self.covariates.shape[1]
        baselikelihood = self.baseline_likelihood[chrom]
        if unconverged is None:
            L = len(positions)
        else:
            L = np.sum(unconverged)
            positions = positions[unconverged[:, 0]]
            af = af[unconverged[:, 0]]
            baselikelihood = baselikelihood[unconverged[:, 0]]

        hessians = np.zeros((int(np.ceil(L / 2)), ncov, ncov))
        diagonals = np.zeros((L, ncov))
        gradients = np.zeros((L, ncov))
        vals = np.zeros((L, 1))
        count = 0
        t = time.time()
        for i, position in enumerate(positions):
            #  if i == 10: #TODO
            #      totalT = (time.time()-t)/500
            #      print(f"Average time is {totalT} for n={n}, {t2/500}")
            #      print(f"Count is {count}")
            #      break
            if not i % 5000:
                logger.info(
                    f"After {time.time()-t:.1f}s done with {float(i/L)*100:.1f}% of iteration."
                )
            if af[i] < self.threshold or (1 - af[i]) < self.threshold:
                continue
            val = group[str(position)].value[include_mask]
            # Dumb imputation. Hopefully your data is already imputed and this doesn't happen
            val[np.isnan(val)] = 0
            # ind = ~np.isnan(val)
            # covariates[ind,1] = val[ind] * -y[ind]
            covariates[:, 1] = val * -y
            count += 1
            # submat = covariates[ind, :]
            h, diagonals[i], gradients[i], vals[i, 0] = ltri_Hessians(
                # submat, warm_start[i, :,0], ncov, submat.shape[0], 0) #rho set to zero
                covariates,
                warm_start[i, :, 0],
                ncov,
                covariates.shape[0],
                0)  # rho set to zero
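            # Pack two lower-triangular per-SNP Hessians into each square slot:
            # SNPs with even index fill the lower triangle, odd-index SNPs are
            # transposed into the upper triangle, halving the matrices sent over.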
            if i % 2:
                hessians[i // 2, :, :] += h.T
            else:
                hessians[i // 2, :, :] += h
        vals -= baselikelihood
        msg = pickle.dumps({
            "Estimated": chrom,
            "H": hessians,
            'g': gradients,
            'd': diagonals,
            'v': vals,
            "covar": covariates
        })
        networking.respond_to_server('api/tasks/ASSO/hessians', 'POST', msg,
                                     self.client_config['name'], self.env)
Example #13
0
def run_QC(filters, client_config, prefix, remove=True, env="production"):
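    # Apply the HWE / MAF / missingness filters to every chromosome in the plink HDF5
    # store. With remove=True the failing SNPs are deleted in place; otherwise a
    # <prefix>_mask and <prefix>_positions dataset record what passed.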
    def find_what_passes(qc_name, dset_name, tokeep, doubleSided=False):
        vals = group[dset_name].value
        if qc_name in filters:
            thresh = float(filters[qc_name])
            if not doubleSided:
                tokeep = np.logical_and(tokeep, vals > thresh)
            else:
                tokeep = np.logical_and(
                    tokeep,
                    np.logical_and(
                        vals > thresh - Settings.kSmallEpsilon,
                        (1.0 - vals) > thresh - Settings.kSmallEpsilon))
        return tokeep

    def replace_dataset(tokeep, dset_name, return_deleted=False):
        vals = group[dset_name].value
        remaining = vals[tokeep]
        deleted = vals[np.logical_not(tokeep)]
        write_or_replace(group, dset_name, remaining)
        if return_deleted:
            return deleted

    pfile = client_config["plinkfile"]
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'a') as store:
        for chrom in store.keys():
            if chrom == "meta":
                continue
            group = store[chrom]
            positions = group['positions'].value
            if "QC_mask" in group:
                tokeep = group["QC_mask"].value
            else:
                tokeep = np.ones_like(positions, dtype=bool)

            tokeep = find_what_passes(QCFilterNames.QC_HWE, "hwe", tokeep)
            tokeep = find_what_passes(QCFilterNames.QC_MAF,
                                      "MAF",
                                      tokeep,
                                      doubleSided=True)
            tokeep = find_what_passes(QCFilterNames.QC_MPS,
                                      "not_missing_per_snp", tokeep)
            logger.info(
                f"After filtering {chrom}, {np.sum(tokeep)} snps remain")
            if remove:  # Delete what doesn't pass
                replace_dataset(tokeep, 'hwe')
                replace_dataset(tokeep, 'VAR')
                replace_dataset(tokeep, 'MAF')
                replace_dataset(tokeep, 'not_missing_per_snp')
                deleted = replace_dataset(tokeep,
                                          'positions',
                                          return_deleted=True)
                for snp in deleted:
                    snp = str(snp)
                    if snp in group:
                        del group[snp]
            else:  # Store what has been tagged
                pass_mask = prefix + "_mask"
                pos_mask = prefix + "_positions"
                if pass_mask in group:
                    del group[pass_mask]
                if pos_mask in group:
                    del group[pos_mask]
                write_or_replace(group, pass_mask, val=tokeep, dtype=bool)
                positions = group['positions'].value[tokeep]
                write_or_replace(group, pos_mask, val=positions)
                if prefix == "PCA":
                    write_or_replace(group,
                                     "PCA_passed",
                                     val=np.ones(np.sum(tokeep), dtype=bool))
                    if 'non_ld_mask' in group:
                        del group['non_ld_mask']
    client_name = client_config['name']
    if prefix == "QC":
        networking.respond_to_server('api/tasks/QC/FIN', "POST", b'',
                                     client_name, env)
    else:
        networking.respond_to_server('api/tasks/PCA/FIN', "POST", b'',
                                     client_name, env)