def optimize_model(genes, samples, w, samples_to_genes, args, seed, logger=getLogger()):
    """Builds and optimizes the model using Gurobi.

    **Arguments:**

    * **genes** (*list*) - candidate alteration events.
    * **samples** (*list*) - sample identifiers.
    * **w** (*dict*) - weight of each sample in the objective.
    * **samples_to_genes** (*dict*) - maps each sample to the events altered in it.
    * **args** - parsed command-line arguments.
    * **seed** (*list*) - events forced into the solution.
    * **logger** - logger instance.

    **Returns:**

    * **z** (*float*) - optimal solution value.
    * **module** (*list*) - an optimal solution.
    * **e_single** (*str*) - best single event.
    * **cov** (*int*) - number of samples covered by the solution.
    * **act_cov** (*int*) - number of active samples covered by the solution.
    * **act_sample** (*int*) - total number of active samples.
    * **x**, **y** (*dict*) - the Gurobi decision variables.
    * **m** - the optimized Gurobi model.
    """
    try:
        # logger.info('* Finding best event...')
        z_single = float("-inf")
        for e in genes:
            z_e = sum(w[s] for s in events_to_samples[e])
            if z_e > z_single:
                z_single = z_e
                e_single = e

        # logger.info('* Building ILP model...')
        m = Model('superdendrix')
        m.setParam('OutputFlag', int(logger.getEffectiveLevel() > logging.INFO))
        if args.threads != -1:
            m.params.threads = args.threads

        # Variables
        x, y = {}, {}
        for g in genes:
            x[g] = m.addVar(vtype=GRB.BINARY, name='x_%s' % g)
        for p in samples:
            y[p] = m.addVar(vtype=GRB.BINARY, name='y_%s' % p)
        m.update()

        # Objective
        #obj = quicksum(y[p] for p in one_samples) - quicksum(y[p] for p in zero_samples)
        #if k != -1:
        obj = build_obj(x, y, w, args)
        m.setObjective(obj, GRB.MAXIMIZE)
        #else:
        #    m.setObjective(obj - quicksum(x[g] for g in genes) / len(genes), GRB.MAXIMIZE)

        # set the bound constraint if appropriate

        # set the seed genes
        for g in seed:
            m.addConstr(x[g] == 1)

        # cardinality constraint
        if k != -1:
            m.addConstr(quicksum(x[g] for g in genes) <= k)
        # m.addConstr(quicksum(x[g] for g in genes) >= 1)  # option 1 for 0-length modules

        # coverage constraints
        for p in samples:
            m.addConstr(y[p] <= quicksum(x[g] for g in samples_to_genes[p]))
            for g in samples_to_genes[p]:
                if w[p] < 0:
                    m.addConstr(y[p] >= x[g])
        m.update()
        #m.write('opt_REVEALER.lp')

        # if verbose: print('* Tuning model...')
        # m.tune()
        # m.getTuneResult(0)
        # m.write('tune.prm')

        # logger.info('* Optimizing model...')
        m.optimize()

        if m.SolCount == 0:
            logger.warning('%s No solution found, optimization status = %d' %
                           (args.target_column, m.Status))
            # placeholders matching (z, module, e_single, cov, act_cov, act_sample, x, y, m)
            return float("-inf"), [], "", -1, -1, -1, None, None, None
        else:
            z = m.ObjVal

            if k == -1 and args.reoptimize:
                # second optimization phase: minimize the number of genes
                logger.info('* Re-optimizing model...')
                m.addConstr(obj == z)
                m.setObjective(quicksum(x[g] for g in genes), GRB.MINIMIZE)
                m.optimize()
            #print(m.RunTime, t_ILP)

            ## output module ##
            module = []
            for g in genes:
                if x[g].X > 0.5:
                    module.append(g)
            #print(sorted(module))

            cov, act_cov, act_sample = 0, 0, 0
            for p in samples:
                cov += int(y[p].X)
                if w[p] > 0:
                    act_sample += 1
                    act_cov += int(y[p].X)

        ## pruning module
        #while module != None:
        #    worst_event, worst_count = conditional_permutation_test(module, events_to_samples, profile, samples, 1000)  #args.p_val_it)
        #    if args.verbosity: print(worst_event, worst_count/float(1000))  #args.p_val_it))
        #    print("########")
        #    if worst_count/float(1000) <= 0.01: break
        #    ##if worst_count <= 0: break
        #    module.remove(worst_event)
        #
        #cases_by_event = [events_to_samples[event] for event in module]
        #zP = SuperW(cases_by_event, profile, samples)

        ## compute p-value
        #mo.addConstr(quicksum(x[g] for g in genes) == len(module))
        #p_val_str = "n/a"
        #p_val_single_str = "n/a"
        #z_single = float("-inf")
        #if args.p_val_it != -1:
        #    no_better, no_better_single = 0, 0
        #    ##values = w.values()
        #    values = list(w.values())
        #    for i in range(args.p_val_it):
        #        random.shuffle(values)
        #        w2 = dict(zip(w.keys(), values))
        #        # adapt weights in objective function
        #        mo.setObjective(build_obj(x, y, w2, args), GRB.MAXIMIZE)
        #        mo.update()
        #        mo.optimize()
        #        if mo.ObjVal >= zP: no_better = no_better + 1
        #        # p-value for single event:
        #        z_single_s = float("-inf")
        #        for e in genes:
        #            z_e = sum(w2[s] for s in events_to_samples[e])
        #            if z_e > z_single_s: z_single_s = z_e
        #        if z_single_s >= z_single: no_better_single = no_better_single + 1
        #    p_val_str = str(no_better/float(args.p_val_it))
        #    p_val_single_str = str(no_better_single/float(args.p_val_it))

    except GurobiError as e:
        logger.error('Gurobi error: %s' % str(e))
        raise

    return z, module, e_single, cov, act_cov, act_sample, x, y, m  ##, p_val_str, p_val_single_str
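# ---------------------------------------------------------------------------
# Illustrative sketch only: `build_obj` is called in optimize_model() but
# defined elsewhere in the package. Assuming the objective is the weighted
# coverage sum_p w[p] * y[p] -- consistent with the commented-out alternative
# objective above -- a minimal version could look like the following. Treat
# the body as an assumption, not the project's actual implementation.
#
#   def build_obj(x, y, w, args):
#       """Weighted-coverage objective: sum of profile weights of covered samples."""
#       return quicksum(w[p] * y[p] for p in y)
# ---------------------------------------------------------------------------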
parser.add_argument('-efc', '--excluded_feature_classes', type=str, required=False,
                    nargs='*', default=[], choices=FEATURE_CLASSES)
parser.add_argument('-rs', '--random_seed', type=int, default=12345, required=False)
args = parser.parse_args(sys.argv[1:])

# Set up logger
logger = getLogger(args.verbosity)

# Load the input data
X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
y = pd.read_csv(args.outcome_file, index_col=0, sep='\t')
feature_classes = pd.read_csv(args.feature_class_file, index_col=0, sep='\t')

# Align the features and outcomes
patients = X.index
X = X.reindex(index=patients)
y = y.reindex(index=patients)
outcome_name = y.columns[0]

# Create some data structures to hold our output
json_output = dict(patients=list(map(str, patients)), params=vars(args))
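# ---------------------------------------------------------------------------
# Note on the alignment step above: `DataFrame.reindex(index=patients)` reorders
# rows to match `patients` and inserts NaN rows for any patient missing from the
# frame. A small self-contained example (toy data, not part of the pipeline):
#
#   import pandas as pd
#   X = pd.DataFrame({'f1': [1.0, 2.0]}, index=['P1', 'P2'])
#   y = pd.DataFrame({'outcome': [0.5]}, index=['P2'])
#   patients = X.index
#   y = y.reindex(index=patients)   # rows P1 (NaN) and P2 (0.5), in that order
# ---------------------------------------------------------------------------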
def run(args):
    t_init = time.time()

    # Set up logger
    logger = getLogger(args.verbosity)
    logger.info("# calling %s" % " ".join(sys.argv))
    logger.info("# at time %s" % time.strftime("%H:%M:%S on %a, %d %b %Y "))
    logger.info("# on machine %s" % socket.gethostname())

    # Shared state that optimize_model() reads as module-level globals
    global k, N, events_to_samples, samples_to_genes, samples, module
    k = args.cardinality
    mutationMatrix = args.mutation_matrix
    if args.seed:
        seed = eval(args.seed)  # expects a Python list literal of event names
    else:
        seed = []
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    # Generate/load the target profile
    w = {}  # weights
    global numNanInf
    numNanInf = 0
    if args.target_format == 'revealer':
        with open(args.target) as f:
            arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")]
        for arr in arrs:
            w[arr[0]] = float(arr[1]) + args.offset
            if args.direction == 'negative':
                w[arr[0]] = -w[arr[0]]
            if args.unit_weights:
                if w[arr[0]] <= 0:
                    w[arr[0]] = -1
                else:
                    w[arr[0]] = +1
    elif args.target_format == 'achilles':
        assert args.target_column
        with open(args.target) as f:
            line = f.readline()
            ind = line.rstrip().split().index(args.target_column)  # + 1
            arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")]
            #arrs = [re.findall(r"[-\w']+", l) for l in f if not l.startswith("#")]
        #for arr in [arr for arr in arrs if arr[0] in patients]:
        for arr in arrs:
            try:
                if math.isnan(float(arr[ind])):
                    logger.info("Warning: profile of sample %s is NaN: removing sample." % arr[0])
                    numNanInf += 1
                elif math.isinf(float(arr[ind])):
                    w[arr[0]] = 100.0 if float(arr[ind]) == float("inf") else -100.0
                    if args.direction == 'negative':
                        w[arr[0]] = -w[arr[0]]
                    numNanInf += 1
                else:
                    w[arr[0]] = float(arr[ind]) + args.offset
                    if args.direction == 'negative':
                        w[arr[0]] = -w[arr[0]]
                    if args.unit_weights:
                        if w[arr[0]] <= 0:
                            w[arr[0]] = -1
                        else:
                            w[arr[0]] = +1
            # except Exception as ex:
            #     template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            #     message = template.format(type(ex).__name__, ex.args)
            #     print(message)
            except Exception:
                logger.warning("Warning: could not parse profile value for %s: %s" % (arr[0], arr[ind]))
                # skip these

    # check for patients w/out weights:
    #print(arr[0], w[arr[0]])

    # terminate if too many Inf or NaN values
    if float(numNanInf) / len(arrs) > 0.1:
        print("too many NaN or Inf values, terminating")
        logger.info('NaN, Inf fraction: %s ' % (float(numNanInf) / len(arrs)))
        if args.output_file:
            with open(args.output_file, 'w') as of:
                of.write(str(args.target_column) + "\t")
                of.write("incomplete due to high NaN, Inf\t")
                of.write("NaN, Inf fraction: %s" % (float(numNanInf) / len(arrs)))
        return 1

    samples1 = list(w.keys())
    eventToCases, mutation_samples = load_events(args.mutation_matrix, verbose=1)
    profiles, profile_samples = load_profiles(args.target,
                                              sample_whitelist=mutation_samples,
                                              verbose=1)

    # Load the mutation data
    #print('####')
    #print(len(samples1))
    mutations = load_mutation_data(mutationMatrix, samples1, args.gene_file,
                                   args.mutations_only, args.min_freq, args.max_freq)
    m, N, genes, samples, events_to_samples, samples_to_genes = mutations
    #print(set(eventToCases.keys()) - set(genes))

    coef = 1
    if args.direction == 'negative':
        coef = -1
    global profile
    profile = dict((s, coef * profiles[args.target_column][i])
                   for i, s in enumerate(profile_samples) if s in samples)

    logger.info('* Mutation data')
    logger.info('\t- Alterations: %s' % m)
    logger.info('\t- Samples: %s' % N)
    logger.info('* Seed genes: %s' % ','.join(seed))
    #print(w)
    #
    # if set(patients) - set(w.keys()):
    #     nokeys = set(patients) - set(w.keys())
    #     if verbose & len(nokeys): print("Warning:", len(nokeys), "samples w/out weights")  # : ", nokeys
    #     patients = list(set(patients) & set(w.keys()))

    t_start = time.time()

    module = args.query.split(",")
    cases_by_event = [events_to_samples[event] for event in module]
    zP = SuperW(cases_by_event, profile, samples)
    max_score = sum(w[p] for p in samples if w[p] > 0)
    if max_score == 0:
        max_score = -0.1
    print("module is:", module)
    print("score is:", zP, "max_score is:", max_score)
    print("score/max_score is:", float(zP) / max_score)

    if args.cond_it != -1:
        while len(module) > 1:
            logger.info("len(module) = %d" % len(module))
            worst_event, worst_count = conditional_permutation_test(
                module, events_to_samples, profile, samples, args.cond_it)
            #if args.verbosity: print('worst', worst_event, worst_count / float(args.cond_it))
            logger.info('worst = %s %s' % (worst_event, worst_count / float(args.cond_it)))
            if worst_count / float(args.cond_it) <= (1.0 / float(args.cond_it)):
                break
            ##if worst_count <= 0: break
            module.remove(worst_event)

    # matrix permutation using the curveball method
    k = len(module)
    t_s1 = time.time()
    p_val_str = "n/a"  # defaults in case the curveball test below is skipped
    avg_p_z = "NA"
    if args.curveball and len(module) > 0 and args.p_val_it > 0:
        print("curveball")
        nullmat_dir = args.null_matrices
        c = 0
        t_curve_arr = []
        t_opt_arr = []
        no_better = 0
        p_z_arr = []
        for i in range(args.p_val_it):
            t_s = time.time()
            p_mutations = load_mutation_data(nullmat_dir + str(i) + ".tsv", samples1,
                                             args.gene_file, args.mutations_only,
                                             args.min_freq, args.max_freq)
            p_m, p_N, p_genes, p_samples, p_events_to_samples, p_samples_to_genes = p_mutations
            t_curve = time.time() - t_s
            # print("one curveball loading takes (seconds):", t_curve)
            t_s = time.time()
            p_z, p_module, p_best_single, p_cov, p_act_cov, p_act_sample, p_x, p_y, p_mo = optimize_model(
                genes, samples, w, p_samples_to_genes, args, seed, logger)  # change args.k
            if p_z >= zP:
                no_better = no_better + 1
            p_z_arr.append(p_z)
            c += 1
            t_opt2 = time.time() - t_s
            # t_curve_arr.append(t_curve)
            # t_opt_arr.append(t_opt2)
print("one optimization after curveball permutation (sec)", t_opt2) if c % 1000 == 0: print("cycle:", str(c)) print("time:", str((time.time() - t_s1) / 60), "min") p_val_str = str(float(no_better) / args.p_val_it) print(str((time.time() - t_s1) / 60), "min") avg_p_z = str(float(sum(p_z_arr)) / len(p_z_arr)) # print(str(sum(t_curve_arr)/len(t_curve_arr)), "average curve time in sec") # print(str(sum(t_opt_arr)/len(t_opt_arr)), "average opt time in sec") # compute p-value if len(module) == 0: avg_p_z = "NA" #print module #print("here") #if args.verbosity: print([(e, len(events_to_samples[e])) for e in module]) mut_samples = {s for g in module for s in events_to_samples[g]} # we multiply by minus one because the weights had been transformed that way (effect is just changing the sign of IC, but this way it is correct) ordered_w = [-w[s] for s in samples] #IC = ic(ordered_w, mut_samples, samples, seed_cases=set(), metric="IC") ##for individual aberration scores global scores_ind scores_ind = [] scores_dict = dict() for i in range(len(module)): sampleset = [events_to_samples[module[i]]] scores_ind.append(float(round(SuperW(sampleset, profile, samples), 2))) scores_dict[module[i]] = float(scores_ind[i]) scores_ind.sort(reverse=True) scores_ind = map(str, scores_ind) #print(scores_ind) module.sort(key=lambda g: float(scores_dict[g]), reverse=True) #print(module) ##for coverage cover = set() cov_ind = [] for i in range(len(module)): cov_ind.append(str(len(events_to_samples[module[i]]))) for i in range(len(module)): cover = cover.union(events_to_samples[module[i]]) logger.info(", ".join(module)) ##urlencode dataset = "Project Achilles" if args.target_format == 'achilles' else "Project Revealer" query_input = [('dataset', dataset), ('profile', args.target_column), ('sample_lists', 'CERES'), ('events', " ".join(module).replace("_MUT", ""))] queries = urlencode(query_input, quote_via=quote) url = "https://superdendrix-data-explorer.lrgr.io/#" ##Wilcoxon rank sum test if len(module) == 0: ranksum_pval = 1 else: mutsamples = set() for i in range(len(module)): mutsamples = mutsamples.union(events_to_samples[module[i]]) nomutsamples = set(samples) - mutsamples mutscores = [profile[s] for s in mutsamples] nomutscores = [profile[s] for s in nomutsamples] ranksum_pval = ranksums(mutscores, nomutscores)[1] ##output if args.output_file: of = open(args.output_file, 'w') of.write( "target\taberrations\t#sample\tscores\tp-value\tz\tmax_score\tz/max_score(%)\tcoverage\tranksum_pval\tbrowser_link\tt_total\tavg_random_z\n" ) of.write(str(args.target_column) + "\t") if len(module) > 0: of.write(",".join(module)) of.write("\t" + ",".join(cov_ind)) of.write("\t" + ",".join(scores_ind)) else: of.write("nan") of.write("\t" + "nan") of.write('\t' + "nan") of.write("\t" + str(p_val_str) + "\t" + str(round(zP, 4)) + "\t" + str(round(max_score, 4)) + '\t' + str(round(100 * zP / max_score, 4)) + "\t" + str(len(cover)) + "/" + str(N) + "\t") of.write("\t" + str(ranksum_pval) + "\t" + url + queries) of.write("\t" + str(time.time() - t_init)) of.write("\t" + avg_p_z) for g in module: logger.info('%s %s' % (g, len(events_to_samples[g]))) return module
def map_permutation_test(args):
    # Set up logger
    logger = getLogger(args.verbosity)

    # Load required modules
    from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_predict
    from metrics import compute_metrics

    # Load the input data
    X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
    y = pd.read_csv(args.outcome_file, index_col=0, sep='\t')
    feature_classes = pd.read_csv(args.feature_class_file, index_col=0, sep='\t')

    # Align the features and outcomes
    patients = X.index
    X = X.reindex(index=patients)
    y = y.reindex(index=patients)
    outcome_name = y.columns[0]

    # Restrict to the training columns
    selected_feature_classes = set(map(str.capitalize,
                                       set(FEATURE_CLASSES) - set(args.excluded_feature_classes)))
    training_cols = feature_classes.index[feature_classes['Class'].isin(selected_feature_classes)].tolist()

    ############################################################################
    # RUN PERMUTATION TEST
    ############################################################################
    # Initialize the model
    pipeline, gscv = init_model(args.model, args.n_jobs, args.estimator_random_seed,
                                args.max_iter, args.tol)

    # Permute the outcomes
    np.random.seed(args.permutation_random_seed)
    y[outcome_name] = np.random.permutation(y[outcome_name])

    # Convert dataframes to matrices to avoid dataframe splitting error
    outer_cv = LeaveOneOut()
    preds = pd.Series(cross_val_predict(estimator=gscv, X=X.loc[:, training_cols],
                                        y=y[outcome_name], cv=outer_cv,
                                        n_jobs=args.n_jobs,
                                        verbose=61 if args.verbosity > 0 else 0),
                      index=patients)

    # Evaluate the results
    sub_y = y.loc[patients][outcome_name].values
    sub_preds = preds.loc[patients].values
    metric_vals, var_explained = compute_metrics(sub_y, sub_preds)

    ############################################################################
    # OUTPUT TO FILE
    ############################################################################
    with open(args.output_file, 'w') as OUT:
        output = {
            "var_explained": var_explained.tolist(),
            "true": sub_y.tolist(),
            "preds": sub_preds.tolist(),
            "params": vars(args),
            "training_features": training_cols
        }
        output.update(metric_vals.items())
        json.dump(output, OUT)
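# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): `init_model` is defined elsewhere in the
# repository; its estimators and hyperparameter grid are not shown in this file.
# A minimal version matching the call signature above, using an ElasticNet
# pipeline as a stand-in estimator and ignoring the `model` selector, might be:
#
#   from sklearn.pipeline import Pipeline
#   from sklearn.preprocessing import StandardScaler
#   from sklearn.linear_model import ElasticNet
#   from sklearn.model_selection import GridSearchCV
#
#   def init_model(model, n_jobs, random_seed, max_iter, tol):
#       pipeline = Pipeline([
#           ('scale', StandardScaler()),
#           ('regress', ElasticNet(random_state=random_seed, max_iter=max_iter, tol=tol)),
#       ])
#       param_grid = {'regress__alpha': [0.01, 0.1, 1.0]}
#       gscv = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=n_jobs)
#       return pipeline, gscv
# ---------------------------------------------------------------------------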