Example #1
def optimize_model(genes,
                   samples,
                   w,
                   samples_to_genes,
                   args,
                   seed,
                   logger=getLogger()):
    """Builds and optimizes the model using Gurobi.
      * **bound** (*bool*) - use a bound (opt value has to be better than that bound)
      * **z_bound** (*float*) - bound to use in that case

    **Returns:**
      * **z** (*float*) - optimal solution value.
      * **module** (*list*) - an optimal solution.
      * **cov** (*int*) - no of samples covered by solution
      * **act_cov** (*int*) - no of active samples covered by solution
      * **act_sample** (*int*) - no of total active samples
    """
    try:
        logger.info('* Finding best single event...')
        # events_to_samples is a module-level global populated by run().
        z_single, e_single = float("-inf"), None
        for e in genes:
            z_e = sum(w[s] for s in events_to_samples[e])
            if z_e > z_single:
                z_single = z_e
                e_single = e

        logger.info('* Building ILP model...')
        m = Model('superdendrix')
        # Silence Gurobi's own log unless we are logging at INFO or lower.
        m.setParam('OutputFlag',
                   int(logger.getEffectiveLevel() > logging.INFO))
        if args.threads != -1: m.params.threads = args.threads
        # Decision variables: x[g] = 1 iff gene g is in the module;
        # y[p] = 1 iff sample p is counted as covered.
        x, y = {}, {}
        for g in genes:
            x[g] = m.addVar(vtype=GRB.BINARY, name='x_%s' % g)
        for p in samples:
            y[p] = m.addVar(vtype=GRB.BINARY, name='y_%s' % p)
        m.update()

        # Keep a handle on the objective expression: the optional
        # re-optimization phase below fixes it at its optimal value.
        obj = build_obj(x, y, w, args)
        m.setObjective(obj, GRB.MAXIMIZE)
        # fix the seed genes in the solution
        for g in seed:
            m.addConstr(x[g] == 1)

        # cardinality constraint (k is a module-level global set in run();
        # k == -1 means no bound)
        if k != -1: m.addConstr(quicksum(x[g] for g in genes) <= k)
        # m.addConstr(quicksum(x[g] for g in genes) >= 1)  # alternative: forbid empty modules

        # coverage constraints: a sample can only count as covered if one of
        # its mutated genes is selected; negatively weighted samples must be
        # counted whenever any of their genes is selected
        for p in samples:
            m.addConstr(y[p] <= quicksum(x[g] for g in samples_to_genes[p]))
            for g in samples_to_genes[p]:
                if w[p] < 0: m.addConstr(y[p] >= x[g])

        m.update()

        logger.info('* Optimizing model...')
        m.optimize()

        if m.SolCount == 0:
            logger.warning('%s No solution found, optimization status = %d' %
                           (args.target_column, m.Status))
            # z, module, e_single, cov, act_cov, act_sample, x, y, m
            # (matches the arity of the normal return below)
            return float("-inf"), [], None, -1, -1, -1, None, None, None
        else:
            z = m.ObjVal

            if k == -1 and args.reoptimize:  # second phase: minimize module size
                logger.info('* Re-optimizing model...')
                # Fix the objective at its optimum, then find the smallest
                # module among all optimal solutions.
                m.addConstr(obj == m.ObjVal)
                m.setObjective(quicksum(x[g] for g in genes), GRB.MINIMIZE)
                m.optimize()

        ## extract the optimal module ##
        module = []
        for g in genes:
            if x[g].X > 0.5:  # binaries may come back as 0.9999...
                module.append(g)
        cov, act_cov, act_sample = 0, 0, 0
        for p in samples:
            cov += int(round(y[p].X))
            if w[p] > 0:
                act_sample += 1
                act_cov += int(round(y[p].X))

        # (disabled code removed: module pruning via
        # conditional_permutation_test and the permutation-based p-value
        # computation; both now live in run() below)
    except GurobiError as e:
        logger.error('Gurobi error: %s' % e)
        raise

    return z, module, e_single, cov, act_cov, act_sample, x, y, m
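
A minimal sketch of how optimize_model might be driven. It assumes the call is made from the same module, so the globals the function reads (k and events_to_samples) can be set directly; every gene/sample name and Namespace field below is hypothetical, and build_obj (not shown in this excerpt) may consult additional args fields.

from argparse import Namespace

# Toy data; all names are hypothetical.
k = 2                                                # cardinality bound
events_to_samples = {'G1': {'S1', 'S2'}, 'G2': {'S3'}}
genes = ['G1', 'G2']
samples = ['S1', 'S2', 'S3']
w = {'S1': 1.5, 'S2': -0.3, 'S3': 0.8}               # per-sample weights
samples_to_genes = {'S1': ['G1'], 'S2': ['G1'], 'S3': ['G2']}
args = Namespace(threads=-1, reoptimize=False, target_column='toy_profile')

z, module, e_single, cov, act_cov, act_sample, x, y, m = optimize_model(
    genes, samples, w, samples_to_genes, args, seed=[])
print('z=%.2f module=%s coverage=%d/%d' % (z, module, cov, len(samples)))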
Example #2
parser.add_argument('-efc',
                    '--excluded_feature_classes',
                    type=str,
                    required=False,
                    nargs='*',
                    default=[],
                    choices=FEATURE_CLASSES)
parser.add_argument('-rs',
                    '--random_seed',
                    type=int,
                    default=12345,
                    required=False)
args = parser.parse_args(sys.argv[1:])

# Set up logger
logger = getLogger(args.verbosity)

# Load the input data
X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
y = pd.read_csv(args.outcome_file, index_col=0, sep='\t')
feature_classes = pd.read_csv(args.feature_class_file, index_col=0, sep='\t')

# Align the features and outcomes
patients = X.index
X = X.reindex(index=patients)
y = y.reindex(index=patients)
outcome_name = y.columns[0]

# Create some data structures to hold our output
json_output = dict(patients=list(map(float, patients)), params=vars(args))
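
The json_output dict assembled above is presumably written to disk later in the script. A minimal sketch of that step, with a hypothetical file name (the original output path is not shown in this excerpt):

import json

with open('model_output.json', 'w') as OUT:  # hypothetical path
    json.dump(json_output, OUT, indent=2)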
Example #3
def run(args):
    t_init = time.time()
    # Set up logger
    logger = getLogger(args.verbosity)
    logger.info("# calling %s" % " ".join(sys.argv))
    logger.info("# at time %s" % time.strftime("%H:%M:%S on %a, %d %b %Y "))
    logger.info("# on machine %s" % socket.gethostname())

    # Read the query parameters; several are shared with optimize_model as
    # module-level globals.
    global k, N, events_to_samples, samples_to_genes, samples, module
    k = args.cardinality
    mutationMatrix = args.mutation_matrix
    if args.seed:
        # args.seed is expected to be a Python list literal; parse it safely
        # rather than calling eval() on command-line input.
        import ast
        seed = ast.literal_eval(args.seed)
    else:
        seed = []

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    # Generate/load the target profile
    w = {}  # per-sample weights
    global numNanInf
    numNanInf = 0
    if args.target_format == 'revealer':
        with open(args.target) as f:
            arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")]
            for arr in arrs:
                w[arr[0]] = float(arr[1]) + args.offset
                if args.direction == 'negative': w[arr[0]] = -w[arr[0]]
                if args.unit_weights:
                    w[arr[0]] = -1 if w[arr[0]] <= 0 else +1
    elif args.target_format == 'achilles':
        assert args.target_column
        with open(args.target) as f:
            line = f.readline()
            ind = line.rstrip().split().index(args.target_column)
            arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")]
            for arr in arrs:
                try:
                    if math.isnan(float(arr[ind])):
                        logger.info(
                            "Warning: profile of sample %s is NaN: removing sample."
                            % arr[0])
                        numNanInf += 1
                    elif math.isinf(float(arr[ind])):
                        # clamp infinite profile values to +/- 100
                        w[arr[0]] = 100.0 if float(
                            arr[ind]) == float("inf") else -100.0
                        if args.direction == 'negative': w[arr[0]] = -w[arr[0]]
                        numNanInf += 1
                    else:
                        w[arr[0]] = float(arr[ind]) + args.offset
                        if args.direction == 'negative': w[arr[0]] = -w[arr[0]]
                        if args.unit_weights:
                            w[arr[0]] = -1 if w[arr[0]] <= 0 else +1
                except (ValueError, IndexError):
                    # skip malformed rows, but log them
                    logger.warning("Warning: skipping malformed row: %s" %
                                   "\t".join(arr))

    # terminate if too many Inf or NaN values were seen
    if float(numNanInf) / len(arrs) > 0.1:
        logger.warning("too many NaN or Inf values, terminating")
        logger.info('NaN, Inf fraction: %s' % (float(numNanInf) / len(arrs)))
        if args.output_file:
            with open(args.output_file, 'w') as of:
                of.write(str(args.target_column) + "\t")
                of.write("incomplete due to high NaN, Inf\t")
                of.write("NaN, Inf fraction: %s" % (float(numNanInf) / len(arrs)))
        return 1

    samples1 = list(w.keys())  # materialize for repeated use below
    eventToCases, mutation_samples = load_events(args.mutation_matrix,
                                                 verbose=1)
    profiles, profile_samples = load_profiles(
        args.target, sample_whitelist=mutation_samples, verbose=1)

    # Load the mutation data
    mutations = load_mutation_data(mutationMatrix, samples1, args.gene_file,
                                   args.mutations_only, args.min_freq,
                                   args.max_freq)
    m, N, genes, samples, events_to_samples, samples_to_genes = mutations
    # flip the profile sign when optimizing in the negative direction
    coef = -1 if args.direction == 'negative' else 1

    global profile
    profile = dict((s, coef * profiles[args.target_column][i])
                   for i, s in enumerate(profile_samples) if s in samples)

    logger.info('* Mutation data')
    logger.info('\t- Alterations: %s' % m)
    logger.info('\t- Samples: %s' % N)
    logger.info('* Seed genes: %s' % ','.join(seed))


    t_start = time.time()

    # score the queried module
    module = args.query.split(",")
    cases_by_event = [events_to_samples[event] for event in module]
    zP = SuperW(cases_by_event, profile, samples)
    max_score = sum(w[p] for p in samples if w[p] > 0)
    if max_score == 0:
        # avoid division by zero below; the negative sentinel makes the
        # score/max_score ratio stand out for this degenerate case
        max_score = -0.1
    print("module is:", module)
    print("score is:", zP, "max_score is:", max_score)
    print("score/max_score is:", float(zP) / max_score)

    if args.cond_it != -1:
        # iteratively prune the least significant event from the module
        while len(module) > 1:
            logger.info("len module " + str(len(module)))
            worst_event, worst_count = conditional_permutation_test(
                module, events_to_samples, profile, samples, args.cond_it)
            logger.info('worst= ' + str(worst_event) + ' ' +
                        str(worst_count / float(args.cond_it)))
            if worst_count / float(args.cond_it) <= (1.0 / float(args.cond_it)):
                break
            module.remove(worst_event)

    # matrix permutation test using the curveball method
    k = len(module)
    t_s1 = time.time()
    p_val_str, avg_p_z = "NA", "NA"  # defaults when the permutation test is skipped
    if args.curveball and len(module) > 0 and args.p_val_it > 0:
        print("curveball")
        nullmat_dir = args.null_matrices
        c = 0
        no_better = 0
        p_z_arr = []

        for i in range(args.p_val_it):
            # load the i-th curveball-permuted mutation matrix
            p_mutations = load_mutation_data(nullmat_dir + str(i) + ".tsv",
                                             samples1, args.gene_file,
                                             args.mutations_only,
                                             args.min_freq, args.max_freq)
            p_m, p_N, p_genes, p_samples, p_events_to_samples, p_samples_to_genes = p_mutations

            # re-optimize against the permuted matrix and count how often the
            # null score matches or beats the observed score zP
            p_z, p_module, p_best_single, p_cov, p_act_cov, p_act_sample, p_x, p_y, p_mo = optimize_model(
                genes, samples, w, p_samples_to_genes, args, seed, logger)
            if p_z >= zP: no_better = no_better + 1
            p_z_arr.append(p_z)
            c += 1
            if c % 1000 == 0:
                print("cycle:", str(c))
                print("time:", str((time.time() - t_s1) / 60), "min")
        p_val_str = str(float(no_better) / args.p_val_it)
        print(str((time.time() - t_s1) / 60), "min")
        avg_p_z = str(float(sum(p_z_arr)) / len(p_z_arr))

    if len(module) == 0:
        avg_p_z = "NA"

    mut_samples = {s for g in module for s in events_to_samples[g]}
    # the weights were sign-flipped when loaded, so flip them back here;
    # this only changes the sign of the IC metric
    ordered_w = [-w[s] for s in samples]
    # IC = ic(ordered_w, mut_samples, samples, seed_cases=set(), metric="IC")

    ## individual aberration scores
    global scores_ind
    scores_ind = []
    scores_dict = dict()
    for g in module:
        sampleset = [events_to_samples[g]]
        score = float(round(SuperW(sampleset, profile, samples), 2))
        scores_ind.append(score)
        scores_dict[g] = score
    scores_ind.sort(reverse=True)
    scores_ind = list(map(str, scores_ind))  # list, not a one-shot map object
    module.sort(key=lambda g: scores_dict[g], reverse=True)

    ## per-event and overall coverage
    cover = set()
    cov_ind = []
    for g in module:
        cov_ind.append(str(len(events_to_samples[g])))
        cover = cover.union(events_to_samples[g])

    logger.info(", ".join(module))

    ## build the data-explorer browser link
    dataset = "Project Achilles" if args.target_format == 'achilles' else "Project Revealer"
    query_input = [('dataset', dataset), ('profile', args.target_column),
                   ('sample_lists', 'CERES'),
                   ('events', " ".join(module).replace("_MUT", ""))]
    queries = urlencode(query_input, quote_via=quote)
    url = "https://superdendrix-data-explorer.lrgr.io/#"

    ## Wilcoxon rank-sum test: mutated vs. non-mutated samples
    if len(module) == 0:
        ranksum_pval = 1
    else:
        mutsamples = set()
        for g in module:
            mutsamples = mutsamples.union(events_to_samples[g])
        nomutsamples = set(samples) - mutsamples

        mutscores = [profile[s] for s in mutsamples]
        nomutscores = [profile[s] for s in nomutsamples]

        ranksum_pval = ranksums(mutscores, nomutscores)[1]

    ## write the output row
    if args.output_file:
        with open(args.output_file, 'w') as of:
            of.write(
                "target\taberrations\t#sample\tscores\tp-value\tz\tmax_score\tz/max_score(%)\tcoverage\tranksum_pval\tbrowser_link\tt_total\tavg_random_z\n"
            )
            of.write(str(args.target_column) + "\t")
            if len(module) > 0:
                of.write(",".join(module))
                of.write("\t" + ",".join(cov_ind))
                of.write("\t" + ",".join(scores_ind))
            else:
                of.write("nan\tnan\tnan")

            # a single tab separates each column, matching the header row
            of.write("\t" + str(p_val_str) + "\t" + str(round(zP, 4)) + "\t" +
                     str(round(max_score, 4)) + "\t" +
                     str(round(100 * zP / max_score, 4)) + "\t" +
                     str(len(cover)) + "/" + str(N))
            of.write("\t" + str(ranksum_pval) + "\t" + url + queries)
            of.write("\t" + str(time.time() - t_init))
            of.write("\t" + avg_p_z + "\n")

    for g in module:
        logger.info('%s %s' % (g, len(events_to_samples[g])))

    return module
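
A hedged sketch of a driver for run(). Every Namespace field below is referenced somewhere in the function body, but the values and file paths are invented, and the full argument parser is not shown in this excerpt:

from argparse import Namespace

args = Namespace(
    verbosity=1, cardinality=3, mutation_matrix='events.tsv', seed='',
    random_seed=12345, target_format='achilles', target='ceres_scores.tsv',
    target_column='BRAF', offset=0.0, direction='negative',
    unit_weights=False, gene_file=None, mutations_only=False,
    min_freq=0, max_freq=10**9, query='BRAF_MUT,NRAS_MUT',
    cond_it=-1, curveball=False, p_val_it=-1, null_matrices='nulls/',
    threads=-1, reoptimize=False, output_file='result.tsv')
module = run(args)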
def map_permutation_test(args):
    # Set up logger
    logger = getLogger(args.verbosity)

    # Load required modules
    from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_predict
    from metrics import compute_metrics

    # Load the input data
    X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
    y = pd.read_csv(args.outcome_file, index_col=0, sep='\t')
    feature_classes = pd.read_csv(args.feature_class_file, index_col=0, sep='\t')

    # Align the features and outcomes
    patients = X.index
    X = X.reindex(index=patients)
    y = y.reindex(index=patients)
    outcome_name = y.columns[0]

    # Restrict to the training columns
    selected_feature_classes = set(map(str.capitalize, set(FEATURE_CLASSES) - set(args.excluded_feature_classes)))
    # keep only the features whose class was not excluded (apply the boolean
    # mask; taking .index of the mask itself would select every feature)
    mask = feature_classes['Class'].isin(selected_feature_classes)
    training_cols = feature_classes.index[mask].tolist()

    ############################################################################
    # RUN PERMUTATION TEST
    ############################################################################
    # Initialize the model
    pipeline, gscv = init_model(args.model, args.n_jobs,
                                args.estimator_random_seed, args.max_iter,
                                args.tol)

    # Permute the outcomes
    np.random.seed(args.permutation_random_seed)
    y[outcome_name] = np.random.permutation(y[outcome_name])

    # Predict each patient's outcome with leave-one-out cross-validation
    outer_cv = LeaveOneOut()
    preds = pd.Series(cross_val_predict(estimator=gscv,
                                        X=X.loc[:, training_cols],
                                        y=y[outcome_name], cv=outer_cv,
                                        n_jobs=args.n_jobs,
                                        verbose=61 if args.verbosity > 0 else 0),
                      index=patients)

    # Evaluate results
    sub_y = y.loc[patients][outcome_name].values
    sub_preds = preds.loc[patients].values
    metric_vals, var_explained = compute_metrics(sub_y, sub_preds)

    ############################################################################
    # OUTPUT TO FILE
    ############################################################################
    with open(args.output_file, 'w') as OUT:
        output = {
            "var_explained": var_explained.tolist(),
            "true": sub_y.tolist(),
            "preds": sub_preds.tolist(),  # was the literal string "sub_preds"
            "params": vars(args),
            "training_features": training_cols
        }
        output.update(metric_vals)
        json.dump(output, OUT)
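
And a similar sketch for map_permutation_test; again, the field names mirror those used in the function body while the values and paths are hypothetical, and init_model's accepted model identifiers are not shown in this excerpt:

from argparse import Namespace

args = Namespace(
    verbosity=1, feature_file='features.tsv', outcome_file='outcomes.tsv',
    feature_class_file='classes.tsv', excluded_feature_classes=[],
    model='elastic_net',  # assumed identifier; init_model is not shown
    n_jobs=1, estimator_random_seed=12345, max_iter=1000, tol=1e-4,
    permutation_random_seed=1, output_file='permutation_test.json')
map_permutation_test(args)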