def update_loss_with_prior(loss_matrix, prior_params, prior_motifreg,
                           prior_regreg, iteration,
                           best_split_regulator=None):
    """Re-weight a loss matrix using the optional priors.

    Args:
        loss_matrix: loss values; returned untouched when both priors are
            None.
        prior_params: object exposing ``prior_constant`` and
            ``prior_decay_rate``; only read when a prior is applied.
        prior_motifreg: motif-regulator prior (its ``.data`` is used), or
            None to skip this step.
        prior_regreg: regulator-regulator prior (its ``.data`` and
            ``.row_labels`` are used), or None to skip this step.
        iteration: exponent applied to ``prior_decay_rate``.
        best_split_regulator: label of the regulator at the best split;
            required when ``prior_regreg`` is given. The value ``'root'``
            disables the regulator-regulator step.

    Returns:
        The updated loss matrix.

    Raises:
        AssertionError: if ``prior_regreg`` is given without
            ``best_split_regulator``.
    """
    new_loss = loss_matrix

    # Motif-regulator prior: scale the loss *decrease* (1 - loss) and map it
    # back to a loss.
    if prior_motifreg is not None:
        ones_mat = np.ones(prior_motifreg.data.shape)
        loss_decrease = ones_mat - loss_matrix
        motifreg_multiplier = (
            prior_motifreg.data
            * prior_params.prior_constant
            * np.power(prior_params.prior_decay_rate, iteration)
            + 1)
        loss_decrease_weighted = util.element_mult(loss_decrease,
                                                   motifreg_multiplier)
        new_loss = ones_mat - loss_decrease_weighted

    if prior_regreg is None:
        return new_loss

    # Regulator-regulator prior: needs to know which regulator was split on.
    assert best_split_regulator is not None, (
        'best_split_regulator is required when prior_regreg is given')
    if best_split_regulator == 'root':
        return new_loss

    reg_row = prior_regreg.data[
        prior_regreg.row_labels == best_split_regulator, ]
    # Stack the selected prior row(s) once per row of the loss matrix.
    # List repetition replaces the Python-2-only ``xrange`` comprehension.
    reg_mat = np.vstack([reg_row] * new_loss.shape[0])
    regreg_multiplier = (
        reg_mat
        * prior_params.prior_constant
        * np.power(prior_params.prior_decay_rate, iteration)
        + 1)
    return util.element_mult(new_loss, regreg_multiplier)
def find_rule_weights(leaf_training_examples, example_weights, ones_mat,
                      holdout, y, x1, x2):
    """Compute the weight matrices for every candidate rule at a leaf.

    The example weights are restricted to the leaf, split by
    ``holdout.ind_train_up`` / ``holdout.ind_train_down``, and projected
    through ``x1`` and the positive / negative parts of ``x2``.

    ``y`` is accepted but not used here.

    Returns:
        An ``ObjectStore`` built from, in order: w_up_regup, w_up_regdown,
        w_down_regup, w_down_regdown, w_zero_regup, w_zero_regdown, w_pos,
        w_neg.
    """
    # Example weights limited to the leaf, then split into up / down parts.
    leaf_weights = util.element_mult(example_weights, leaf_training_examples)
    w_pos = util.element_mult(leaf_weights, holdout.ind_train_up)
    w_neg = util.element_mult(leaf_weights, holdout.ind_train_down)

    # x2 masked to its > 0 entries, and the absolute value of its < 0 entries.
    x2_up = x2.element_mult(x2.data > 0)
    x2_down = abs(x2.element_mult(x2.data < 0))

    # Project the weights through x1 first, then through each half of x2.
    x1_times_w_pos = x1.matrix_mult(w_pos)
    x1_times_w_neg = x1.matrix_mult(w_neg)

    up_regup = util.matrix_mult(x1_times_w_pos, x2_up)
    up_regdown = util.matrix_mult(x1_times_w_pos, x2_down)
    down_regup = util.matrix_mult(x1_times_w_neg, x2_up)
    down_regdown = util.matrix_mult(x1_times_w_neg, x2_down)

    # Whatever is neither "up" nor "down" for a given regulator direction.
    zero_regup = ones_mat - up_regup - down_regup
    zero_regdown = ones_mat - up_regdown - down_regdown

    return ObjectStore(up_regup, up_regdown, down_regup, down_regdown,
                       zero_regup, zero_regdown, w_pos, w_neg)
def get_index_mat_dict_enh_prom(index_mat_enh, index_math_prom):
    """Split the enhancer and promoter index matrices by the sign of ``y``.

    Args:
        index_mat_enh: index matrix for enhancers.
        index_math_prom: index matrix for promoters. The parameter name keeps
            its historical spelling so keyword callers keep working.

    Returns:
        dict with keys ``'enh_up'``, ``'enh_down'``, ``'prom_up'`` and
        ``'prom_down'``; each value is the matching index matrix multiplied
        element-wise (via ``util.element_mult``) with ``y.data == 1`` or
        ``y.data == -1``.
    """
    # NOTE(review): ``y`` is not a parameter; it is resolved from the
    # enclosing module scope at call time -- confirm it is defined there.
    is_up = y.data == 1
    is_down = y.data == -1
    # The body previously read ``index_mat_prom``, which is not the parameter
    # name, so the promoter argument was ignored (NameError unless a global
    # of that name happened to exist).
    return {
        'enh_up': util.element_mult(index_mat_enh, is_up),
        'enh_down': util.element_mult(index_mat_enh, is_down),
        'prom_up': util.element_mult(index_math_prom, is_up),
        'prom_down': util.element_mult(index_math_prom, is_down),
    }
def get_hierarchy_index(hier_node, hierarchy, training_index, tree):
    """Restrict an example index to the columns of one hierarchy node.

    Without a hierarchy the index is returned unchanged. Otherwise a boolean
    mask that is True on the columns listed in
    ``hierarchy.subtree_nodes[hier_node]`` is multiplied element-wise into
    ``training_index`` (the mask is wrapped in ``csr_matrix`` when
    ``tree.sparse`` is set).
    """
    if hierarchy is None:
        return training_index

    column_mask = np.zeros(training_index.shape, dtype=bool)
    column_mask[:, hierarchy.subtree_nodes[hier_node]] = True

    mask = csr_matrix(column_mask) if tree.sparse else column_mask
    return util.element_mult(training_index, mask)
def find_rule_process_stumps(tree, holdout, y, x1, x2, hierarchy):
    """Find the minimum-loss rule when every rule is attached to the root.

    Without a hierarchy, the loss is computed once over all root training
    examples. With a hierarchy, the loss is computed for each direct child of
    the root's hierarchy node, restricted to that child's columns, and the
    child with the smallest minimum loss wins.

    Returns:
        ``(0, regulator_sign, 0, leaf_loss_mat)``. ``regulator_sign`` and
        ``leaf_loss_mat`` are None when a hierarchy is given but the root's
        hierarchy node has no direct children.
    """
    root_train_index = tree.ind_pred_train[0]

    # ``is None`` instead of ``== None``: identity is the intended check and
    # cannot be hijacked by a custom ``__eq__``.
    if hierarchy is None:
        leaf_loss_mat, regulator_sign = find_min_loss(
            tree, root_train_index, holdout, y, x1, x2)
        return (0, regulator_sign, 0, leaf_loss_mat)

    best_loss = float('Inf')
    leaf_loss_mat = None
    regulator_sign = None

    # Iterate over children (always adding to root)
    for hier_node in hierarchy.direct_children[tree.hierarchy_node[0]]:
        cell_matrix = np.zeros(root_train_index.shape, dtype=bool)
        cell_matrix[:, hierarchy.subtree_nodes[hier_node]] = True
        mask = csr_matrix(cell_matrix) if tree.sparse else cell_matrix
        leaf_training_examples = util.element_mult(root_train_index, mask)

        candidate_loss_mat, candidate_sign = find_min_loss(
            tree, leaf_training_examples, holdout, y, x1, x2)

        # Evaluate the minimum once instead of twice per child.
        candidate_min = candidate_loss_mat.min()
        if candidate_min < best_loss:
            best_loss = candidate_min
            leaf_loss_mat = candidate_loss_mat
            regulator_sign = candidate_sign

    # NOTE(review): the third element (hierarchy node) is always 0, even
    # though the loss above belongs to the best *child* node. The original
    # tracked the winning child but never returned it. Kept as-is to preserve
    # results -- confirm whether the winning child should be returned.
    return (0, regulator_sign, 0, leaf_loss_mat)
def get_current_rule(tree, best_split, regulator_sign, loss_best, holdout, y,
                     x1, x2, hierarchy, hierarchy_node):
    """Pick the minimum-loss (motif, regulator) pair and build its indices.

    Args:
        tree: provides ``ind_pred_train`` / ``ind_pred_test``.
        best_split: index of the split the rule is attached to.
        regulator_sign: value matched against the chosen ``x2`` column.
        loss_best: 2-D loss matrix (motifs x regulators).
        holdout: unused here.
        y, x1, x2: data holders exposing ``.data`` and ``.sparse``
            (``y`` also ``num_row`` / ``num_col``).
        hierarchy, hierarchy_node: forwarded to ``h.get_hierarchy_index``.

    Returns:
        ``(motif, regulator, regulator_sign, rule_train_index,
        rule_test_index)`` with ``motif`` and ``regulator`` as plain ints.

    Raises:
        ValueError: if no entry of ``loss_best`` matches its minimum (for
            example when the minimum is NaN).
    """
    motif_hits, regulator_hits = np.where(
        np.isclose(loss_best, loss_best.min()))
    if len(motif_hits) == 0:
        raise ValueError('no entry of loss_best matches its minimum')

    # If multiple rules have the same loss, select one. The fixed seed keeps
    # the choice reproducible; note that it reseeds the global RNG.
    pick = 0
    if len(motif_hits) > 1:
        random.seed(1)
        pick = random.sample(range(len(motif_hits)), 1)[0]

    # Index with a scalar so int() gets a NumPy scalar, not a 1-element array
    # (that conversion is deprecated in recent NumPy).
    motif = int(motif_hits[pick])
    regulator = int(regulator_hits[pick])

    # Find indices of where motif and regulator appear.
    # NOTE(review): the branch is keyed on x2.sparse but also picks how x1 is
    # indexed -- presumably x1 and x2 share the same sparsity; confirm.
    if x2.sparse:
        valid_m = np.nonzero(x1.data[motif, :])[1]
        valid_r = np.where(
            x2.data.toarray()[:, regulator] == regulator_sign)[0]
    else:
        valid_m = np.nonzero(x1.data[motif, :])[0]
        valid_r = np.where(x2.data[:, regulator] == regulator_sign)[0]

    # Joint motif-regulator mask. ``np.bool`` was removed in NumPy 1.24; the
    # builtin ``bool`` is the same dtype.
    if y.sparse:
        valid_mat = csr_matrix((y.num_row, y.num_col), dtype=bool)
    else:
        valid_mat = np.zeros((y.num_row, y.num_col), dtype=bool)
    valid_mat[np.ix_(valid_m, valid_r)] = 1  # XX not efficient

    # Restrict the split's train / test examples to the hierarchy node.
    training_examples = h.get_hierarchy_index(
        hierarchy_node, hierarchy, tree.ind_pred_train[best_split], tree)
    testing_examples = h.get_hierarchy_index(
        hierarchy_node, hierarchy, tree.ind_pred_test[best_split], tree)

    rule_train_index = util.element_mult(valid_mat, training_examples)
    rule_test_index = util.element_mult(valid_mat, testing_examples)
    return motif, regulator, regulator_sign, rule_train_index, rule_test_index
def extract_feat_affecting_example_set(y, tree, peak_index, condition_index,
                                       ex_by_feat_mat):
    """List the tree splits whose example index covers one (peak, condition).

    Args:
        y: data holder; ``y.data[peak_index, condition_index]`` is inspected.
        tree: provides ``nsplit``, ``ind_pred_test`` and ``ind_pred_train``.
        peak_index: row of the example.
        condition_index: column of the example.
        ex_by_feat_mat: unused here.

    Returns:
        List of split indices for which the sum of the split's test and train
        index matrices is non-zero at the given entry, or None (after
        printing a message) when ``y`` is 0 at that entry.
    """
    if y.data[peak_index, condition_index] == 0:
        # Single-argument call form prints identically on Python 2 and 3.
        print('There is no change at this feature in this condition. STOP.')
        return None

    # One-hot mask selecting only the requested entry.
    target_mask = csr_matrix(y.data.shape)
    target_mask[peak_index, condition_index] = True

    return [
        ind for ind in range(tree.nsplit)
        if util.element_mult(
            tree.ind_pred_test[ind] + tree.ind_pred_train[ind],
            target_mask).sum() != 0
    ]
def get_rule_score_and_indices(rule_bundle, training_examples,
                               testing_examples, weights_i, rule_weights, tree,
                               y, x1, x2, holdout, rule_train_index,
                               rule_test_index):
    """Score a rule bundle and derive its train / test indices.

    A bundle of size 1 is scored directly with ``util.calc_score`` and the
    given indices are returned unchanged. Otherwise worker processes fill
    shared arrays with per-rule values and accumulated predictions; entries
    whose absolute prediction exceeds the threshold ``theta`` form the new
    rule index, which is then scored from the up / down training weights.

    Returns:
        ``(score, train_index, test_index)``.
    """
    if rule_bundle.size == 1:
        single_score = util.calc_score(tree, rule_weights, rule_train_index)
        return single_score, rule_train_index, rule_test_index

    # Shared state handed to the worker processes.
    lock_stable = multiprocessing.Lock()
    theta_alphas = multiprocessing.RawArray(ctypes.c_double, rule_bundle.size)
    n_cells = y.num_row * y.num_col
    bundle_train_pred = multiprocessing.RawArray(ctypes.c_double, n_cells)
    bundle_test_pred = multiprocessing.RawArray(ctypes.c_double, n_cells)
    # Counter of the next rule that still needs to be worked on.
    rule_index_cntr = multiprocessing.Value('i', 0)

    shared_state = (lock_stable, theta_alphas, bundle_train_pred,
                    bundle_test_pred)
    stable_args = [y, x1, x2, rule_index_cntr, rule_bundle,
                   training_examples, testing_examples,
                   rule_weights.w_pos, rule_weights.w_neg, shared_state]

    # Fork worker processes, and wait for them to return.
    fork_and_wait(config.NCPU, return_rule_index, stable_args)

    # Copy the shared buffers back into arrays shaped like y.
    theta_alphas = np.array(theta_alphas)
    bundle_train_pred = np.reshape(np.array(bundle_train_pred), y.data.shape)
    bundle_test_pred = np.reshape(np.array(bundle_test_pred), y.data.shape)
    if y.sparse:
        bundle_train_pred = csr_matrix(bundle_train_pred)
        bundle_test_pred = csr_matrix(bundle_test_pred)

    # theta: half the summed excess of |alpha| over the smallest |alpha|.
    abs_alphas = [abs(alpha) for alpha in theta_alphas]
    smallest = min(abs_alphas)
    theta = sum([magnitude - smallest for magnitude in abs_alphas]) / 2

    # New index is where the absolute prediction is greater than theta.
    new_train_rule_ind = (abs(bundle_train_pred) > theta)
    new_test_rule_ind = (abs(bundle_test_pred) > theta)

    # W+ and W- for the new rule.
    w_pos = util.element_mult(weights_i, holdout.ind_train_up)
    w_neg = util.element_mult(weights_i, holdout.ind_train_down)
    w_bundle_pos = util.element_mult(w_pos, new_train_rule_ind)
    w_bundle_neg = util.element_mult(w_neg, new_train_rule_ind)

    epsilon_pos = w_bundle_pos.sum() + config.TUNING_PARAMS.epsilon
    epsilon_neg = w_bundle_neg.sum() + config.TUNING_PARAMS.epsilon
    rule_bundle_score = 0.5 * np.log(epsilon_pos / epsilon_neg)

    return rule_bundle_score, new_train_rule_ind, new_test_rule_ind
def stable_boost_test(tree, rule_train_index, holdout):
    """Return half the absolute up-minus-down weight covered by a rule.

    The tree weights are split by ``holdout.ind_train_up`` and
    ``holdout.ind_train_down``, each part is restricted to
    ``rule_train_index`` and summed, and the result is
    ``0.5 * |sum_up - sum_down|``.
    """
    up_weights = util.element_mult(tree.weights, holdout.ind_train_up)
    down_weights = util.element_mult(tree.weights, holdout.ind_train_down)

    up_total = util.element_mult(up_weights, rule_train_index).sum()
    down_total = util.element_mult(down_weights, rule_train_index).sum()

    return 0.5 * abs(up_total - down_total)
def bundle_rules(tree, y, x1, x2, m, r, reg, best_split, rule_weights,
                 hierarchy, hierarchy_node):
    """Collect the rules whose weight footprint is close to the best rule's.

    For the best rule A = (m, r, reg) and every other rule B, the weight of
    the symmetric difference is computed as ``w(A) + w(B) - 2 * w(A and B)``,
    separately for regulator-up and regulator-down rules. Rules below
    ``eta_1 * bundle_thresh`` are bundled; if more than ``bundle_max`` rules
    qualify, only the lowest-ranked entries of each direction are kept.

    Args:
        tree: provides ``ind_pred_train`` and ``weights``.
        y, x1, x2: data holders (``.data``, ``.sparse``, ...).
        m, r: motif and regulator index of the best rule.
        reg: direction of the best rule, ``1`` or ``-1``.
        best_split: index of the split the rule is attached to.
        rule_weights: store with ``w_up_regup``, ``w_down_regup``,
            ``w_up_regdown`` and ``w_down_regdown``.
        hierarchy, hierarchy_node: forwarded to ``h.get_hierarchy_index``.

    Returns:
        ``BundleStore(regup_motifs, regup_regs, regdown_motifs,
        regdown_regs)``, each a list of indices.

    Raises:
        ValueError: if ``reg`` is neither 1 nor -1 (this previously surfaced
            as an UnboundLocalError on ``a_val``).
    """
    level = 'VERBOSE' if config.VERBOSE else 'QUIET'
    log('starting bundle rules', level=level)
    log('best split is {0}'.format(best_split), level=level)
    log('calculate A', level=level)

    # Weights of the current training examples (obeying hierarchy).
    non_hier_training_examples = tree.ind_pred_train[best_split]
    training_examples = h.get_hierarchy_index(
        hierarchy_node, hierarchy, non_hier_training_examples, tree)
    weights_i = util.element_mult(tree.weights, training_examples)

    # SYMM DIFF - weight of the best rule (A).
    if reg == 1:
        a_val = (rule_weights.w_up_regup[m, r]
                 + rule_weights.w_down_regup[m, r])
    elif reg == -1:
        a_val = (rule_weights.w_up_regdown[m, r]
                 + rule_weights.w_down_regdown[m, r])
    else:
        raise ValueError('reg must be 1 or -1, got {0!r}'.format(reg))

    # Constant matrix holding A's weight, shaped like the rule matrices.
    a_weights = a_val * np.ones(shape=rule_weights.w_down_regup.shape)
    if y.sparse:
        a_weights = csr_matrix(a_weights)

    # Weight of every other rule (B): W+ + W- from find_rule().
    log('calculate B', level=level)
    b_weights_regup = rule_weights.w_up_regup + rule_weights.w_down_regup
    b_weights_regdown = (rule_weights.w_up_regdown
                         + rule_weights.w_down_regdown)

    # Weight of the intersection of A and B.
    log('calculate A+B', level=level)
    if y.sparse:
        reg_vec = (x2.data[:, r] == reg)
    else:
        reg_vec = np.reshape((x2.data[:, r] == reg), (x2.num_row, 1))

    log('best rule times others', level=level)
    x1_intersect = util.element_mult(x1.data[m, :], x1.data)
    x2_up = x2.element_mult(x2.data > 0)
    x2_down = abs(x2.element_mult(x2.data < 0))
    x2_intersect_regup = util.element_mult(reg_vec, x2_up)
    x2_intersect_regdown = util.element_mult(reg_vec, x2_down)

    x1_intersect_weights = util.matrix_mult(x1_intersect, weights_i)
    log('intersection weights', level=level)
    ab_weights_regup = util.matrix_mult(
        x1_intersect_weights, x2_intersect_regup)  # PRE-FILTER weights
    ab_weights_regdown = util.matrix_mult(
        x1_intersect_weights, x2_intersect_regdown)  # PRE-FILTER weights

    log('symmetric diff', level=level)
    symm_diff_w_regup = a_weights + b_weights_regup - 2 * ab_weights_regup
    symm_diff_w_regdown = (a_weights + b_weights_regdown
                           - 2 * ab_weights_regdown)

    # Threshold for stabilization.
    log('get threshold', level=level)
    if y.sparse:
        bundle_thresh = util.calc_sqrt_sum_sqr_sqr_sums(weights_i.data)
    else:
        bundle_thresh = util.calc_sqrt_sum_sqr_sqr_sums(weights_i.ravel())

    # Build each "close enough" mask once; it is used both for the size test
    # and for extracting the bundle.
    log('test bundle size', level=level)
    cutoff = config.TUNING_PARAMS.eta_1 * bundle_thresh
    close_regup = symm_diff_w_regup < cutoff
    close_regdown = symm_diff_w_regdown < cutoff
    test_big_bundle = (close_regup.sum() + close_regdown.sum()
                       > config.TUNING_PARAMS.bundle_max)

    if test_big_bundle:
        # Too many candidates: keep the lowest-ranked entries per direction.
        log('cap bundle', level=level)
        # Single-argument print calls behave the same on Python 2 and 3.
        print("=" * 80)
        print('large bundle - cap at {0}'.format(
            config.TUNING_PARAMS.bundle_max))
        # NOTE(review): ``/`` floors for ints on Python 2 but not on
        # Python 3, so an odd bundle_max keeps one more rule per direction
        # on Python 3 -- confirm the intended rounding.
        per_direction_cap = config.TUNING_PARAMS.bundle_max / 2
        # Both directions are reshaped to the regup shape, as before.
        shape = symm_diff_w_regup.shape
        rule_bundle_regup = _lowest_ranked_entries(
            symm_diff_w_regup, shape, per_direction_cap, y.sparse)
        rule_bundle_regdown = _lowest_ranked_entries(
            symm_diff_w_regdown, shape, per_direction_cap, y.sparse)
    else:
        # Otherwise take all bundled rules.
        log('keep bundle', level=level)
        rule_bundle_regup = close_regup.nonzero()
        rule_bundle_regdown = close_regdown.nonzero()

    # The min-loss rule itself is kept in the bundle. The previous unused
    # x1/x2 label lookups have been dropped.
    return BundleStore(rule_bundle_regup[0].tolist(),
                       rule_bundle_regup[1].tolist(),
                       rule_bundle_regdown[0].tolist(),
                       rule_bundle_regdown[1].tolist())


def _lowest_ranked_entries(symm_diff, shape, limit, is_sparse):
    """Return np.where indices of entries whose ascending rank is < limit."""
    flat = symm_diff.todense().ravel() if is_sparse else symm_diff.ravel()
    # argsort of argsort turns values into their rank in ascending order.
    ranks = flat.argsort().argsort().reshape(shape)
    return np.where(ranks < limit)
r_h = (rule_bundle.rule_bundle_regup_regs + rule_bundle.rule_bundle_regdown_regs)[rule_index] reg_h = ( [+1] * len(rule_bundle.rule_bundle_regup_motifs) + [-1] * len(rule_bundle.rule_bundle_regdown_motifs))[rule_index] if x1.sparse: valid_m_h = np.nonzero(x1.data[m_h, :])[1] valid_r_h = np.where(x2.data.toarray()[:, r_h] == reg_h)[0] else: valid_m_h = np.nonzero(x1.data[m_h, :])[0] valid_r_h = np.where(x2.data[:, r_h] == reg_h)[0] # Calculate the loss for this leaf valid_mat_h[np.ix_(valid_m_h, valid_r_h)] = 1 rule_train_index_h = util.element_mult(valid_mat_h, best_split_train_index) rule_test_index_h = util.element_mult(valid_mat_h, best_split_test_index) rule_score_h = 0.5 * np.log( (util.element_mult(w_pos, rule_train_index_h).sum() + config.TUNING_PARAMS.epsilon) / (util.element_mult(w_neg, rule_train_index_h).sum() + config.TUNING_PARAMS.epsilon)) # print rule_index # Update current predictions with lock_stable: # Add current rule to training and testing sets current_bundle_train_pred = np.reshape(np.array(bundle_train_pred),
best_loss = float('Inf') leaf_loss_mat = None regulator_sign = None hierarchy_node = None # Iterate over children for hier_node in hierarchy.direct_children[ tree.hierarchy_node[leaf_index]]: cells = hierarchy.subtree_nodes[hier_node] cell_matrix = np.zeros(tree.ind_pred_train[leaf_index].shape, dtype=bool) cell_matrix[:, cells] = True if tree.sparse: leaf_training_examples = util.element_mult( tree.ind_pred_train[leaf_index], csr_matrix(cell_matrix)) else: leaf_training_examples = util.element_mult( tree.ind_pred_train[leaf_index], cell_matrix) # calculate the loss for this leaf leaf_loss_mat_cell, regulator_sign_cell = find_min_loss( tree, leaf_training_examples, holdout, y, x1, x2) # Update with best loss if leaf_loss_mat_cell.min() < best_loss: best_loss = leaf_loss_mat_cell.min() leaf_loss_mat = leaf_loss_mat_cell regulator_sign = regulator_sign_cell hierarchy_node = hier_node
def find_next_decision_node(tree, holdout, y, x1, x2, hierarchy, iteration):
    """Choose the next rule to add to the tree, optionally as a bundle.

    Steps: find the best split and its loss matrix, optionally re-weight the
    loss with the priors, compute rule weights on the hierarchy-restricted
    examples, pick the current rule, and -- when stabilization is enabled and
    its criterion is met -- replace the single rule by a bundle of rules.

    Returns:
        ``(motif, regulator, best_split, hierarchy_node, motif_bundle,
        regulator_bundle, rule_train_index, rule_test_index, rule_score,
        above_motifs, above_regs)``.
    """
    level = 'VERBOSE' if config.VERBOSE else 'QUIET'
    tuning = config.TUNING_PARAMS

    # Loss at all search nodes (current hierarchy node and direct children).
    log('Find rule process', level=level)
    (best_split, regulator_sign,
     hierarchy_node, loss_best) = find_rule_processes(
        tree, holdout, y, x1, x2, hierarchy)

    if tuning.use_prior:
        log('Update loss with prior', level=level)
        best_split_regulator = util.get_best_split_regulator(
            tree, x2, best_split)
        loss_best = prior.update_loss_with_prior(
            loss_best, prior.PRIOR_PARAMS, prior.prior_motifreg,
            prior.prior_regreg, iteration, best_split_regulator)

    log('Find rule weights', level=level)
    # Mask training/testing examples by using only hierarchy children.
    training_examples = h.get_hierarchy_index(
        hierarchy_node, hierarchy, tree.ind_pred_train[best_split], tree)
    testing_examples = h.get_hierarchy_index(
        hierarchy_node, hierarchy, tree.ind_pred_test[best_split], tree)

    rule_weights = find_rule_weights(training_examples, tree.weights,
                                     tree.ones_mat, holdout, y, x1, x2)

    # Current rule, no stabilization; indices are restricted to hierarchy.
    log('Get current rule', level=level)
    (motif, regulator, regulator_sign,
     rule_train_index, rule_test_index) = get_current_rule(
        tree, best_split, regulator_sign, loss_best, holdout, y, x1, x2,
        hierarchy, hierarchy_node)

    # Decide whether the rule gets replaced by a bundle of correlated rules.
    criterion_met = False
    if tuning.use_stable:
        log('Starting stabilization', level=level)
        weights_i = util.element_mult(tree.weights, training_examples)
        log('Stabilization test', level=level)
        stable_test = stabilize.stable_boost_test(tree, rule_train_index,
                                                  holdout)
        stable_thresh = stabilize.stable_boost_thresh(tree, y, weights_i)
        criterion_met = stable_test >= tuning.eta_2 * stable_thresh

    if criterion_met:
        log('Stabilization criterion applies', level='VERBOSE')
        log('Getting rule bundle', level=level)
        bundle = stabilize.bundle_rules(
            tree, y, x1, x2, motif, regulator, regulator_sign, best_split,
            rule_weights, hierarchy, hierarchy_node)

        # Score and indices now describe the whole bundle.
        log('Updating scores and indices with bundle', level=level)
        (rule_score, rule_train_index,
         rule_test_index) = stabilize.get_rule_score_and_indices(
            bundle, training_examples, testing_examples, weights_i,
            rule_weights, tree, y, x1, x2, holdout, rule_train_index,
            rule_test_index)

        log('Adding bundles to rule', level=level)
        motif_bundle = (bundle.rule_bundle_regup_motifs
                        + bundle.rule_bundle_regdown_motifs)
        regulator_bundle = (bundle.rule_bundle_regup_regs
                            + bundle.rule_bundle_regdown_regs)
    else:
        # Stabilization disabled or criterion not met: score the single rule.
        log('Updating rule without stabilization', level=level)
        rule_score = calc_score(tree, rule_weights, rule_train_index)
        motif_bundle = []
        regulator_bundle = []

    log('Adding above motifs/regs', level=level)
    split_motifs = tree.bundle_x1[best_split] + [tree.split_x1[best_split]]
    above_motifs = (tree.above_motifs[best_split]
                    + np.unique(split_motifs).tolist())
    split_regs = tree.bundle_x2[best_split] + [tree.split_x2[best_split]]
    above_regs = (tree.above_regs[best_split]
                  + np.unique(split_regs).tolist())

    return (motif, regulator, best_split, hierarchy_node, motif_bundle,
            regulator_bundle, rule_train_index, rule_test_index, rule_score,
            above_motifs, above_regs)
def get_index_mat_dict(index_mat):
    """Split an index matrix by the sign of ``y``.

    Returns a dict with keys ``'all_up'`` and ``'all_down'`` holding
    ``index_mat`` multiplied element-wise (via ``util.element_mult``) with
    ``y.data == 1`` and ``y.data == -1`` respectively.
    """
    # NOTE(review): ``y`` is not a parameter; it is looked up in the
    # enclosing module scope at call time -- confirm it is defined there.
    return {
        'all_up': util.element_mult(index_mat, y.data == 1),
        'all_down': util.element_mult(index_mat, y.data == -1),
    }