def __init__(self, seq_mat, lambda_grouping, train_sets=None, blind_sets=None, use_pair_coeffs=False):
    """
    Set up the solver state from a sequence matrix and a lambda grouping.

    :param seq_mat: object providing ``inequalities_present``,
        ``measured_values``, ``training_data`` and ``seq_length``
    :param lambda_grouping: regularization grouping mode, stored unchanged
    :param train_sets: stored as ``cv_train_data`` (defaults to ``None``)
    :param blind_sets: stored as ``cv_blind_data`` (defaults to ``None``)
    :param use_pair_coeffs: flag stored as ``use_pair_coeffs``
    """
    self.sequence_matrix = seq_mat
    self.inequalities_present = seq_mat.inequalities_present

    # The builtin ``float`` replaces the ``np.float`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24; the resulting dtype is the same.
    self.predictions = np.matlib.zeros((1, num_cols(seq_mat.measured_values)), dtype=float)
    # TODO: verify initialization to ones is fine
    self.scoring_matrix = np.matlib.ones((num_cols(seq_mat.training_data), 1), dtype=float)

    # Lambda search settings; solve_x uses lambda_min as an additive offset
    # and lambda_max as an upper cap.
    self.lambda_start = 1.0
    self.lambda_min = .0001
    self.lambda_max = 10000000
    self.precision = .0001
    self.tolerated_num_non_convergence = 3

    # One scoring matrix and one measured-value slot per cross-validation fold.
    self.num_cv_folds = 10
    self.cv_scoring_matrices = [np.matlib.zeros(self.scoring_matrix.shape, dtype=float)
                                for _ in range(self.num_cv_folds)]
    self.cv_measured_values = [None] * self.num_cv_folds
    self.lambdas = np.matlib.zeros((1, 1))
    self.cv_train_data = train_sets
    self.cv_blind_data = blind_sets
    self.AT_A = None

    # presumably the 20 standard amino acids -- TODO confirm
    self.alphabet_size = 20
    self.seq_len = seq_mat.seq_length
    self.group_num = 0
    self.lambda_grouping = lambda_grouping
    self.lm_direction = None
    self.lm_start_pos = None

    # Pair-coefficient settings; min_pair_count is the occurrence threshold
    # applied by identify_pair_coeffs.
    self.use_pair_coeffs = use_pair_coeffs
    self.min_pair_count = 10
    self.max_disagreement = .4

    self.init_lambdas()
def identify_pair_coeffs(self):
    """
    Find column pairs that are both equal to 1 in enough training rows.

    Scanning starts at column 1, so column 0 never takes part in a pair
    (presumably the bias column -- TODO confirm). A pair is selected as soon
    as its running count reaches ``self.min_pair_count``.

    :return: dict mapping each selected ``(pos1, pos2)`` tuple, with
        ``pos1 < pos2``, to ``1``
    """
    rows = self.sequence_matrix.training_data
    width = num_cols(rows)
    occurrences = {}
    frequent = {}
    for row in rows:
        for left in range(1, width):
            if row[0, left] != 1:
                continue
            for right in range(left + 1, width):
                if row[0, right] != 1:
                    continue
                key = (left, right)
                occurrences[key] = occurrences.get(key, 0) + 1
                if occurrences[key] >= self.min_pair_count:
                    frequent[key] = 1
    return frequent
def x_distance(self, data, fold_num):
    """
    Return the squared distance between predictions and measured values.

    Predictions are ``data`` times the scoring matrix of fold ``fold_num``;
    their transpose is stored in ``self.predictions``. When
    ``self.inequalities_present`` is set, a residual is zeroed if its entry
    is marked ``'>'`` and the residual is positive, or if it carries any
    other non-``'='`` marker and the residual is negative.

    :param data: feature matrix multiplied with the fold's scoring matrix
    :param fold_num: index into ``self.cv_scoring_matrices``
    :return: ``residual.dot(residual.T)`` (presumably a 1x1 matrix for
        numpy matrix inputs -- TODO confirm)
    """
    self.predictions = (data * self.cv_scoring_matrices[fold_num]).T
    residual = self.predictions - self.construct_measured_vector(data)

    # NOTE(review): markers are looked up by residual column; this assumes the
    # rows of ``data`` line up with sequence_matrix.inequalities -- confirm
    # for cross-validation subsets.
    markers = self.sequence_matrix.inequalities
    if self.inequalities_present:
        for col in range(num_cols(residual)):
            marker = markers[col]
            if marker == '=':
                continue
            value = residual[0, col]
            if marker == '>':
                if value > 0:
                    residual[0, col] = 0
            elif value < 0:
                residual[0, col] = 0

    return residual.dot(residual.T)
def solve_x(self, training_data, log_lambdas, fold_num):
    """
    Solve the regularized least-squares system for one fold.

    scoring matrix will be set after this
    BMC 2005 eq (3)

    Steps: build ``A^T A`` from ``training_data``; turn every log10 lambda
    into ``self.lambdas`` (capped at ``lambda_max`` and offset by
    ``lambda_min``); add the lambdas to the diagonal according to
    ``self.lambda_grouping`` (index 0, the bias, is never penalized);
    invert; store the result in ``self.cv_scoring_matrices[fold_num]``.

    :param training_data: numpy matrix of binary features
    :param log_lambdas: 1 x n matrix holding the log10 of the lambda values
    :param fold_num: fold index; fold 0 additionally prints the lambdas
    :return: None
    :raises numpy.linalg.LinAlgError: if the regularized ``A^T A`` cannot be
        inverted
    :raises AssertionError: if ``self.inequalities_present`` is set, because
        the inequality solver is not implemented
    """
    # TODO: rename matrices to fit python naming schemes
    self.AT_A = training_data.T * training_data
    log_lambda_max = math.log10(self.lambda_max)

    # TODO: better variable names for p
    for log_lambda_index in range(num_cols(log_lambdas)):
        cur_log_lambda = log_lambdas[0, log_lambda_index]
        if fold_num == 0:
            print('cur_log_lambda', str(cur_log_lambda))
        if cur_log_lambda > log_lambda_max:
            capped_lambda = self.lambda_max
        else:
            capped_lambda = math.pow(10.0, cur_log_lambda)
        self.lambdas[0, log_lambda_index] = self.lambda_min + capped_lambda
        if fold_num == 0:
            print('lambda[0,' + str(log_lambda_index) + '] = ' + str(self.lambdas[0, log_lambda_index]))

    grouping = self.lambda_grouping
    if grouping in (LambdaGrouping.SCALAR_LAMBDA, LambdaGrouping.SCALAR_PAIR_COEFFS):
        # SCALAR_PAIR_COEFFS is currently the same as scalar_lambda.
        # merge? differentiate?
        for index in range(1, num_cols(self.AT_A)):  # skip bias
            self.AT_A[index, index] += self.lambdas[0, 0]
    elif grouping == LambdaGrouping.MATRIX_LAMBDA:
        # One lambda per group of ``alphabet_size`` consecutive columns.
        for aa_position in range(self.group_num):
            offset = 1 + aa_position * self.alphabet_size
            for group_index in range(self.alphabet_size):
                self.AT_A[offset + group_index, offset + group_index] += self.lambdas[0, aa_position]
    elif grouping == LambdaGrouping.MATRIX_PAIR_COEFFS:
        # TODO: since just training on the residuals there is only one lambda
        # value to use, that is the new lambda value corresponding to pair
        # coeffs
        # the previous training to calc the scoring matrix has found the 9
        # optimal lambdas for rest of the features
        for aa_position in range(self.seq_len):
            offset = 1 + aa_position * self.alphabet_size
            for group_index in range(self.alphabet_size):
                self.AT_A[offset + group_index, offset + group_index] += self.lambdas[0, aa_position]
        # Every column after the per-position groups shares the last lambda.
        coeff_start_index = 1 + self.alphabet_size * self.seq_len
        for coeff_index in range(coeff_start_index, num_cols(self.AT_A)):
            self.AT_A[coeff_index, coeff_index] += self.lambdas[0, -1]

    try:
        AT_A_inverse = self.AT_A.I
    except np.linalg.LinAlgError:
        # Public exception path; ``numpy.linalg.linalg`` is a private module.
        print('singular matrix error')
        raise

    A = training_data
    # The measured vector is computed once per fold and then reused.
    if self.cv_measured_values[fold_num] is None:
        self.cv_measured_values[fold_num] = self.construct_measured_vector(A)
    y = self.cv_measured_values[fold_num]
    inverse = AT_A_inverse * A.T

    if self.inequalities_present:
        # calcX_inequal(training_data, inverse)
        # Raised explicitly instead of ``assert False`` so the guard is not
        # stripped by ``python -O``; the exception type is unchanged.
        raise AssertionError('solve_x does not support inequalities yet')
    self.cv_scoring_matrices[fold_num] = inverse * y.T