Пример #1
0
    def __init__(self, seq_mat, lambda_grouping, train_sets=None, blind_sets=None, use_pair_coeffs=False):
        """
        Set up solver state for the given sequence matrix.

        :param seq_mat: project object exposing ``inequalities_present``,
            ``measured_values``, ``training_data`` and ``seq_length``
        :param lambda_grouping: regularisation grouping mode; compared against
            ``LambdaGrouping`` members when solving
        :param train_sets: optional training folds, stored as ``cv_train_data``
        :param blind_sets: optional held-out folds, stored as ``cv_blind_data``
        :param use_pair_coeffs: stored flag; presumably enables pair-coefficient
            features -- confirm against callers
        """
        self.sequence_matrix = seq_mat
        self.inequalities_present = seq_mat.inequalities_present

        # Builtin ``float`` is used as the dtype: the ``np.float`` alias was
        # removed from NumPy and referred to the same float64 type anyway.
        self.predictions = np.matlib.zeros((1, num_cols(seq_mat.measured_values)), dtype=float)
        # TODO: verify initialization to ones is fine
        self.scoring_matrix = np.matlib.ones((num_cols(seq_mat.training_data), 1), dtype=float)

        # Lambda settings; lambda_min / lambda_max bound the values solve_x uses.
        self.lambda_start = 1.0
        self.lambda_min = .0001
        self.lambda_max = 10000000
        self.precision = .0001
        self.tolerated_num_non_convergence = 3

        # Per-fold cross-validation state: one scoring matrix and one lazily
        # filled measured-value vector per fold.
        self.num_cv_folds = 10
        self.cv_scoring_matrices = [
            np.matlib.zeros(self.scoring_matrix.shape, dtype=float)
            for _ in range(self.num_cv_folds)
        ]
        self.cv_measured_values = [None] * self.num_cv_folds
        self.lambdas = np.matlib.zeros((1, 1))
        self.cv_train_data = train_sets
        self.cv_blind_data = blind_sets
        self.AT_A = None

        # NOTE(review): 20 presumably is the number of amino acids -- confirm.
        self.alphabet_size = 20
        self.seq_len = seq_mat.seq_length
        self.group_num = 0
        self.lambda_grouping = lambda_grouping
        self.lm_direction = None
        self.lm_start_pos = None

        self.use_pair_coeffs = use_pair_coeffs
        self.min_pair_count = 10
        self.max_disagreement = .4

        # Must run last: it is called after every attribute above is set.
        self.init_lambdas()
Пример #2
0
    def identify_pair_coeffs(self):
        """
        Find pairs of feature columns that are set together often enough.

        For every training row, each pair of columns ``(pos1, pos2)`` with
        ``1 <= pos1 < pos2`` whose entries both equal 1 is counted. A pair is
        selected once its count reaches ``self.min_pair_count``.

        :return: dict mapping each selected ``(pos1, pos2)`` tuple to ``1``
        """
        training_data = self.sequence_matrix.training_data
        feature_count = num_cols(training_data)
        threshold = self.min_pair_count

        candidate_pairs = {}
        selected_pairs = {}
        for seq in training_data:
            # Collect the set columns once per row instead of re-indexing the
            # matrix for every pair. Column 0 is skipped, as in the original
            # range (presumably the bias column -- confirm).
            active = [pos for pos in range(1, feature_count) if seq[0, pos] == 1]
            for offset, pos1 in enumerate(active):
                for pos2 in active[offset + 1:]:
                    pair = (pos1, pos2)
                    count = candidate_pairs.get(pair, 0) + 1
                    candidate_pairs[pair] = count
                    # Checked on every occurrence, including the first, so a
                    # threshold of 1 selects pairs that were seen only once.
                    if count >= threshold:
                        selected_pairs[pair] = 1

        return selected_pairs
Пример #3
0
    def x_distance(self, data, fold_num):
        """
        Return the squared distance between predictions and measurements.

        The predictions for ``data`` are computed with the scoring matrix of
        fold ``fold_num`` and stored (transposed) in ``self.predictions``.
        When inequalities are present, a residual is zeroed if the prediction
        lies on the allowed side of its measured value.

        :param data: feature matrix multiplied with the fold's scoring matrix
        :param fold_num: index into ``self.cv_scoring_matrices``
        :return: product of the residual row with its own transpose
        """
        fold_scores = self.cv_scoring_matrices[fold_num]
        self.predictions = (data * fold_scores).T
        targets = self.construct_measured_vector(data)
        residual = self.predictions - targets
        relations = self.sequence_matrix.inequalities

        if self.inequalities_present:
            for col in range(num_cols(residual)):
                relation = relations[col]
                if relation == '=':
                    # Exact measurement: the residual always counts.
                    continue
                value = residual[0, col]
                # '>' tolerates positive residuals; any other marker
                # tolerates negative ones.
                tolerated = value > 0 if relation == '>' else value < 0
                if tolerated:
                    residual[0, col] = 0

        return residual.dot(residual.T)
Пример #4
0
    def solve_x(self, training_data, log_lambdas, fold_num):
        """
        Solve the regularised least-squares system for one fold.

        Computes ``(A^T A + Lambda)^-1 A^T y`` with ``A = training_data`` and
        stores the result in ``self.cv_scoring_matrices[fold_num]``. ``Lambda``
        is a diagonal penalty built from ``log_lambdas`` according to
        ``self.lambda_grouping``; row/column 0 (the bias) is never penalised.
        ``self.AT_A`` and ``self.lambdas`` are overwritten as a side effect.

        BMC 2005 eq (3)

        :param training_data: numpy matrix of binary features
        :param log_lambdas: single-row matrix of log10 lambda values
        :param fold_num: fold index; progress is printed only for fold 0
        :return: None
        :raises numpy.linalg.LinAlgError: if the penalised matrix cannot be
            inverted
        :raises AssertionError: if inequalities are present, because that
            solver path is not implemented
        """
        # TODO: rename matrices to fit python naming schemes
        self.AT_A = training_data.T * training_data
        log_lambda_max = math.log10(self.lambda_max)
        # TODO: better variable names for p
        for log_lambda_index in range(num_cols(log_lambdas)):
            cur_log_lambda = log_lambdas[0, log_lambda_index]
            if fold_num == 0:
                print('cur_log_lambda', str(cur_log_lambda))
            # Convert from log10 space, capping at lambda_max and offsetting by
            # lambda_min so the penalty is never exactly zero.
            if cur_log_lambda > log_lambda_max:
                self.lambdas[0, log_lambda_index] = self.lambda_min + self.lambda_max
            else:
                self.lambdas[0, log_lambda_index] = self.lambda_min + math.pow(10.0, cur_log_lambda)
            if fold_num == 0:
                print('lambda[0,' + str(log_lambda_index) + '] = ' + str(self.lambdas[0, log_lambda_index]))

        grouping = self.lambda_grouping
        if grouping in (LambdaGrouping.SCALAR_LAMBDA, LambdaGrouping.SCALAR_PAIR_COEFFS):
            # Both scalar modes currently apply the same single lambda to every
            # non-bias feature. merge? differentiate?
            for index in range(1, num_cols(self.AT_A)):  # skip bias
                self.AT_A[index, index] += self.lambdas[0, 0]
        elif grouping == LambdaGrouping.MATRIX_LAMBDA:
            # One lambda per group of alphabet_size consecutive features.
            for aa_position in range(self.group_num):
                offset = 1 + aa_position * self.alphabet_size
                for group_index in range(self.alphabet_size):
                    self.AT_A[offset + group_index, offset + group_index] += self.lambdas[0, aa_position]
        elif grouping == LambdaGrouping.MATRIX_PAIR_COEFFS:
            # TODO: since just training on the residuals there is only one lambda value to use, that is the new lambda value corresponding to pair coeffs
            # the previous training to calc the scoring matrix has found the 9 optimal lambdas for rest of the features
            for aa_position in range(self.seq_len):
                offset = 1 + aa_position * self.alphabet_size
                for group_index in range(self.alphabet_size):
                    self.AT_A[offset + group_index, offset + group_index] += self.lambdas[0, aa_position]
            # Every feature after the per-position blocks shares the last lambda.
            coeff_start_index = 1 + self.alphabet_size * self.seq_len
            for coeff_index in range(coeff_start_index, num_cols(self.AT_A)):
                self.AT_A[coeff_index, coeff_index] += self.lambdas[0, -1]

        try:
            AT_A_inverse = self.AT_A.I
        except numpy.linalg.LinAlgError:
            # Public exception path; numpy.linalg.linalg is a private module.
            print('singular matrix error')
            raise

        A = training_data

        # The measured vector is built once per fold and reused on later calls.
        if self.cv_measured_values[fold_num] is None:
            self.cv_measured_values[fold_num] = self.construct_measured_vector(A)

        y = self.cv_measured_values[fold_num]

        inverse = AT_A_inverse * A.T

        if self.inequalities_present:
            # Raised explicitly rather than via ``assert False`` so the guard
            # still fires when Python runs with -O.
            # calcX_inequal(training_data, inverse)
            raise AssertionError('solve_x does not support inequalities yet')

        self.cv_scoring_matrices[fold_num] = inverse * y.T