def construct_measured_vector(self, train_subset):
    """Build a 1 x N row of measured values for the rows of ``train_subset``.

    Each row is converted to its ``str()`` form and that string is used as
    the key into ``self.sequence_matrix.seq_to_measurement``.

    :param train_subset: 2-D matrix indexed as ``train_subset[row, :]``,
        one sequence per row
    :return: ``np.matlib`` matrix of shape ``(1, num_rows(train_subset))``
        whose entry ``[0, i]`` is the measurement looked up for row ``i``
    :raises KeyError: presumably, when a row's string form is missing from
        the lookup (assumes the lookup is a plain dict -- confirm)
    """
    num_seqs = num_rows(train_subset)
    # np.float64 replaces the ``np.float`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is unchanged.
    measured_vals = np.matlib.zeros((1, num_seqs), dtype=np.float64)
    # Resolve the lookup table once instead of on every iteration.
    seq_to_measurement = self.sequence_matrix.seq_to_measurement
    for ctr in range(num_seqs):
        # The key must be built exactly as the table's keys were built:
        # str() of the full row.
        measured_vals[0, ctr] = seq_to_measurement[str(train_subset[ctr, :])]
    return measured_vals
def update_dictionaries(self):
    """
    Rebuild the sequence -> measurement and sequence -> inequality lookups.

    Call this whenever the training data changes (features or examples
    added/removed), because the dictionary keys are derived from the rows.

    Keys are the ``str()`` of each training row. Using a dictionary means
    sequences must be unique in the training data: a repeated sequence keeps
    only the values of its last row. This has not seemed to be an issue, and
    it is the simplest way to keep sequences associated with their measured
    values and inequalities across shuffling and CV splits.

    :return:
    """
    # Clear in place (rather than rebinding) so the same dict objects are kept.
    measurement_by_seq = self.seq_to_measurement
    measurement_by_seq.clear()
    inequality_by_seq = self.seq_to_inequality
    inequality_by_seq.clear()

    rows = self.training_data
    for row_idx in range(num_rows(rows)):
        key = str(rows[row_idx, :])
        inequality_by_seq[key] = self.inequalities[row_idx]
        # measured_values is indexed as [row, 0]
        measurement_by_seq[key] = self.measured_values[row_idx, 0]
def distance(self, log_lambdas):
    """Return the cross-validation distance averaged over all blind rows.

    For each of the ``self.num_cv_folds`` folds, ``solve_x`` is run on the
    fold's training data and the ``[0, 0]`` entry of ``x_distance`` for the
    fold's blind data is added to a running total. The total is then divided
    by the summed row count of the blind sets. Progress and the final value
    are printed to stdout, which is flushed before returning.

    :param log_lambdas: passed through unchanged to ``solve_x``
    :return: accumulated distance divided by the number of blind rows
    """
    print('calculating distance')
    total = 0.0
    blind_rows_seen = 0
    for fold in range(self.num_cv_folds):
        train_set = self.cv_train_data[fold]
        blind_set = self.cv_blind_data[fold]
        if fold == 0:
            # progress message is emitted for the first fold only
            print('starting solve_x for fold ', fold)
        # presumably x_distance relies on state produced by solve_x for this
        # fold -- keep this call order
        self.solve_x(train_set, log_lambdas, fold)
        fold_distance = self.x_distance(blind_set, fold)
        total += fold_distance[0, 0]
        # one row per element of the blind set
        blind_rows_seen += num_rows(blind_set)
    dist = total / float(blind_rows_seen)
    print('dist: ' + str(dist) + '\n')
    print('\n')
    sys.stdout.flush()
    return dist
def add_pair_coeffs(self, pairs):
    """Append one binary pair-feature column per entry of ``pairs``.

    A pair column holds 1.0 for a training row when both columns named by
    the pair equal 1 in that row, and 0.0 otherwise. The new columns are
    concatenated to the right of ``self.training_data`` and the lookup
    dictionaries are then rebuilt via ``update_dictionaries``.

    :param pairs: sequence of 2-item index pairs ``(col_a, col_b)`` into the
        columns of ``self.training_data``
    :return: None; ``self.training_data`` is rebound to the widened matrix
    """
    num_pairs = len(pairs)
    # extend columns for pair coeff binary features
    # np.float64 replaces the ``np.float`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is unchanged.
    coeff_feature_matrix = np.matlib.zeros(
        (num_rows(self.training_data), num_pairs), dtype=np.float64)
    # NOTE(review): a feature-index -> pair list, offset by
    # 1 + self.seq_length * 20 (with a TODO to make the alphabet size of 20
    # dynamic), was previously built here but never stored or returned, so
    # it has been removed as dead code.
    # Iterating the training data yields one row at a time; each row is
    # indexed as [0, col], i.e. it is itself 2-D (1 x num_features).
    for seq_index, seq in enumerate(self.training_data):
        for offset, pair in enumerate(pairs):
            if seq[0, pair[0]] == 1 and seq[0, pair[1]] == 1:
                coeff_feature_matrix[seq_index, offset] = 1.0
    self.training_data = np.concatenate(
        (self.training_data, coeff_feature_matrix), axis=1)
    # Keys are derived from the rows, so they change once columns are added.
    self.update_dictionaries()
def create_pair_training_matrix(self, pairs):
    """
    Build a feature matrix made of a bias column plus one column per pair.

    Column 0 is the bias and is 1.0 for every training row, so the 0th pair
    is the 1st column of features. The pair at position ``i`` in ``pairs``
    fills column ``i + 1`` with 1.0 for rows where both columns named by the
    pair equal 1 in ``self.training_data``, and 0.0 otherwise.
    ``self.training_data`` itself is not modified.

    :param pairs: sequence of 2-item index pairs ``(col_a, col_b)`` into the
        columns of ``self.training_data``
    :return: ``np.matlib`` matrix of shape
        ``(num_rows(self.training_data), 1 + len(pairs))``
    """
    num_pairs = len(pairs)
    # np.float64 replaces the ``np.float`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is unchanged.
    coeff_feature_matrix = np.matlib.zeros(
        (num_rows(self.training_data), 1 + num_pairs), dtype=np.float64)
    # can't use np.ndenumerate() because that is by element, not row
    for seq_index, seq in enumerate(self.training_data):
        coeff_feature_matrix[seq_index, 0] = 1.0  # bias column
        # pair is tuple of index locations of binary pair feature
        for offset, pair in enumerate(pairs):
            # does seq contain the pair
            if seq[0, pair[0]] == 1 and seq[0, pair[1]] == 1:
                # plus 1 to account for bias being first column
                coeff_feature_matrix[seq_index, offset + 1] = 1.0
    return coeff_feature_matrix