class ModelGenerator(object):
    """Factory that builds and compiles text classifiers.

    Supports a calibrated linear-SVM baseline (sklearn) and several Keras
    neural architectures (CNN / 2-D CNN / RCNN / RNN), optionally adapted
    for multi-label classification.
    """

    # Per-architecture initial learning rates. Unknown names leave the
    # current self.lr unchanged (matches the previous if/elif behaviour).
    _INIT_LR = {
        'text_cnn': 0.001,
        'text_cnn_2d': 0.016,
        'text_rcnn': 0.025,
        'text_rnn': 0.0035,
    }

    # Feature-mode name -> embedding configuration merged into data_feature.
    _FEATURE_PRESETS = {
        'char-level + 64dim-embedding': {'use_fasttext_emb': False, 'emb_size': 64},
        'char-level + 300dim-embedding': {'use_fasttext_emb': False, 'emb_size': 300},
        'word-level + pretrained embedding300dim': {'use_fasttext_emb': True, 'emb_size': 300},
        'word-level + 64dim-embedding': {'use_fasttext_emb': False, 'emb_size': 64},
    }

    def __init__(self, load_pretrain_emb=False, data_feature=None,
                 meta_data_feature=None, fasttext_embeddings_index=None,
                 multi_label=False):
        """Store dataset features and embedding resources.

        Args:
            load_pretrain_emb: whether pretrained embeddings should be used.
            data_feature: dict of dataset statistics (shapes, vocab, ...).
            meta_data_feature: dataset metadata, kept as-is for callers.
            fasttext_embeddings_index: word -> vector mapping for fastText.
            multi_label: if True, compile for multi-label output
                (binary cross-entropy / OneVsRest for the SVM).
        """
        self.data_feature = data_feature
        self.load_pretrain_emb = load_pretrain_emb
        self.meta_data_feature = meta_data_feature
        self.oov_cnt = 0  # out-of-vocabulary count from the embedding build
        self.embedding_matrix = None
        self.use_bpe = False
        self.multi_label = multi_label
        self.lr = 0.001
        self.cur_lr = 0.0
        self.emb_size = 300
        self.fasttext_embeddings_index = fasttext_embeddings_index
        # Registry of neural architectures; model classes are project-defined.
        self.model_lib = {
            'text_cnn': TextCNN_Model,
            'text_cnn_2d': CNN_Model,
            'text_rcnn': TextRCNN_Model,
            'text_rnn': RNN_Model
        }
        # Supported feature-mode names, kept in sync with the preset table
        # (previously a hand-maintained duplicate set).
        self.feature_lib = set(self._FEATURE_PRESETS)

    def select_classifier(self, model_name, feature_mode, data_feature):
        """Apply the embedding preset for *feature_mode*, then build the model.

        Unknown feature modes leave ``data_feature`` untouched (previous
        behaviour preserved).
        """
        data_feature.update(self._FEATURE_PRESETS.get(feature_mode, {}))
        return self.build_model(model_name, data_feature=data_feature)

    def _set_model_compile_params(self, optimizer_name, lr, metrics=None):
        """Return ``(optimizer, loss_fn, metrics)`` for Keras ``compile``.

        ``metrics`` falls back to ``['accuracy']`` when empty/None.  The
        previous mutable default argument (``metrics=[]``) and a stray
        debug ``print`` were removed.
        """
        optimizer = self._set_optimizer(optimizer_name=optimizer_name, lr=lr)
        loss_fn = self._set_loss_fn()
        return optimizer, loss_fn, (metrics if metrics else ['accuracy'])

    def _set_model_train_params(self):
        """Placeholder for future training-parameter configuration."""
        pass

    def build_model(self, model_name, data_feature):
        """Instantiate and compile the classifier named *model_name*.

        Args:
            model_name: ``'svm'`` or a key of ``self.model_lib``.
            data_feature: dict providing model hyper-parameters
                (``input_shape``, ``max_length``, ``num_features``,
                ``num_class``, ``filter_num``, ``emb_size``,
                ``use_fasttext_emb`` and, when fastText is used,
                ``word_index``).

        Returns:
            The built (and, for Keras models, compiled) model; also stored
            on ``self.model``.
        """
        if model_name == 'svm':
            # Calibrate the SVM so predict_proba is available downstream.
            base = LinearSVC(random_state=0, tol=1e-5, max_iter=500)
            self.model = CalibratedClassifierCV(base)
            if self.multi_label:
                info("use OneVsRestClassifier")
                self.model = OneVsRestClassifier(self.model, n_jobs=-1)
        else:
            if data_feature["use_fasttext_emb"]:
                self.oov_cnt, self.embedding_matrix = self.generate_emb_matrix(
                    num_features=data_feature["num_features"],
                    word_index=data_feature["word_index"])
            else:
                self.embedding_matrix = None
            self.emb_size = data_feature["emb_size"]
            kwargs = {
                'embedding_matrix': self.embedding_matrix,
                'input_shape': data_feature['input_shape'],
                'max_length': data_feature['max_length'],
                'num_features': data_feature['num_features'],
                'num_classes': data_feature['num_class'],
                "filter_num": data_feature["filter_num"],
                "trainable": False,
                "emb_size": self.emb_size
            }
            if self.multi_label:
                kwargs["use_multi_label"] = True
            self.model = self.model_lib[model_name](**kwargs)
            self._set_init_lr(model_name)
            optimizer, loss_fn, metrics = self._set_model_compile_params(
                optimizer_name='RMSprop', lr=self.lr)
            if self.multi_label:
                # Multi-label output uses independent sigmoids.
                loss_fn = 'binary_crossentropy'
            self.model.compile(loss=loss_fn, optimizer=optimizer,
                               metrics=metrics)
        return self.model

    def _set_loss_fn(self):
        """Default loss for single-label, multi-class training."""
        return 'categorical_crossentropy'

    def _set_optimizer(self, optimizer_name, lr=0.001):
        """Create the optimizer named *optimizer_name* at learning rate *lr*.

        Raises:
            ValueError: for unsupported names (previously this fell through
                and raised an opaque ``UnboundLocalError``).
        """
        if optimizer_name == 'RAdam':
            return RAdam(learning_rate=lr)
        if optimizer_name == 'RMSprop':
            return RMSprop(lr=lr)
        if optimizer_name == "Adam":
            return Adam(lr=lr)
        raise ValueError("unsupported optimizer: %r" % (optimizer_name,))

    def _set_init_lr(self, model_name):
        """Set ``self.lr`` to the architecture's tuned initial learning rate."""
        if model_name in self._INIT_LR:
            self.lr = self._INIT_LR[model_name]

    def model_pre_select(self, model_name="svm"):
        """Record the chosen model name for a later build."""
        self.model_name = model_name

    def generate_emb_matrix(self, num_features, word_index):
        """Build the fastText embedding matrix for the vocabulary.

        Delegates to the module-level ``generate_emb_matrix`` using the
        stored fastText index.  Returns ``(oov_count, embedding_matrix)``.
        """
        return generate_emb_matrix(
            num_features=num_features,
            word_index=word_index,
            fasttext_embeddings_index=self.fasttext_embeddings_index)
class ClassifierSimilarity():
    """Leave-one-patient-out classifier that predicts patient 'matches'.

    Wraps a base classifier (default LogisticRegression) in a
    OneVsRestClassifier and combines PCA-reduced anatomical features with
    TsimModel similarity scores to predict which patients match which.
    """

    def __init__(self, model = None, num_pca_components = 6, max_error = 3, min_matches = 3):
        # Local imports defer the sklearn dependency until the class is used.
        from sklearn.multiclass import OneVsRestClassifier
        if model is None:
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression()
        self.model = OneVsRestClassifier(model)
        self.num_pca_components = num_pca_components  # PCA dims for distance/lymph features
        self.max_error = max_error      # starting dose-error threshold for a "true match"
        self.min_matches = min_matches  # minimum matches required per patient

    def get_similarity(self, data, similarity = None):
        """Predict a patient-by-patient match-probability matrix.

        For each patient p (left out of training), the model is refit once
        per candidate p2 with feature column 0 replaced by the Tsim
        similarity to p2, then the match probability for (p, p2) is
        predicted.  The ``similarity`` parameter is accepted but unused.
        """
        features = self.get_input_features(data)
        # Z-score normalize each feature column.
        features = (features - features.mean(axis=0))/features.std(axis=0)
        # Column 0 is reserved: it is overwritten below with Tsim scores.
        features[:, 0] = 0
        true_matches = self.get_true_matches(data)
        predicted_matches = np.zeros(true_matches.shape)
        tsim_scores = TsimModel().get_similarity(data)
        for p in range(true_matches.shape[0]):
            # Leave patient p out of the training data (np.delete copies).
            train_features = np.delete(features, p, axis = 0)
            train_matches = np.delete(true_matches, p, axis = 0)
            # NOTE(review): this is a *view* into `features`, so writing
            # predict_features[0] below also mutates features[p, 0];
            # harmless only because column 0 is rewritten each iteration.
            predict_features = features[p,:]
            for p2 in range(true_matches.shape[1]):
                if p == p2:
                    continue
                # Inject the Tsim similarity to candidate p2 as feature 0.
                train_features[: , 0] = np.delete(tsim_scores[:, p2], p, axis = 0)
                predict_features[0] = tsim_scores[p,p2]
                y = train_matches[:, p2].reshape(-1,1)
                print(y.shape, ' ', train_features.shape)  # debug output
                self.model.fit(train_features , y)
                # NOTE(review): predict_features is 1-D here; sklearn
                # estimators generally expect a 2-D (1, n_features) array —
                # confirm this call behaves as intended.
                print(self.model.predict_proba(predict_features).shape)
                predicted_matches[p, p2] = self.model.predict_proba(predict_features)
#            predicted_matches[p, p] = 0
            print(len(np.where(predicted_matches[p,:] > .4)[0]))  # debug: matches above .4
        return predicted_matches

    def get_true_matches(self, data):
        """Binary matrix of ground-truth matches based on dose similarity.

        A match is any patient whose mean dose error is below a threshold;
        the threshold starts at ``self.max_error`` and grows by 0.2 until
        at least ``self.min_matches`` patients qualify (the self-match
        counts, since a patient's error to itself is 0).
        """
        dose_error = self.get_match_error(data)
        match_matrix = np.zeros(dose_error.shape)
        n_patients = data.get_num_patients()
        for p in range(n_patients):
            errors = dose_error[p, :]
            matches = []
            max_error = self.max_error
            # Relax the threshold until enough matches are found.
            while len(matches) < self.min_matches:
                matches = np.where(errors < max_error)[0]
                max_error = max_error + .2
            match_matrix[p, matches] = 1
        return match_matrix

    def get_match_error(self, data):
        """Symmetric matrix of mean absolute dose differences between patients."""
        n_patients = data.get_num_patients()
        doses = data.doses
        error_matrix = np.zeros((n_patients, n_patients))
        # Fill the upper triangle only; mirrored below.
        for p1 in range(n_patients):
            for p2 in range(p1 + 1, n_patients):
                dose_difference = np.abs(doses[p1,:] - doses[p2, :])
                error_matrix[p1, p2] = np.mean(dose_difference)
        # Mirror onto the lower triangle; the diagonal stays 0.
        error_matrix += error_matrix.transpose()
        return error_matrix

    def get_input_features(self, data):
        """Assemble the per-patient feature matrix.

        Stacks PCA-reduced tumor distances and lymph-node features, GTVp
        and summed GTVn tumor volumes, prescribed total dose, and encoded
        subsites.  NOTE(review): ``laterality`` and ``clusters`` are
        computed but not included in the returned matrix — confirm this is
        intentional.
        """
        num_patients = data.get_num_patients()
        pca = lambda x: Metrics.pca(x, self.num_pca_components)
        distances = pca(data.tumor_distances)
        lymph_nodes = pca(data.lymph_nodes)
        tumor_volumes = np.zeros((num_patients, 2))
        for i in range(num_patients):
            gtvs = data.gtvs[i]
            gtvp_volume = gtvs[0].volume  # primary tumor volume
            gtvn_volume = 0
            for gtvn in gtvs[1:]:
                gtvn_volume += gtvn.volume  # sum of nodal tumor volumes
            tumor_volumes[i, :] = (gtvp_volume, gtvn_volume)
        laterality = data.lateralities.reshape(num_patients, 1)
        laterality = np.vectorize(TreeSimilarity.laterality_map.__getitem__)(laterality)
        subsites = data.subsites.reshape(num_patients, 1)
        subsites = np.vectorize(TreeSimilarity.subsite_map.__getitem__)(subsites)
        total_doses = data.prescribed_doses.reshape(num_patients, 1)
        clusters = data.classes.reshape(num_patients, 1)
        features = np.hstack([distances, lymph_nodes, tumor_volumes, total_doses, subsites])
        return features

##old code from testing neural nets
def get_autoencoderish_model(features):
    """Build an autoencoder-like Keras model plus its encoder half.

    Returns ``(model, encoder_model)``: ``model`` maps the input through
    the encoder (final Dense(100)) and decoder (final Dense(45)); the
    ``encoder_model`` exposes the 100-dim encoding.  NOTE(review): marked
    as old experimental code above; left unchanged.
    """
    input_x = Input(shape=(features.shape[1],))
    encoder = Sequential([
        Dense(45, input_dim=features.shape[1], activation = 'relu'),
        Dense(100, activation = 'relu'),
        Dense(100, activation = 'relu'),
        Dense(100, activation = 'relu'),
        ])(input_x)
    decoder = Sequential([
        Dense(100,input_dim = 4, activation = 'relu', activity_regularizer = regularizers.l2(.01)),
        Dense(45, activation = 'relu'),
        ])(encoder)
    model = Model(input_x, decoder)
    encoder_model= Model(input_x, encoder)
#    optimizer = optimizers.SGD(lr = .01, decay = 1e-12, momentum = .1)
    optimizer = optimizers.Adam()
    model.compile(loss = losses.mean_absolute_error, optimizer = optimizer)
    return(model, encoder_model)