import unittest

# Doubler is project-local; its import is elided here.


class TestDoubler(unittest.TestCase):
    def setUp(self):
        self.doubler = Doubler()

    def test_doubles(self):
        self.assertEqual(4, self.doubler.double(2))
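# A minimal entry point so the suite can also be run directly (e.g. with
# `python test_doubler.py`; the filename is an assumption), in addition to
# the usual `python -m unittest` discovery:
if __name__ == '__main__':
    unittest.main()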
import numpy as np

# pref_mat, generator_fnc, f_rmed, regret_fn, the algorithm classes
# (RMED, InterleavedFilter, BeatTheMean, Doubler, SAVAGE) and the helpers
# run_rucb, run_rcs and dump_all are defined elsewhere in this script.


def gen():
    d = {
        "RMED1_regrets": [],
        "RMED2_regrets": [],
        "IF_regrets": [],
        "BTM_regrets": [],
        "SAVAGE_regrets": [],
        "DOUBLER_regrets": []
    }
    horizon = 100000
    samples = 10
    for i in range(samples):
        print("On sample number", i)

        x = RMED('RMED1', len(pref_mat), horizon, generator_fnc, f_rmed,
                 regret_fn)
        # element [0] holds the regret values, [1] the best arm
        reg_RMED = np.array(x.algo())
        print("RMED done, best arm : ", reg_RMED[1])
        d["RMED1_regrets"].append(list(np.around(reg_RMED[0], 3)))
        # json.dump(RMED1_regrets, open("RMED1_regrets.json", "w"))

        x = RMED('RMED2', len(pref_mat), horizon, generator_fnc, f_rmed,
                 regret_fn)
        reg_RMED = np.array(x.algo())
        print("RMED done, best arm : ", reg_RMED[1])
        d["RMED2_regrets"].append(list(np.around(reg_RMED[0], 3)))
        # json.dump(RMED2_regrets, open("RMED2_regrets.json", "w"))

        x = InterleavedFilter(len(pref_mat), horizon, generator_fnc,
                              regret_fn)
        reg_IF = np.array(x.algo())
        print("IF done, best arm : ", reg_IF[1])
        d["IF_regrets"].append(list(np.around(reg_IF[0], 3)))
        # json.dump(IF_regrets, open("IF_regrets.json", "w"))

        x = BeatTheMean(len(pref_mat), horizon, generator_fnc, regret_fn)
        reg_BTM = np.array(x.algo())
        print("BTM done, best arm : ", reg_BTM[1])
        d["BTM_regrets"].append(list(np.around(reg_BTM[0], 3)))
        # json.dump(BTM_regrets, open("BTM_regrets.json", "w"))

        x = Doubler(horizon, pref_mat, regret_fn)
        reg_DOUBLER = x.run()
        print("Doubler done, best arm : ", reg_DOUBLER[1])
        d["DOUBLER_regrets"].append(reg_DOUBLER[0])
        # json.dump(DOUBLER_regrets, open("DOUBLER_regrets.json", "w"))

        x = SAVAGE(horizon, pref_mat, regret_fn)
        reg_SAVAGE = x.run()
        print("SAVAGE done, best arm : ", reg_SAVAGE[1])
        d["SAVAGE_regrets"].append(reg_SAVAGE[0])
        # json.dump(SAVAGE_regrets, open("SAVAGE_regrets.json", "w"))

        # checkpoint intermediate results every third sample
        if i % 3 == 0:
            print("Dumped all")
            dump_all(d)

    dump_all(d)
    d['RUCB_regrets'] = run_rucb(samples, horizon, pref_mat)
    print("RUCB done")
    d['RCS_regrets'] = run_rcs(samples, horizon, pref_mat)
    print("RCS done")
    dump_all(d)
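# gen() checkpoints through dump_all, which is not defined in this excerpt.
# A minimal sketch, assuming it mirrors the commented-out json.dump calls
# above and writes one "<key>.json" file per algorithm; the filename pattern
# is an assumption, not the project's confirmed layout.
import json


def dump_all(d):
    # each entry holds one list of rounded regret values per sample
    for key, regrets in d.items():
        with open(key + ".json", "w") as fp:
            json.dump(regrets, fp)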
import numpy as np
import tqdm
from numpy.random import RandomState

from keras import backend as K
from keras import losses, optimizers
from keras.layers import Activation, Average, Input
from keras.models import Model

# InputParamTuple, Data, Graph, Doubler and utils are project-local and
# assumed to be imported from elsewhere in the repository.


class MAIN:
    def __init__(self,
                 batch_size: int = 256,
                 epochs: int = 100,
                 activation_fn: str = 'softmax',
                 num_negative: int = 500,
                 optimizer: optimizers = optimizers.Adam(),
                 loss: losses = 'categorical_crossentropy',
                 validation_step: int = 1,
                 normalize_score: bool = False,
                 bow_feature=None,
                 vocabulary=None):
        self._model_params = InputParamTuple(num_negative=num_negative,
                                             batch_size=batch_size,
                                             epochs=epochs,
                                             validation_step=validation_step,
                                             optimizer=optimizer,
                                             loss=loss,
                                             activation_fn=activation_fn,
                                             normalize=normalize_score)  # TODO
        self._data = None
        self._model = None
        self._best_model = None
        self._normalize_score = normalize_score
        self._doubler = Doubler(embedding_node_dim=300,
                                embedding_doc_dim=64,
                                bow_feature=bow_feature,
                                vocabulary=vocabulary)

    def fit_and_test(self, triples_train: np.array,
                     triples_validation: np.array,
                     triples_validation_neg: np.array, test_func, test_data,
                     test_neg_data) -> list:
        self.__load_graph(triples_train, triples_validation)
        self.__build_model()
        return self.__train_and_test(triples_validation_neg, test_func,
                                     test_data, test_neg_data)

    def predict(self, triples_test: np.array):
        known_triples = utils.get_known_triples(self._data, self._gp)
        triples = utils.extract_triples(triples_test,
                                        self._gp.vertex_indexer,
                                        self._gp.relation_indexer)
        known_triples = known_triples.union(triples)
        # for each test triple, score both the head and the tail slot
        nodes_idx = []
        relations_idx = []
        for triple in triples_test:
            nodes_idx.append(self._gp.vertex_indexer[triple[0]])
            nodes_idx.append(self._gp.vertex_indexer[triple[2]])
            relations_idx.append(self._gp.relation_indexer[triple[1]])
            relations_idx.append(self._gp.relation_indexer[triple[1]])
        ranks, scores = self.__predict_nodes(nodes_idx, relations_idx,
                                             known_triples)
        return ranks, scores, self._gp.vertices

    def __load_graph(self, triples_train: np.array,
                     triples_validation: np.array):
        # build graph
        self.__build_graph(triples_train)
        # convert training triples to index triples
        triples_train_idx = [[
            self._gp.vertex_indexer[triple[0]],
            self._gp.relation_indexer[triple[1]],
            self._gp.vertex_indexer[triple[2]]
        ] for triple in triples_train]
        # build data structure
        self._data = Data(triples_train=triples_train,
                          triples_train_idx=triples_train_idx,
                          triples_validation=triples_validation)

    def __build_graph(self, triples_train):
        nodes = list(set(triples_train[:, 0]).union(triples_train[:, 2]))
        nodes.sort()
        node_indexer = {node: idx for idx, node in enumerate(nodes)}
        relations = list(set(triples_train[:, 1]))
        relations.sort()
        relation_indexer = {rel: idx for idx, rel in enumerate(relations)}
        num_nodes = len(node_indexer)
        num_relations = len(relation_indexer)
        index_node = {idx: vertex for vertex, idx in node_indexer.items()}
        index_relation = {
            idx: relation
            for relation, idx in relation_indexer.items()
        }
        self._gp = Graph(vertices=nodes,
                         vertex_indexer=node_indexer,
                         num_vertices=num_nodes,
                         index_vertex=index_node,
                         relations=relations,
                         relation_indexer=relation_indexer,
                         index_relation=index_relation,
                         num_relations=num_relations)

    def __build_model(self):
        input_layer_positive = Input(shape=(1, ), name='input_node_positive')
        input_layer_negative = Input(
            shape=(self._model_params.num_negative + 1, ),
            name='input_node_negative')
        input_layer_relation = Input(shape=(1, ), name='input_relation')

        # keep track of all input and output layers
        input_layer_list = [
            input_layer_positive, input_layer_negative, input_layer_relation
        ]
        output_layer_list = []

        # build
        input_layers, score_layer, l2_offset_layer = self._doubler.build_model(
            self._model_params, self._gp, input_layer_positive,
            input_layer_negative, input_layer_relation)
        output_layer_list.append(score_layer)
        input_layer_list.extend(input_layers)

        # combine score layers
        if len(output_layer_list) > 1:
            score_layer_total = Average()(output_layer_list)
        else:
            score_layer_total = output_layer_list[0]

        # build model
        score_layer_total = Activation(
            self._model_params.activation_fn,
            name='output_overall_score')(score_layer_total)
        if l2_offset_layer is not None:
            l2_offset_layer = Activation(
                'relu', name='output_overall_offset')(l2_offset_layer)
            self._model = Model(inputs=input_layer_list,
                                outputs=[score_layer_total, l2_offset_layer])
            self._model.compile(loss=self.__custom_loss,
                                optimizer=self._model_params.optimizer)
        else:
            self._model = Model(inputs=input_layer_list,
                                outputs=[score_layer_total])
            self._model.compile(loss=self._model_params.loss,
                                optimizer=self._model_params.optimizer)

        # print model
        self._model.summary()

    def __custom_loss(self, y_true, y_pred):
        if 'offset' in y_pred.name:
            # scale the absolute offset down and cap it so it cannot dominate
            result = K.sum(K.abs(y_true - y_pred)) / 1000000.0
            return K.minimum(result, 10)
        else:
            return K.categorical_crossentropy(y_true, y_pred)

    def __train_and_test(self, triples_validation_neg, test_func, test_data,
                         test_neg_data) -> list:
        result_files = list()
        generator_token = self.__generator()
        steps_per_epoch = int(
            len(self._data.triples_train_idx) / self._model_params.batch_size)
        for epoch in range(self._model_params.epochs):
            print('Epoch %s/%s' % ((epoch + 1), self._model_params.epochs))
            self._model.fit_generator(generator_token,
                                      steps_per_epoch,
                                      epochs=1,
                                      shuffle=False,
                                      workers=1,
                                      use_multiprocessing=False)
            if (epoch + 1) % self._model_params.validation_step == 0:
                self._validation(self._data.triples_validation,
                                 triples_validation_neg)
                result_file = test_func(self, test_data, test_neg_data)
                result_files.append(result_file)
        return result_files

    def _validation(self, valid_data, valid_neg_data):
        val_disease_dict = dict()
        val_neg_disease_dict = dict()
        recall_scores = list()
        genes_neg = set(valid_neg_data[:, 0])
        for gene, relation, disease in valid_data:
            if disease not in val_disease_dict:
                val_disease_dict[disease] = set()
            val_disease_dict[disease].add(gene)
        for gene, relation, disease in valid_neg_data:
            if disease not in val_neg_disease_dict:
                val_neg_disease_dict[disease] = set()
            val_neg_disease_dict[disease].add(gene)
        for disease, values in tqdm.tqdm(val_disease_dict.items(),
                                         total=len(val_disease_dict.keys()),
                                         desc='> Validation'):
            sample = np.array(
                [[list(values)[0], 'gene_associated_with_disease', disease]])
            ranks, scores, nodes = self.predict(sample)
            result = list(zip(nodes, scores[1]))
            result.sort(key=lambda x: x[1], reverse=True)
            result_filtered = list()
            for entry in result:
                if entry[0] in genes_neg or entry[0] in values:
                    result_filtered.append(entry)
            # Recall-at-k; cap at the number of filtered candidates so short
            # lists do not raise an IndexError
            recall_at_100 = 0
            for idx in range(min(100, len(result_filtered))):
                gene = result_filtered[idx][0]
                if gene in values:
                    recall_at_100 += 1
            recall_score = recall_at_100 / len(values)
            recall_scores.append(recall_score)
        print('\nRecall-AT-100 (Mean) = ' + str(np.mean(recall_scores)) +
              ' | Recall-AT-100 (Std) = ' + str(np.std(recall_scores)))

    def __predict_nodes(self, nodes_idx: list, relations_idx: list,
                        known_triples: set):
        scores = np.zeros((len(nodes_idx), self._gp.num_vertices))
        doubler_score_node, doubler_score_doc = self._doubler.predict_nodes(
            nodes_idx, relations_idx)
        if self._normalize_score:
            doubler_score_node = utils.sigmoid(doubler_score_node)
            if doubler_score_doc is not None:
                doubler_score_doc = utils.sigmoid(doubler_score_doc)
        scores = np.sum([scores, doubler_score_node], axis=0)
        if doubler_score_doc is not None:
            scores = np.sum([scores, doubler_score_doc], axis=0)
        ranks = []
        num_scores = len(scores)
        for idx, row in tqdm.tqdm(enumerate(scores),
                                  total=num_scores,
                                  desc='> Compute Ranking'):
            # rows alternate between head and tail predictions per triple
            is_head = idx % 2 == 0
            node_idx_given = nodes_idx[idx]
            relation_idx_given = relations_idx[idx]
            node_idx_wanted = nodes_idx[idx +
                                        1] if is_head else nodes_idx[idx - 1]
            threshold_lower = row[node_idx_wanted]
            # filtered rank: higher-scoring nodes only count if the resulting
            # triple is not already known
            rank_cleaned = 1
            for node_idx, score in enumerate(row):
                if score <= threshold_lower:
                    continue
                elif is_head and not (node_idx_given, relation_idx_given,
                                      node_idx) in known_triples:
                    rank_cleaned += 1
                elif not is_head and not (node_idx, relation_idx_given,
                                          node_idx_given) in known_triples:
                    rank_cleaned += 1
            ranks.append(rank_cleaned)
        return ranks, scores

    def __generator(self):
        steps_per_epoch = int(
            len(self._data.triples_train_idx) / self._model_params.batch_size)
        while True:
            for idx in range(steps_per_epoch):
                input_dict = self.__generate_data(idx, steps_per_epoch)
                # build batch in the order of the model's input layers
                batch = []
                for input_layer in self._model.inputs:
                    layer_name = input_layer.name
                    # strip the ':0' tensor suffix from the layer name
                    layer_name = layer_name[:layer_name.rfind(':')]
                    batch.append(input_dict[layer_name])
                # perfect result
                y1 = self.__generate_output_data(
                    input_dict['input_relation'].shape[0])
                y2 = np.zeros(len(y1))
                y = [y1, y2]
                yield batch, y

    def __generate_output_data(self, num_triples: int):
        y = [0] * (self._model_params.num_negative + 1)
        y[0] = 2  # because we have 1+1 (node+doc)
        y = np.array([y] * num_triples)
        return y

    def __generate_data(self, batch_idx: int, num_batches: int):
        node_train_idx_positive = []
        node_train_idx_negatives = []
        relation_train_idx = []
        generated_training_data = {}
        batch_size = self._model_params.batch_size
        num_negative = self._model_params.num_negative
        # the last batch absorbs the remainder of the training triples
        if batch_idx == num_batches - 1:
            triples_train_idx_batch = self._data.triples_train_idx[
                batch_idx * batch_size:]
        else:
            triples_train_idx_batch = self._data.triples_train_idx[
                batch_idx * batch_size:(batch_idx + 1) * batch_size]
        self._doubler.init_batch_triples(self._model_params, batch_idx,
                                         num_batches, triples_train_idx_batch)
        # seed per batch so negative sampling is reproducible
        np_random = RandomState(batch_idx)
        random_indices = np_random.randint(
            self._gp.num_vertices,
            size=(2 * len(triples_train_idx_batch), num_negative))
        for idx, triple_idx in enumerate(triples_train_idx_batch):
            relation_train_idx.append(triple_idx[1])
            relation_train_idx.append(triple_idx[1])
            node_train_idx_positive.append([triple_idx[2]])
            node_train_idx_positive.append([triple_idx[0]])
            # deterministic substitute for negatives that collide with the
            # positive node
            replacement = int((triple_idx[0] + triple_idx[2]) / 2)
            neg_head_replacement = replacement - 1 if replacement > 0 else replacement
            head_idx_negatives = random_indices[idx]
            head_idx_negatives = np.where(head_idx_negatives == triple_idx[0],
                                          neg_head_replacement,
                                          head_idx_negatives)
            head_idx_negatives = np.insert(head_idx_negatives, 0,
                                           triple_idx[0])
            node_train_idx_negatives.append(head_idx_negatives)
            neg_tail_replacement = replacement + 1 if replacement < (
                self._gp.num_vertices - 1) else replacement
            # tail negatives come from the second half of random_indices
            tail_idx_negatives = random_indices[len(triples_train_idx_batch) +
                                                idx]
            tail_idx_negatives = np.where(tail_idx_negatives == triple_idx[2],
                                          neg_tail_replacement,
                                          tail_idx_negatives)
            tail_idx_negatives = np.insert(tail_idx_negatives, 0,
                                           triple_idx[2])
            node_train_idx_negatives.append(tail_idx_negatives)
        training_data = self._doubler.generate_training_data(
            node_train_idx_positive, node_train_idx_negatives)
        generated_training_data.update(training_data)
        # store default input data
        generated_training_data.update(
            {'input_node_positive': np.array(node_train_idx_positive)})
        generated_training_data.update(
            {'input_node_negative': np.array(node_train_idx_negatives)})
        generated_training_data.update(
            {'input_relation': np.array(relation_train_idx)})
        return generated_training_data
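# A minimal usage sketch for MAIN, assuming (N, 3) numpy arrays of
# (head, relation, tail) string triples; `evaluate`, `triples_train`,
# `triples_val`, `triples_val_neg`, `test_data` and `test_neg_data` are
# hypothetical placeholders for the caller's own data and test callback,
# not names defined in this file.
model = MAIN(batch_size=128, epochs=10, num_negative=100)
result_files = model.fit_and_test(triples_train, triples_val,
                                  triples_val_neg, evaluate, test_data,
                                  test_neg_data)
ranks, scores, vertices = model.predict(test_data)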