def evaluate(self, x, y):
    xf = x[y > 0]
    yf = y[y > 0]
    print("Starting Kmeans clustering..")
    clustering = KMeans(n_clusters=100)
    predictions_minibatch = clustering.fit_predict(xf)
    print("done Kmeans clustering..")
    print("homogeneity score = %s" %
          metrics.homogeneity_score(yf, predictions_minibatch))
    e = ClusterEvaluation(yf, predictions_minibatch)
    m = e.printEvaluation()
    return m  # return the metrics dict, mirroring cluster() below
def cluster(self, x, clustering=None, n_clusters=100, labels=None):
    if labels is None:
        labels = self.labels
    # print("Starting Kmeans clustering..")
    if clustering is None:
        clustering = KMeans(n_clusters=n_clusters)
    pred = clustering.fit_predict(x)
    # print("done Kmeans clustering..")
    self.clusters = pred
    e = ClusterEvaluation(labels, pred)
    m = e.printEvaluation()
    return m
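# Usage sketch (illustrative, not part of the original code): both methods above
# assume they live on an evaluator object that stores gold labels in
# `self.labels`. `evaluator` and `features` below are hypothetical names for such
# an instance and an (n_samples, n_features) numpy array:
#   metrics_dict = evaluator.cluster(features, n_clusters=50)
#   evaluator.evaluate(features, labels)   # KMeans on the labeled subset only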
def train_SN(train_data_file, val_data_file, test_data_file, wordvec_file,
             load_model_name=None, save_model_name='SN',
             trainset_loss_type='triplet', testset_loss_type='none',
             testset_loss_mask_epoch=3, p_cond=0.03, p_denoise=1.0,
             rel2id_file=None, similarity_file=None, dynamic_margin=True,
             margin=1.0, louvain_weighted=False, level_train=False,
             shallow_to_deep=False, same_level_pair_file=None, max_len=120,
             pos_emb_dim=5, same_ratio=0.06, batch_size=64, batch_num=10000,
             epoch_num=1, val_size=10000, select_cluster=None, omit_relid=None,
             labeled_sample_num=None, squared=True, same_level_part=None,
             mask_same_level_epoch=1, same_v_adv=False, random_init=False,
             seed=42, K_num=4, evaluate_hierarchy=False,
             train_for_cluster_file=None, train_structure_file=None,
             all_structure_file=None, to_cluster_data_num=100,
             incre_threshold=0, iso_threshold=5, avg_link_increment=True,
             modularity_increment=False):
    """Train the RSN encoder and, at validation-selected checkpoints, run
    top-down hierarchy Louvain clustering on the test relations."""
    # preparing saving files.
    if select_cluster is None:
        select_cluster = ['Louvain']
    if load_model_name is not None:
        load_path = os.path.join('model_file', load_model_name).replace('\\', '/')
    else:
        load_path = None
    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=['train_data_file', 'val_data_file', 'test_data_file',
                            'load_model_name', 'save_model_name',
                            'trainset_loss_type', 'testset_loss_type',
                            'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                            'same_ratio', 'labeled_sample_num'],
                     json_name='train_msg.json')
    msger.record_message([train_data_file, val_data_file, test_data_file,
                          load_model_name, save_model_name, trainset_loss_type,
                          testset_loss_type, testset_loss_mask_epoch, p_cond,
                          p_denoise, same_ratio, labeled_sample_num])
    msger.save_json()

    print('-----Data Loading-----')
    # for train
    dataloader_train = dataloader(train_data_file, wordvec_file, rel2id_file,
                                  similarity_file, same_level_pair_file,
                                  max_len=max_len, random_init=random_init,
                                  seed=seed)
    # for clustering never-seen instances
    dataloader_train_for_cluster = dataloader(train_for_cluster_file,
                                              wordvec_file, rel2id_file,
                                              similarity_file,
                                              same_level_pair_file,
                                              max_len=max_len,
                                              random_init=random_init,
                                              seed=seed)
    # for validation, to select the best model
    dataloader_val = dataloader(val_data_file, wordvec_file, rel2id_file,
                                similarity_file, max_len=max_len)
    # for clustering
    dataloader_test = dataloader(test_data_file, wordvec_file, rel2id_file,
                                 similarity_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    rsn = RSN(word_vec_mat=word_vec_mat, max_len=max_len,
              pos_emb_dim=pos_emb_dim, dropout=0.2)
    if load_path:
        rsn.load_model(load_path)
    rsn = cudafy(rsn)
    rsn.set_train_op(batch_size=batch_size, train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type, p_cond=p_cond,
                     p_denoise=p_denoise, p_mult=0.02, squared=squared,
                     margin=margin)

    print('-----Validation Data Preparing-----')
    val_data, val_data_label = dataloader_val._part_data_(100)

    print('-----Clustering Data Preparing-----')
    train_hierarchy_structure_info = json.load(open(train_structure_file))
    all_hierarchy_structure_info = json.load(open(all_structure_file))
    train_hierarchy_cluster_list, gt_hierarchy_cluster_list, train_data_num, \
        test_data_num, train_data, train_label, test_data, test_label = \
        prepare_cluster_list(dataloader_train_for_cluster, dataloader_test,
                             train_hierarchy_structure_info,
                             all_hierarchy_structure_info, to_cluster_data_num)

    batch_num_list = [batch_num] * epoch_num
    # start_cluster_accuracy = 0.5
    best_validation_f1 = 0
    least_epoch = 1
    best_step = 0
    for epoch in range(epoch_num):
        msger = messager(save_path=save_path,
                         types=['batch_num', 'train_tp', 'train_fp', 'train_fn',
                                'train_tn', 'train_l', 'test_tp', 'test_fp',
                                'test_fn', 'test_tn', 'test_l'],
                         json_name='SNmsg' + str(epoch) + '.json')
        # for cluster
        # test_data, test_data_label = dataloader_test._data_()
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(1, batch_num_list[epoch] + 1):
            to_cluster_flag = False
            # training
            if trainset_loss_type.startswith("triplet"):
                if level_train and epoch < mask_same_level_epoch:
                    if i <= 1 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train, batch_size=batch_size, K_num=4,
                            dynamic_margin=dynamic_margin, level=1,
                            same_v_adv=same_v_adv)
                    elif i <= 2 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train, batch_size=batch_size, K_num=4,
                            dynamic_margin=dynamic_margin, level=2,
                            same_v_adv=same_v_adv)
                    else:
                        rsn.train_triplet_loss(dataloader_train,
                                               batch_size=batch_size,
                                               dynamic_margin=dynamic_margin)
                else:
                    rsn.train_triplet_loss(dataloader_train,
                                           batch_size=batch_size,
                                           dynamic_margin=dynamic_margin)
            else:
                rsn.train_RSN(dataloader_train, dataloader_test,
                              batch_size=batch_size)

            if i % 100 == 0:
                print('temp_batch_num: ', i,
                      ' total_batch_num: ', batch_num_list[epoch])
            if i % 1000 == 0 and epoch >= least_epoch:
                print(save_model_name, 'epoch:', epoch)
                print('Validation:')
                cluster_result, cluster_msg = Louvain_no_isolation(
                    dataset=val_data, edge_measure=rsn.pred_X,
                    weighted=louvain_weighted)
                cluster_eval_b3 = ClusterEvaluation(
                    val_data_label, cluster_result).printEvaluation(
                        print_flag=False)
                cluster_eval_new = ClusterEvaluationNew(
                    val_data_label, cluster_result).printEvaluation(
                        print_flag=False)
                two_f1 = cluster_eval_new['F1']
                if two_f1 > best_validation_f1:  # acc
                    to_cluster_flag = True
                    best_step = i
                    best_validation_f1 = two_f1

            if to_cluster_flag:
                # if True:
                if 'Louvain' in select_cluster:
                    print('-----Top Down Hierarchy Louvain Clustering-----')
                    if avg_link_increment:
                        # link_th_list = [0.5, 1, 2, 5, 10, 15, 20, 50, 100]
                        # link_th_list = [0.05, 0.08, 0.1, 0.12, 0.15, 0.18, 0.2, 0.3, 0.4]
                        # link_th_list = [i * 0.02 for i in range(1, 100)]
                        link_th_list = [0.05]
                        cluster_result, cluster_msg = Louvain_no_isolation(
                            dataset=test_data, edge_measure=rsn.pred_X,
                            weighted=louvain_weighted)
                        # predicted_cluster_dict_list = Louvain_with_test_cluster_done_avg_link_list(
                        predicted_cluster_dict_list = Top_Down_Louvain_with_test_cluster_done_avg_link_list(
                            cluster_result, train_data_num, test_data_num,
                            train_data, test_data,
                            train_hierarchy_cluster_list, rsn.pred_X,
                            link_th_list)
                        best_hyper_score = 0
                        best_eval_info = None
                        for predicted_cluster_dict in predicted_cluster_dict_list:
                            predicted_cluster_list = predicted_cluster_dict['list']
                            evaluation = HierarchyClusterEvaluation(
                                gt_hierarchy_cluster_list,
                                predicted_cluster_list, test_data_num)
                            eval_info = evaluation.printEvaluation()
                            if eval_info['total_F1'] > best_hyper_score:
                                best_eval_info = eval_info
                                best_hyper_score = eval_info['total_F1']
                rsn.save_model(save_path=save_path,
                               global_step=i + epoch * batch_num)
                print('model and clustering messages saved.')

    print('End: The model is:', save_model_name, trainset_loss_type,
          testset_loss_type, 'p_cond is:', p_cond)
    print(seed)
    print("best step:", best_step)
    print("new metric Info:")
    print("F1(%)")
    print(best_eval_info['match_f1'] * 100)
    print("taxonomy Info:")
    print("Precision(%); Recall(%); F1(%)")
    print(round(best_eval_info['taxonomy_precision'] * 100, 3), "; ",
          round(best_eval_info['taxonomy_recall'] * 100, 3), "; ",
          round(best_eval_info['taxonomy_F1'] * 100, 3))
    print("Total Info:")
    print("Precision(%); Recall(%); F1(%)")
    print(round(best_eval_info['total_precision'] * 100, 3), "; ",
          round(best_eval_info['total_recall'] * 100, 3), "; ",
          round(best_eval_info['total_F1'] * 100, 3))
def train_CNN(train_data_file, test_data_file, wordvec_file, load_model_name,
              save_model_name, loss_type, max_len=120, pos_emb_dim=5,
              batch_size=100, batch_num=1000, epoch_num=1, val_size=1000):
    """Train the supervised CNN baseline and cluster the test set with its
    predicted relation labels."""
    # preparing saving files
    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file, wordvec_file, max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    cnn = CNN(session=sess, word_vec_mat=word_vec_mat, max_len=max_len,
              pos_emb_dim=pos_emb_dim, dropout=0.2)
    cnn.set_ph(batch_size=batch_size)
    cnn.set_train_op(loss_type=loss_type, p_mult=0.02)
    cnn.init_model()

    print('-----Testing Data Preparing-----')
    # preparing testing samples
    val_testset_input, val_testset_label = dataloader_test.next_batch_cnn(val_size)
    val_trainset_input, val_trainset_label = dataloader_train.next_batch_cnn(val_size)

    # initializing parameters (one entry per epoch, as in the other training scripts)
    batch_num_list = [batch_num] * epoch_num
    for epoch in range(epoch_num):
        # preparing message lists
        msger = messager(save_path=save_path,
                         types=['batch_num', 'train_acc', 'train_l',
                                'test_acc', 'test_l'],
                         json_name='CNNmsg' + str(epoch) + '.json')
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(batch_num_list[epoch]):
            # training
            cnn.train(dataloader_train)
            # testing and saving
            if i % 10 == 0:
                print('temp_batch_num: ', i,
                      ' total_batch_num: ', batch_num_list[epoch])
            if i % 100 == 0:
                print('model_name', save_model_name)
                print('trainset:')
                val_trainset_info = cnn.validation(val_trainset_input,
                                                   val_trainset_label)
                print('testset:')
                val_testset_info = cnn.validation(val_testset_input,
                                                  val_testset_label)
                msger.record_message((i,) + val_trainset_info + val_testset_info)
                msger.save_json()
                cnn.save_model(save_path=save_path, global_step=i)
                print('model and messages saved.')

        # Clustering
        print('Data to cluster loading...')
        msger = messager(save_path=save_path,
                         types=['method', 'F1', 'precision', 'recall', 'msg'],
                         json_name='cluster_msg' + str(epoch) + '.json')
        data_to_cluster, gt = dataloader_test._data_()
        for i, item in enumerate(gt):
            gt[i] = dataloader_test.relid_dict[item]
        print('-----CNN Clustering-----')
        cluster_result = cnn.pred_X(data_to_cluster)
        cluster_result = np.squeeze(cluster_result).tolist()
        cluster_msg = create_msg(cluster_result)
        print('Evaluating...')
        cluster_eval = ClusterEvaluation(gt, cluster_result).printEvaluation()
        msger.record_message(['CNN', cluster_eval['F1'],
                              cluster_eval['precision'],
                              cluster_eval['recall'], cluster_msg])
        msger.save_json()
        print(cluster_eval)
        print('clustering messages saved.')

    print("-----End-----")
    print("The model name is:", save_model_name)
    print("loss type is:", loss_type)
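# Example invocation (illustrative sketch; the paths are hypothetical placeholders):
def _example_train_CNN():
    train_CNN(train_data_file='data/train.json',
              test_data_file='data/test.json',
              wordvec_file='data/word_vec.json',
              load_model_name=None,
              save_model_name='CNN_baseline',
              loss_type='cross',
              batch_num=1000,
              epoch_num=1)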
def train_SN(train_data_file, val_data_file, test_data_file, wordvec_file,
             load_model_name=None, save_model_name='SN',
             trainset_loss_type='triplet', testset_loss_type='none',
             testset_loss_mask_epoch=3, p_cond=0.03, p_denoise=1.0,
             rel2id_file=None, similarity_file=None, dynamic_margin=True,
             margin=1.0, louvain_weighted=False, level_train=False,
             shallow_to_deep=False, same_level_pair_file=None, max_len=120,
             pos_emb_dim=5, same_ratio=0.06, batch_size=64, batch_num=10000,
             epoch_num=1, val_size=10000, select_cluster=None, omit_relid=None,
             labeled_sample_num=None, squared=True, same_level_part=None,
             mask_same_level_epoch=1, same_v_adv=False, random_init=False,
             seed=42, K_num=4, evaluate_hierarchy=False, gt_hierarchy_file=None):
    """Train the RSN encoder and report flat Louvain clustering on the test set;
    returns the best new-metric and B3 evaluations found during training."""
    # preparing saving files.
    if select_cluster is None:
        select_cluster = ['Louvain']
    if load_model_name is not None:
        load_path = os.path.join('model_file', load_model_name).replace('\\', '/')
    else:
        load_path = None
    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=['train_data_file', 'val_data_file', 'test_data_file',
                            'load_model_name', 'save_model_name',
                            'trainset_loss_type', 'testset_loss_type',
                            'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                            'same_ratio', 'labeled_sample_num'],
                     json_name='train_msg.json')
    msger.record_message([train_data_file, val_data_file, test_data_file,
                          load_model_name, save_model_name, trainset_loss_type,
                          testset_loss_type, testset_loss_mask_epoch, p_cond,
                          p_denoise, same_ratio, labeled_sample_num])
    msger.save_json()

    # if not trainset_loss_type.startswith("triplet"):
    #     batch_size = 100

    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file, wordvec_file, rel2id_file,
                                  similarity_file, same_level_pair_file,
                                  max_len=max_len, random_init=random_init,
                                  seed=seed)
    dataloader_val = dataloader(val_data_file, wordvec_file, rel2id_file,
                                similarity_file, max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, rel2id_file,
                                 similarity_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    rsn = RSN(word_vec_mat=word_vec_mat, max_len=max_len,
              pos_emb_dim=pos_emb_dim, dropout=0.2)
    if load_path:
        rsn.load_model(load_path)
    rsn = cudafy(rsn)
    rsn.set_train_op(batch_size=batch_size, train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type, p_cond=p_cond,
                     p_denoise=p_denoise, p_mult=0.02, squared=squared,
                     margin=margin)

    print('-----Validation Data Preparing-----')
    val_data, val_data_label = dataloader_val._part_data_(100)

    # initializing parameters
    batch_num_list = [batch_num] * epoch_num
    # clustering_test_time = np.arange(19999, batch_num, 20000).tolist()
    msger_cluster = messager(save_path=save_path,
                             types=['method', 'temp_batch_num', 'F1',
                                    'precision', 'recall', 'msg'],
                             json_name='cluster_msg.json')
    # best_validation_accuracy = 0.9
    least_epoch = 1
    best_step = 0
    print_flag = True
    best_validation_f1 = 0
    for epoch in range(epoch_num):
        test_data, test_data_label = dataloader_test._data_()
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(1, batch_num_list[epoch] + 1):
            to_cluster_flag = False
            # training
            if trainset_loss_type.startswith("triplet"):
                if level_train and epoch < mask_same_level_epoch:
                    if i <= 1 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train, batch_size=batch_size, K_num=4,
                            dynamic_margin=dynamic_margin, level=1,
                            same_v_adv=same_v_adv)
                    elif i <= 2 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train, batch_size=batch_size, K_num=4,
                            dynamic_margin=dynamic_margin, level=2,
                            same_v_adv=same_v_adv)
                    else:
                        rsn.train_triplet_loss(dataloader_train,
                                               batch_size=batch_size,
                                               dynamic_margin=dynamic_margin)
                else:
                    rsn.train_triplet_loss(dataloader_train,
                                           batch_size=batch_size,
                                           dynamic_margin=dynamic_margin)
            else:
                rsn.train_RSN(dataloader_train, dataloader_test,
                              batch_size=batch_size)

            if i % 500 == 0:
                print('temp_batch_num: ', i,
                      ' total_batch_num: ', batch_num_list[epoch])
            if i % 1000 == 0 and epoch >= least_epoch:
                print(save_model_name, 'epoch:', epoch)
                print('Validation:')
                cluster_result, cluster_msg = Louvain_no_isolation(
                    dataset=val_data, edge_measure=rsn.pred_X,
                    weighted=louvain_weighted)
                cluster_eval_new = ClusterEvaluationNew(
                    val_data_label, cluster_result).printEvaluation(
                        print_flag=False)
                cluster_eval_b3 = ClusterEvaluation(
                    val_data_label, cluster_result).printEvaluation(
                        print_flag=False)
                # two_f1 = cluster_eval_new['F1'] + cluster_eval_b3['F1']
                two_f1 = cluster_eval_b3['F1']
                if two_f1 > best_validation_f1:  # acc
                    to_cluster_flag = True
                    best_step = i
                    best_validation_f1 = two_f1

            if to_cluster_flag:
                if 'Louvain' in select_cluster:
                    print('-----Louvain Clustering-----')
                    if not evaluate_hierarchy:
                        cluster_result, cluster_msg = Louvain_no_isolation(
                            dataset=test_data, edge_measure=rsn.pred_X,
                            weighted=louvain_weighted)
                        cluster_eval_new = ClusterEvaluationNew(
                            test_data_label, cluster_result).printEvaluation(
                                print_flag=print_flag)
                        # msger_cluster.record_message(['Louvain_New', i, cluster_eval_new['F1'], cluster_msg])
                        # print("New Metric", cluster_eval)
                        cluster_eval_b3 = ClusterEvaluation(
                            test_data_label, cluster_result).printEvaluation(
                                print_flag=print_flag, extra_info=True)
                        # msger_cluster.record_message(['Louvain', i, cluster_eval_b3['F1'], cluster_eval_b3['precision'],
                        #                               cluster_eval_b3['recall'], cluster_msg])
                        best_cluster_eval_new = cluster_eval_new
                        best_cluster_eval_b3 = cluster_eval_b3
                rsn.save_model(save_path=save_path,
                               global_step=i + epoch * batch_num)
                print('model and clustering messages saved.')

    print('End: The model is:', save_model_name, trainset_loss_type,
          testset_loss_type, 'p_cond is:', p_cond)
    print("best_cluster_eval_new", best_cluster_eval_new)
    print("best_cluster_eval_b3", best_cluster_eval_b3)
    print(seed)
    return best_cluster_eval_new, best_cluster_eval_b3
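# Example invocation (a minimal sketch; paths are hypothetical placeholders).
# This variant returns the best new-metric and B3 evaluations found at the
# validation-selected checkpoints:
def _example_train_SN_flat():
    eval_new, eval_b3 = train_SN(train_data_file='data/train.json',
                                 val_data_file='data/val.json',
                                 test_data_file='data/test.json',
                                 wordvec_file='data/word_vec.json',
                                 rel2id_file='data/rel2id.json',
                                 similarity_file='data/similarity.json',
                                 trainset_loss_type='triplet',
                                 louvain_weighted=True,
                                 batch_num=10000)
    print('best B3 F1:', eval_b3['F1'])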
def train_SN(train_data_file, val_data_file, test_data_file, wordvec_file,
             load_model_name=None, save_model_name='SN',
             trainset_loss_type='cross', testset_loss_type='none',
             testset_loss_mask_epoch=3, p_cond=0.03, p_denoise=1.0,
             max_len=120, pos_emb_dim=5, same_ratio=0.06, batch_size=100,
             batch_num=100000, epoch_num=1, val_size=10000,
             select_cluster='Louvain', omit_relid=None, labeled_sample_num=None):
    """Train the TensorFlow VASN siamese network and periodically cluster the
    test data with Louvain and/or HAC."""
    # preparing saving files
    if load_model_name is not None:
        load_path = os.path.join('model_file', load_model_name).replace('\\', '/')
    else:
        load_path = None
    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=['train_data_file', 'val_data_file', 'test_data_file',
                            'load_model_name', 'save_model_name',
                            'trainset_loss_type', 'testset_loss_type',
                            'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                            'same_ratio', 'labeled_sample_num'],
                     json_name='train_msg.json')
    msger.record_message([train_data_file, val_data_file, test_data_file,
                          load_model_name, save_model_name, trainset_loss_type,
                          testset_loss_type, testset_loss_mask_epoch, p_cond,
                          p_denoise, same_ratio, labeled_sample_num])
    msger.save_json()

    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file, wordvec_file, max_len=max_len)
    if omit_relid is not None and omit_relid >= 4:
        dataloader_train.select_relation(
            np.arange(2, omit_relid + 1, 1).tolist())
    if labeled_sample_num is not None:
        dataloader_train.select_sample_num(labeled_sample_num)
    dataloader_testset = dataloader(val_data_file, wordvec_file, max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    SN = VASN(session=sess, word_vec_mat=word_vec_mat, max_len=max_len,
              pos_emb_dim=pos_emb_dim, dropout=0.2)
    SN.set_ph(batch_size=batch_size)
    SN.set_train_op(trainset_loss_type=trainset_loss_type,
                    testset_loss_type=testset_loss_type, p_cond=p_cond,
                    p_denoise=p_denoise, p_mult=0.02)
    SN.init_model(load_path)

    print('-----Testing Data Preparing-----')
    # preparing testing samples
    val_testset_left_input, val_testset_right_input, val_testset_data_label = \
        dataloader_testset.next_batch(val_size, same_ratio=same_ratio)
    val_trainset_left_input, val_trainset_right_input, val_trainset_data_label = \
        dataloader_train.next_batch(val_size, same_ratio=same_ratio)

    # initializing parameters
    batch_num_list = [batch_num] * epoch_num
    clustering_test_time = np.arange(19999, batch_num, 20000).tolist()
    msger_cluster = messager(save_path=save_path,
                             types=['method', 'temp_batch_num', 'F1',
                                    'precision', 'recall', 'msg'],
                             json_name='cluster_msg.json')
    for epoch in range(epoch_num):
        if epoch < testset_loss_mask_epoch:
            SN.set_train_op(trainset_loss_type=trainset_loss_type,
                            testset_loss_type='none', p_cond=p_cond,
                            p_denoise=p_denoise, p_mult=0.02)
        else:
            SN.set_train_op(trainset_loss_type=trainset_loss_type,
                            testset_loss_type=testset_loss_type, p_cond=p_cond,
                            p_denoise=p_denoise, p_mult=0.02)
        # preparing message lists
        msger = messager(save_path=save_path,
                         types=['batch_num', 'train_tp', 'train_fp', 'train_fn',
                                'train_tn', 'train_l', 'test_tp', 'test_fp',
                                'test_fn', 'test_tn', 'test_l'],
                         json_name='SNmsg' + str(epoch) + '.json')
        data_to_cluster, gt = dataloader_test._data_()
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(batch_num_list[epoch]):
            # training
            if omit_relid is not None and omit_relid == 0:
                SN.train_unsup(dataloader_train, dataloader_testset,
                               batch_size=batch_size, same_ratio=same_ratio)
            else:
                SN.train(dataloader_train, dataloader_testset,
                         batch_size=batch_size, same_ratio=same_ratio)
            # testing and saving
            if i % 100 == 0:
                print('temp_batch_num: ', i,
                      ' total_batch_num: ', batch_num_list[epoch])
            if i % 1000 == 0:
                print(save_model_name, 'epoch:', epoch)
                print('trainset:')
                val_trainset_info = SN.validation(val_trainset_left_input,
                                                  val_trainset_right_input,
                                                  val_trainset_data_label)
                print('testset:')
                val_testset_info = SN.validation(val_testset_left_input,
                                                 val_testset_right_input,
                                                 val_testset_data_label)
                msger.record_message((i, ) + val_trainset_info + val_testset_info)
                msger.save_json()
                SN.save_model(save_path=save_path, global_step=i)
                print('model and messages saved.')
            if i in clustering_test_time or i == batch_num_list[epoch] - 1:
                if 'Louvain' in select_cluster:
                    print('-----Louvain Clustering-----')
                    cluster_result, cluster_msg = Louvain_no_isolation(
                        dataset=data_to_cluster, edge_measure=SN.pred_X)
                    cluster_eval = ClusterEvaluation(
                        gt, cluster_result).printEvaluation()
                    msger_cluster.record_message(
                        ['Louvain', i, cluster_eval['F1'],
                         cluster_eval['precision'], cluster_eval['recall'],
                         cluster_msg])
                    msger_cluster.save_json()
                    print(cluster_eval)
                    print('clustering messages saved.')
                if 'HAC' in select_cluster:
                    print('-----HAC Clustering-----')
                    cluster_result, cluster_msg = complete_HAC(
                        dataset=data_to_cluster, HAC_dist=SN.pred_X,
                        k=len(list(set(gt))))
                    cluster_eval = ClusterEvaluation(
                        gt, cluster_result).printEvaluation()
                    msger_cluster.record_message(
                        ['HAC', i, cluster_eval['F1'],
                         cluster_eval['precision'], cluster_eval['recall'],
                         cluster_msg])
                    msger_cluster.save_json()
                    print(cluster_eval)
                    print('clustering messages saved.')

    print('End: The model is:', save_model_name, trainset_loss_type,
          testset_loss_type, 'p_cond is:', p_cond)
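# Example invocation (illustrative sketch; paths are hypothetical placeholders).
# This variant trains the TensorFlow VASN siamese model and clusters the test
# data with Louvain and/or HAC at the scheduled steps:
def _example_train_SN_tf():
    train_SN(train_data_file='data/train.json',
             val_data_file='data/val.json',
             test_data_file='data/test.json',
             wordvec_file='data/word_vec.json',
             trainset_loss_type='cross',
             select_cluster=['Louvain', 'HAC'],
             batch_num=100000)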
def load_cluster(train_data_file, test_data_file, wordvec_file,
                 load_model_name=None, all_structure_file=None,
                 trainset_loss_type='triplet', testset_loss_type='none',
                 p_cond=0.03, to_cluster_data_num=100, p_denoise=1.0,
                 rel2id_file=None, similarity_file=None, margin=1.0,
                 save_cluster=False, louvain_weighted=False,
                 same_level_pair_file=None, train_for_cluster_file=None,
                 train_structure_file=None, test_infos_file=None,
                 val_hier=False, golden=False, max_len=120, pos_emb_dim=5,
                 batch_size=64, squared=True, random_init=False, seed=42):
    """Load a trained RSN model and cluster the test set: top-down hierarchy
    expansion when val_hier is set, otherwise flat Louvain clustering with B3
    evaluation."""
    if load_model_name is not None:
        load_path = os.path.join('model_file', load_model_name).replace('\\', '/')
    else:
        load_path = None

    print('-----Data Loading-----')
    # for train
    dataloader_train = dataloader(train_data_file, wordvec_file, rel2id_file,
                                  similarity_file, same_level_pair_file,
                                  max_len=max_len, random_init=random_init,
                                  seed=seed)
    # for clustering never-seen instances
    dataloader_train_for_cluster = dataloader(train_for_cluster_file,
                                              wordvec_file, rel2id_file,
                                              similarity_file,
                                              same_level_pair_file,
                                              max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, rel2id_file,
                                 similarity_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    rsn = RSN(word_vec_mat=word_vec_mat, max_len=max_len,
              pos_emb_dim=pos_emb_dim, dropout=0)
    rsn.set_train_op(batch_size=batch_size, train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type, p_cond=p_cond,
                     p_denoise=p_denoise, p_mult=0.02, squared=squared,
                     margin=margin)
    if load_path:
        rsn.load_model(load_path + "/RSNbest.pt")
    rsn = cudafy(rsn)
    rsn.eval()

    print('-----Louvain Clustering-----')
    if val_hier:
        print('-----Top Down Hierarchy Expansion-----')
        train_hierarchy_structure_info = json.load(open(train_structure_file))
        all_hierarchy_structure_info = json.load(open(all_structure_file))
        train_hierarchy_cluster_list, gt_hierarchy_cluster_list, train_data_num, \
            test_data_num, train_data, train_label, test_data, test_label = \
            prepare_cluster_list(dataloader_train_for_cluster, dataloader_test,
                                 train_hierarchy_structure_info,
                                 all_hierarchy_structure_info,
                                 to_cluster_data_num)
        link_th_list = [0.2]
        if golden:
            link_th_list = [0.3]
            predicted_cluster_dict_list = \
                Top_Down_Louvain_with_test_cluster_done_avg_link_list_golden(
                    gt_hierarchy_cluster_list, train_data_num, test_data_num,
                    train_data, test_data, train_hierarchy_cluster_list,
                    rsn.pred_X, link_th_list)
        else:
            cluster_result, cluster_msg = Louvain_no_isolation(
                dataset=test_data, edge_measure=rsn.pred_X,
                weighted=louvain_weighted)
            predicted_cluster_dict_list = \
                Top_Down_Louvain_with_test_cluster_done_avg_link_list(
                    cluster_result, train_data_num, test_data_num, train_data,
                    test_data, train_hierarchy_cluster_list, rsn.pred_X,
                    link_th_list)
            if save_cluster:
                json.dump(cluster_result, open("cluster_result.json", "w"))
                pickle.dump(predicted_cluster_dict_list,
                            open("predicted_cluster_dict_list.pkl", "wb"))
                pickle.dump(gt_hierarchy_cluster_list,
                            open("gt_hierarchy_cluster_list.pkl", "wb"))
                print("saved results!")
        for predicted_cluster_dict in predicted_cluster_dict_list:
            print("\n\n")
            predicted_cluster_list = predicted_cluster_dict['list']
            print("Isolation threshold", predicted_cluster_dict['iso'])
            print("Average Link threshold", predicted_cluster_dict['link_th'])
            pickle.dump(predicted_cluster_list,
                        open("predicted_cluster_list.pkl", "wb"))
            evaluation = HierarchyClusterEvaluation(gt_hierarchy_cluster_list,
                                                    predicted_cluster_list,
                                                    test_data_num)
            eval_info = evaluation.printEvaluation(print_flag=True)
            HierarchyClusterEvaluationTypes(gt_hierarchy_cluster_list,
                                            predicted_cluster_list,
                                            test_infos_file,
                                            rel2id_file).printEvaluation()
    else:
        test_data, test_data_label = dataloader_test._data_()
        cluster_result, cluster_msg = Louvain_no_isolation(
            dataset=test_data, edge_measure=rsn.pred_X,
            weighted=louvain_weighted)
        cluster_eval_b3 = ClusterEvaluation(
            test_data_label, cluster_result).printEvaluation(print_flag=True,
                                                             extra_info=True)
        ClusterEvaluationB3Types(test_data_label, cluster_result,
                                 test_infos_file, rel2id_file).printEvaluation()
        print("scores x 100:")
        print({k: v * 100 for k, v in cluster_eval_b3.items()})
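# Example invocation (a minimal sketch; the model directory and data paths are
# hypothetical placeholders). With val_hier=True the saved RSN is used for
# top-down hierarchy expansion; otherwise flat Louvain clustering plus B3
# evaluation is reported:
def _example_load_cluster():
    load_cluster(train_data_file='data/train.json',
                 test_data_file='data/test.json',
                 wordvec_file='data/word_vec.json',
                 load_model_name='SN_hierarchy',
                 rel2id_file='data/rel2id.json',
                 similarity_file='data/similarity.json',
                 train_for_cluster_file='data/train_for_cluster.json',
                 train_structure_file='data/train_structure.json',
                 all_structure_file='data/all_structure.json',
                 test_infos_file='data/test_infos.json',
                 val_hier=True)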