def train_vectors(options, sens=None):
    """Train embedding vectors with the model selected by ``options.model``.

    Returns immediately when ``utils.check_rebuild`` reports that the vectors
    at ``options.vectors_path`` must not be rebuilt.

    Args:
        options: Run configuration. This function reads ``vectors_path``,
            ``always_rebuild`` and ``model`` and hands the whole object to the
            selected trainer.
        sens: Forwarded unchanged to ``skipgram.train_vectors``; only used for
            the 'DeepWalk' and 'Node2Vec' models.

    Raises:
        SystemExit: With status 1 when ``options.model`` is not one of the
            supported model names.
    """
    if not utils.check_rebuild(options.vectors_path,
                               descrip='embedding vectors',
                               always_rebuild=options.always_rebuild):
        return

    model = options.model
    if model == 'DeepWalk' or model == 'Node2Vec':
        skipgram.train_vectors(options, sens=sens)
    elif model == 'LINE':
        TF_line.train_vectors(options)
    elif model == 'RWNE':
        TF_rwne.train_vectors(options)
    elif model == 'SDNE':
        TF_sdne.train_vectors(options)
    elif model == 'DNGR':
        TF_dngr.train_vectors(options)
    elif model == 'GraRep':
        grarep.train_vectors(options)
    elif model == 'GCN':
        TF_gcn.train_vectors(options)
    else:
        # The list below must name every branch handled above; the previous
        # message only mentioned four of the eight supported models.
        logger.error("Unknown model for embedding: '%s'. " % model +
                     "Valid models: 'DeepWalk', 'Node2Vec', 'LINE', 'RWNE', "
                     "'SDNE', 'DNGR', 'GraRep', 'GCN'.")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)
def build_walk_corpus_to_files(filebase,
                               walk_times,
                               headflag_of_index_file='',
                               max_num_workers=cpu_count(),
                               always_rebuild=False):
    """Generate the walk corpus on disk, in one process or several.

    Args:
        filebase: Target path; passed to ``utils.check_rebuild`` and to the
            corpus writers.
        walk_times: Number of walk rounds; a value of 1 or less forces the
            single-process writer.
        headflag_of_index_file: Forwarded to the multi-process writer only.
        max_num_workers: Upper bound on worker processes; a value of 1 or
            less forces the single-process writer.
        always_rebuild: Forwarded to ``utils.check_rebuild``.

    Returns:
        ``None`` when ``utils.check_rebuild`` declines the rebuild; a
        one-element list holding the single-process writer's result; or
        whatever the multi-process writer returns.
    """
    should_build = utils.check_rebuild(filebase,
                                       descrip='walk corpus',
                                       always_rebuild=always_rebuild)
    if not should_build:
        return

    single_process = max_num_workers <= 1 or walk_times <= 1
    if not single_process:
        return _construct_walk_corpus_and_write_multiprocess(
            filebase,
            walk_times,
            headflag_of_index_file=headflag_of_index_file,
            max_num_workers=max_num_workers)

    # Workers were requested but there is too little work to share out.
    if max_num_workers > 1:
        logger.warning(
            'Corpus bulid: walk times too small, using single-process instead...'
        )
    logger.info(
        'Corpus bulid: walking to files (without using multi-process)...')
    began = time.time()
    written = _construct_walk_corpus_and_write_singprocess(
        (filebase, walk_times))
    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             began))
    return [written]
def store_walk_corpus(filebase, walk_sens, always_rebuild=False):
    """Write walk sentences to ``filebase``, one sentence per line.

    Every item of a sentence is written as ``str(item)`` followed by a single
    space, so each line ends with a trailing space before the newline.

    Args:
        filebase: Output file path (overwritten).
        walk_sens: Iterable of sentences, each an iterable of items.
        always_rebuild: Forwarded to ``utils.check_rebuild``.
    """
    should_store = utils.check_rebuild(filebase,
                                       descrip='walk corpus',
                                       always_rebuild=always_rebuild)
    if not should_store:
        return

    logger.info('Corpus store: storing...')
    began = time.time()
    with open(filebase, 'w') as corpus_file:
        for sentence in walk_sens:
            corpus_file.writelines(
                u"{} ".format(str(token)) for token in sentence)
            corpus_file.write('\n')
    elapsed = time.time() - began
    logger.info('Corpus store: store completed in {}s'.format(elapsed))
def train_vectors(options, sens=None):
    """Train embedding vectors with the model selected by ``options.model``.

    Returns immediately when ``utils.check_rebuild`` reports that the vectors
    at ``options.vectors_path`` must not be rebuilt.

    Args:
        options: Run configuration. This function reads ``vectors_path``,
            ``always_rebuild`` and ``model`` and hands the whole object to the
            selected trainer.
        sens: Forwarded unchanged to ``skipgram.train_vectors``; only used for
            the 'DeepWalk' model.

    Raises:
        SystemExit: With status 1 when ``options.model`` is not one of
            'DeepWalk', 'LINE', 'PTE' or 'SpaceyWalk'.
    """
    if not utils.check_rebuild(options.vectors_path,
                               descrip='embedding vectors',
                               always_rebuild=options.always_rebuild):
        return

    model = options.model
    if model == 'DeepWalk':
        skipgram.train_vectors(options, sens=sens)
    elif model == 'LINE':
        TF_line.train_vectors(options)
    elif model == 'PTE':
        TF_pte.train_vectors(options)
    elif model == "SpaceyWalk":
        TF_spaceywalk.train_vectors(options)
    else:
        logger.error(
            "Unknown model for embedding: '%s'. " % model +
            "Valid models: 'DeepWalk', 'LINE', 'PTE', 'SpaceyWalk'.")
        # A bare sys.exit() reports success (status 0); signal the error so
        # calling scripts can react to it.
        sys.exit(1)
def train_vectors(options):
    """Train GraRep embeddings and save them to ``options.vectors_path``.

    Steps: build the network from ``options``, log the training setup and
    mirror it into ``embedding.info`` next to the vectors file, train a
    ``GraRep`` model with ``Kstep = 2`` and store the result with
    ``save_word2vec_format``.

    Returns immediately when ``utils.check_rebuild`` reports that the vectors
    must not be rebuilt.

    Args:
        options: Run configuration. This function reads ``vectors_path``,
            ``always_rebuild``, ``model`` and ``embedding_size``, and passes
            the whole object to ``network.construct_network``.
    """
    if not utils.check_rebuild(options.vectors_path,
                               descrip='vectors',
                               always_rebuild=options.always_rebuild):
        return

    train_vec_dir = os.path.split(options.vectors_path)[0]
    # An empty directory part means "current directory": there is nothing to
    # create and os.makedirs('') would raise.
    if train_vec_dir and not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)

    # construct network
    net = network.construct_network(options)
    Kstep = 2

    # train info: the same lines go to the log and to embedding.info
    logger.info('Train info:')
    info_lines = [
        '\t train_model = {}'.format(options.model),
        '\t total embedding nodes = {}'.format(net.get_nodes_size()),
        '\t total edges = {}'.format(net.get_edges_size()),
        '\t embedding size = {}'.format(options.embedding_size),
        '\t Kstep = {}'.format(Kstep),
        '\t vectors_path = {}'.format(options.vectors_path),
    ]
    for line in info_lines:
        logger.info(line)

    # The context manager closes the file even if a write fails.
    with open(os.path.join(train_vec_dir, 'embedding.info'), 'w') as fr_vec:
        fr_vec.write('embedding info:\n')
        for line in info_lines:
            fr_vec.write(line + '\n')

    # train
    logger.info('training...')
    time_start = time.time()
    # Named `model` rather than `grarep` so the local cannot shadow a module
    # of that name.
    model = GraRep(net.get_nodes_size(), net.edges, options.embedding_size,
                   Kstep)
    vecs = model.train()
    save_word2vec_format(options.vectors_path, vecs, net._idx_nodes)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
def eval_once(options):
    """Run the clustering evaluation once and write a report.

    Loads labeled data and the trained vectors, runs
    ``_cluster_thread_body`` ``options.repeated_times`` times (spread over a
    process pool when ``options.eval_workers > 1`` and more than one repeat is
    requested) and writes the mean and per-repeat NMI values to
    ``options.cluster_path``.

    Returns immediately when ``utils.check_rebuild`` reports that the report
    must not be rebuilt.

    Side effects:
        Rebinds the module globals ``features_matrix``, ``labels_matrix`` and
        ``LABEL_SIZE`` (presumably read by ``_cluster_thread_body`` -- confirm
        against its definition).

    Args:
        options: Run configuration. Reads ``cluster_path``, ``always_rebuild``,
            ``multilabel_rule``, ``eval_online``, ``eval_workers``,
            ``repeated_times``, ``label_path``, ``vectors_path`` and
            ``label_size``.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))

    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of every item is kept for clustering.
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    def _run_in_pool(times_per_worker):
        # One task per entry; each task returns a list of NMI values.
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    # cluster
    # The context manager closes the report even when evaluation raises.
    with open(options.cluster_path, 'w') as fr:
        fr.write('eval case: cluster...\n')
        fr.write('\t save_path: {}\n'.format(options.cluster_path))
        fr.write('\t cluster: kmeans\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))

        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1] * options.repeated_times
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                # The first `mod` workers take one extra repeat each.
                times_per_worker = ([div + 1] * mod +
                                    [div] * (options.eval_workers - mod))
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

            # Best-effort: a failed pool run is retried exactly once.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of triggering the retry.
            try:
                nmi_list = _run_in_pool(times_per_worker)
            except Exception:
                logger.warning("concurrent.futures.process failed, retry...")
                nmi_list = _run_in_pool(times_per_worker)
            if len(nmi_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(nmi_list), options.repeated_times))
        else:
            try:
                nmi_list = _cluster_thread_body(options.repeated_times)
            except Exception:
                logger.warning("cluster evaluation failed, retry...")
                nmi_list = _cluster_thread_body(options.repeated_times)

        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat, nmi in enumerate(nmi_list, 1):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat, len(nmi_list), nmi))
        fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                                 time_start))

    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))
    return
def _cluster_wait_for_new_ckpt(options, ckpt_dir, last_step, reading,
                               writing):
    """Block until a checkpoint newer than ``last_step`` can be read.

    Returns the new step number with the ``reading`` marker file left in
    place (the caller removes it after loading the vectors), or ``None`` when
    nothing new is available and the ``RUN_SUCCESS`` file exists next to
    ``options.vectors_path``.
    """
    run_success = os.path.join(
        os.path.split(options.vectors_path)[0], "RUN_SUCCESS")

    def _latest_step():
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        return int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def _unavailable(step):
        return (step <= last_step
                or (not os.path.exists(options.vectors_path))
                or os.path.exists(writing))

    while True:
        if _unavailable(_latest_step()):
            if os.path.exists(run_success):
                return None
            time.sleep(options.eval_interval)
            continue
        # ready for reading
        logger.info("\t declare for reading ...")
        # declare; the handle is closed at once, only the file's existence
        # matters
        with open(reading, "w"):
            pass
        # Wait, then re-check in case a ".writing" marker appeared meanwhile.
        time.sleep(30)
        cur_step = _latest_step()
        if _unavailable(cur_step):
            os.remove(reading)  # undeclare
            logger.info("\t confliction! undeclare and waiting ...")
            time.sleep(options.eval_interval)
            continue
        return cur_step


def _cluster_keep_best_ckpt(ckpt_dir, cluster_dir, cur_step, best_nmi):
    """Record ``cur_step`` as best checkpoint and copy its files if present."""
    ckpt_exists = os.path.exists(
        os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
    info_path = os.path.join(cluster_dir, 'best_ckpt.info')
    # Overwrite when the checkpoint can be copied; otherwise append a note
    # because the previously copied model.ckpt-best files stay in place.
    with open(info_path, 'w' if ckpt_exists else 'a') as fr_best:
        if not ckpt_exists:
            fr_best.write(
                "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                "the current best_ckpt model is loss, but the result is:\n")
        fr_best.write("best_nmi: {}\n".format(best_nmi))
        fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
    if not ckpt_exists:
        return
    for suffix in ('data-00000-of-00001', 'index', 'meta'):
        source_file = os.path.join(
            ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
        target_file = os.path.join(cluster_dir, 'model.ckpt-best.%s' % suffix)
        if os.path.exists(target_file):
            os.remove(target_file)
        shutil.copy(source_file, target_file)


def eval_online(options):
    """Evaluate clustering quality for every new checkpoint while training.

    Polls the ``ckpt`` directory next to ``options.vectors_path``. For each
    new checkpoint step it loads the vectors, runs ``_cluster_thread_body``
    ``options.repeated_times`` times (over a process pool when
    ``options.eval_workers > 1`` and more than one repeat is requested),
    writes a per-step report to ``options.cluster_path + '.<step>'``, appends
    the mean NMI to the summary file ``options.cluster_path`` and to a
    TensorFlow summary, and keeps a copy of the best-scoring checkpoint.

    The loop ends when ``RUN_SUCCESS`` exists next to the vectors file and no
    newer checkpoint is available, or immediately when ``options.eval_online``
    is false. The summary file and summary writer are closed on every exit
    path.

    Side effects:
        Rebinds the module globals ``features_matrix``, ``labels_matrix`` and
        ``LABEL_SIZE`` (presumably read by ``_cluster_thread_body`` -- confirm
        against its definition).

    Args:
        options: Run configuration. Reads ``cluster_path``, ``always_rebuild``,
            ``multilabel_rule``, ``eval_online``, ``eval_interval``,
            ``eval_workers``, ``repeated_times``, ``label_size``,
            ``label_path`` and ``vectors_path``.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    if not utils.check_rebuild(cluster_dir,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_pool = options.eval_workers > 1 and options.repeated_times > 1
    if use_pool:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1] * options.repeated_times
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            # The first `mod` workers take one extra repeat each.
            times_per_worker = ([div + 1] * mod +
                                [div] * (options.eval_workers - mod))
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    def _run_in_pool():
        # One task per entry; each task returns a list of NMI values.
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    with open(options.cluster_path, 'w') as fr_total:
        fr_total.write('eval case: cluster...\n')
        fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
        fr_total.write('\t cluster: kmeans\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(
            options.multilabel_rule))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(NMI):\n=============================================================\n'
        )
        fr_total.write('finish_time\tckpt\tNMI\n')

        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        time_start = time.time()
        id_list, labels_list = utils.get_labeled_data(
            options.label_path, multilabel_rule=options.multilabel_rule)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
        # try/finally: the polling loop normally leaves through `return`,
        # which previously skipped closing the writer and the summary file.
        try:
            summary = tf.Summary()
            summary.value.add(tag='nmi', simple_value=0.)
            summary_writer.add_summary(summary, last_step)
            best_nmi = 0

            ckpt_dir = os.path.join(
                os.path.split(options.vectors_path)[0], 'ckpt')
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            while (not (ckpt and ckpt.model_checkpoint_path)):
                logger.info("\t model and vectors not exist, waiting ...")
                time.sleep(options.eval_interval)
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)

            # Marker files used to coordinate with the process that writes
            # the vectors file.
            reading = options.vectors_path + ".reading_cluster"
            writing = options.vectors_path + ".writing"
            while (options.eval_online):
                cur_step = _cluster_wait_for_new_ckpt(options, ckpt_dir,
                                                      last_step, reading,
                                                      writing)
                if cur_step is None:
                    return

                logger.info("\t eval ckpt-{}.......".format(cur_step))
                time_start = time.time()
                logger.info('\t reading embedding vectors from file {}'.format(
                    options.vectors_path))
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list,
                    labels_list)
                os.remove(reading)  # synchrolock for multi-process
                logger.info("\t done for reading ...")
                # Only the first label of every item is kept for clustering.
                labels_matrix = np.array([item[0] for item in labels_list])
                LABEL_SIZE = options.label_size
                logger.info('\t reading labeled data completed in {}s'.format(
                    time.time() - time_start))
                logger.info('\t total labeled data size: {}'.format(
                    np.size(features_matrix, axis=0)))
                logger.info('\t total labels size: {}'.format(
                    options.label_size))

                # cluster
                with open(options.cluster_path + '.{}'.format(cur_step),
                          'w') as fr:
                    fr.write('eval case: cluster...\n')
                    fr.write('\t cluster: kmeans\n')
                    fr.write('\t multilabel_rule: {}\n'.format(
                        options.multilabel_rule))
                    fr.write('\t eval_workers: {}\n'.format(
                        options.eval_workers))
                    fr.write('\t repeat {} times\n'.format(
                        options.repeated_times))
                    fr.write('\t total labeled data size: {}\n'.format(
                        np.size(features_matrix, axis=0)))
                    fr.write('\t total labels size: {}\n'.format(
                        options.label_size))
                    for i in range(options.label_size):
                        fr.write('\t\t label {}: {}\n'.format(
                            i, np.sum(labels_matrix == i)))

                    if use_pool:
                        # speed up by using multi-process
                        fr.write("\t using {} processes for evaling:\n".format(
                            len(times_per_worker)))
                        for idx, rep_times in enumerate(times_per_worker):
                            fr.write(
                                "\t process-{}: repeat {} times\n".format(
                                    idx, rep_times))
                        # Best-effort: retry a failed pool run exactly once.
                        # `Exception` (not a bare except) lets
                        # KeyboardInterrupt/SystemExit propagate.
                        try:
                            nmi_list = _run_in_pool()
                        except Exception:
                            logger.warning(
                                "concurrent.futures.process failed, retry...")
                            nmi_list = _run_in_pool()
                        if len(nmi_list) != options.repeated_times:
                            logger.warning(
                                "warning: eval unmatched repeated_times: {} != {}"
                                .format(len(nmi_list),
                                        options.repeated_times))
                    else:
                        try:
                            nmi_list = _cluster_thread_body(
                                options.repeated_times)
                        except Exception:
                            logger.warning(
                                "cluster evaluation failed, retry...")
                            nmi_list = _cluster_thread_body(
                                options.repeated_times)

                    fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(
                            time.time())), cur_step))
                    summary = tf.Summary()
                    mean_nmi = sum(nmi_list) / float(len(nmi_list))
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(options.repeated_times, len(nmi_list)))
                    fr.write('\t\t NMI = {}\n'.format(mean_nmi))
                    fr.write('details:\n')
                    for repeat, nmi in enumerate(nmi_list, 1):
                        fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                            repeat, len(nmi_list), nmi))
                    fr.write('\neval case: cluster completed in {}s\n'.format(
                        time.time() - time_start))

                fr_total.write('{}\n'.format(mean_nmi))
                fr_total.flush()
                summary.value.add(tag='nmi', simple_value=mean_nmi)
                summary_writer.add_summary(summary, cur_step)
                summary_writer.flush()
                logger.info(
                    'cluster completed in {}s\n================================='
                    .format(time.time() - time_start))

                # keep a copy of the checkpoint with the best mean NMI so far
                if mean_nmi > best_nmi:
                    best_nmi = mean_nmi
                    _cluster_keep_best_ckpt(ckpt_dir, cluster_dir, cur_step,
                                            best_nmi)
                last_step = cur_step
        finally:
            summary_writer.close()
    return
def _linkpred_wait_for_new_ckpt(options, ckpt_dir, last_step, reading,
                                writing):
    """Block until a checkpoint newer than ``last_step`` can be read.

    Returns the new step number with the ``reading`` marker file left in
    place (the caller removes it after loading the features), or ``None``
    when nothing new is available and the ``RUN_SUCCESS`` file exists next to
    ``options.vectors_path``.
    """
    run_success = os.path.join(
        os.path.split(options.vectors_path)[0], "RUN_SUCCESS")

    def _latest_step():
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        return int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def _unavailable(step):
        return (step <= last_step
                or (not os.path.exists(options.vectors_path))
                or os.path.exists(writing))

    while True:
        if _unavailable(_latest_step()):
            if os.path.exists(run_success):
                return None
            time.sleep(options.eval_interval)
            continue
        # ready for reading
        logger.info("\t declare for reading ...")
        # declare; the handle is closed at once, only the file's existence
        # matters
        with open(reading, "w"):
            pass
        # Wait, then re-check in case a ".writing" marker appeared meanwhile.
        time.sleep(30)
        cur_step = _latest_step()
        if _unavailable(cur_step):
            os.remove(reading)  # undeclare
            logger.info("\t confliction! undeclare and waiting ...")
            time.sleep(options.eval_interval)
            continue
        return cur_step


def _linkpred_keep_best_ckpt(ckpt_dir, out_dir, cur_step, best_line):
    """Record ``cur_step`` as best checkpoint and copy its files if present.

    ``best_line`` is the already formatted score line written to
    ``best_ckpt.info``.
    """
    ckpt_exists = os.path.exists(
        os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
    info_path = os.path.join(out_dir, 'best_ckpt.info')
    # Overwrite when the checkpoint can be copied; otherwise append a note
    # because the previously copied model.ckpt-best files stay in place.
    with open(info_path, 'w' if ckpt_exists else 'a') as fr_best:
        if not ckpt_exists:
            fr_best.write(
                "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                "the current best_ckpt model is loss, but the result is:\n")
        fr_best.write(best_line)
        fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
    if not ckpt_exists:
        return
    for suffix in ('data-00000-of-00001', 'index', 'meta'):
        source_file = os.path.join(
            ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
        target_file = os.path.join(out_dir, 'model.ckpt-best.%s' % suffix)
        if os.path.exists(target_file):
            os.remove(target_file)
        shutil.copy(source_file, target_file)


def eval_online(options, net):
    """Evaluate link prediction for every new checkpoint while training.

    Loads the evaluation edges once, then polls the ``ckpt`` directory next
    to ``options.vectors_path``. For each new checkpoint step it reloads the
    features, runs ``_classify_thread_body`` over every
    (repeat, operator, train_ratio) combination (over a process pool when
    ``options.eval_workers > 1`` and there is more than one combination),
    writes a per-step report to ``options.link_prediction_path + '.<step>'``,
    appends the mean AUC values to the summary file
    ``options.link_prediction_path`` and to a TensorFlow summary, and keeps a
    copy of the checkpoint that scores best for the last train ratio and last
    feature operator.

    The loop ends when ``RUN_SUCCESS`` exists next to the vectors file and no
    newer checkpoint is available, or immediately when ``options.eval_online``
    is false. The summary file and summary writer are closed on every exit
    path.

    Args:
        options: Run configuration. Reads ``link_prediction_path``,
            ``always_rebuild``, ``data_dir``, ``data_name``, ``isdirected``,
            ``eval_edge_type``, ``eval_online``, ``eval_interval``,
            ``eval_workers``, ``repeated_times``, ``feature_operators``,
            ``sample_size``, ``train_ratio`` and ``vectors_path``.
        net: Passed through to ``load_edges`` and ``load_features``.
    """
    global features_dict, true_edges_list_by_repeat, neg_edges_list_by_repeat
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)

    logger.info('eval case: link_prediction ...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t eval_edge_type: {}'.format(options.eval_edge_type))
    logger.info('\t save_dir: {}\n'.format(link_prediction_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeated_times: {}'.format(options.repeated_times))
    logger.info('\t feature_operators: {}'.format(options.feature_operators))
    logger.info('\t sample_size: {}'.format(options.sample_size))

    load_edges(options, net)
    logger.info('\t total true edges size: {}'.format(
        len(true_edges_list_by_repeat[0])))
    logger.info('\t total neg edges size: {}'.format(
        len(neg_edges_list_by_repeat[0])))

    repeated_times = options.repeated_times
    # split ratio: a positive options.train_ratio selects a single ratio,
    # otherwise a fixed sweep is used
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [0.01, 0.05] + [v / 10.0 for v in range(1, 10)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # Every evaluation task: (repeat, operator, train_ratio).
    full_train_ratio_info_list = [(repeat, op, train_ratio)
                                  for repeat in range(repeated_times)
                                  for op in options.feature_operators
                                  for train_ratio in train_ratio_list]

    use_pool = (options.eval_workers > 1
                and len(full_train_ratio_info_list) > 1)
    if use_pool:
        # speed up by using multi-process
        if len(full_train_ratio_info_list) <= options.eval_workers:
            train_ratios_per_worker = [[
                train_ratio_info
            ] for train_ratio_info in full_train_ratio_info_list]
        else:
            div, mod = divmod(len(full_train_ratio_info_list),
                              options.eval_workers)
            train_ratios_per_worker = [
                full_train_ratio_info_list[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            # Hand the remaining `mod` tasks to the first workers.
            for idx, train_ratio_info in enumerate(
                    full_train_ratio_info_list[div * options.eval_workers:]):
                train_ratios_per_worker[idx].append(train_ratio_info)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    def _run_in_pool():
        # Each task returns a list of (train_ratio, op, auc) tuples.
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_classify_thread_body,
                                    train_ratios_per_worker):
                results.extend(ret)
        return results

    with open(options.link_prediction_path, 'w') as fr_total:
        fr_total.write('eval case: link_prediction...\n')
        fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_total.write('\t data_name = {}\n'.format(options.data_name))
        fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
        fr_total.write('\t eval_edge_type: {}\n'.format(
            options.eval_edge_type))
        fr_total.write('\t save_dir: {}\n\n'.format(link_prediction_dir))
        fr_total.write('\t classifier: LogisticRegression\n')
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t feature_operators: {}\n'.format(
            options.feature_operators))
        fr_total.write('\t repeated_times: {}\n'.format(
            options.repeated_times))
        fr_total.write('\t sample_size: {}\n'.format(options.sample_size))
        fr_total.write('\t total true edges size: {}\n'.format(
            len(true_edges_list_by_repeat[0])))
        fr_total.write('\t total neg edges size: {}\n'.format(
            len(neg_edges_list_by_repeat[0])))
        fr_total.write(
            '\t repeat {} times for each train_ratio in {}\n'.format(
                repeated_times, train_ratio_list))
        fr_total.write(
            '\t results(AUC):\n=============================================================\n'
        )
        column_titles = "".join("\t{}({})".format(train_ratio, op)
                                for train_ratio in train_ratio_list
                                for op in options.feature_operators)
        fr_total.write('finish_time\tckpt\t' + column_titles + "\n")

        last_step = 0
        summary_writer = tf.summary.FileWriter(link_prediction_dir,
                                               tf.Graph())
        # try/finally: the polling loop normally leaves through `return`,
        # which previously skipped closing the writer and the summary file.
        try:
            summary = tf.Summary()
            for train_ratio in train_ratio_list:
                for op in options.feature_operators:
                    summary.value.add(tag='auc_{}_{}'.format(train_ratio, op),
                                      simple_value=0.)
            summary_writer.add_summary(summary, last_step)
            best_auc = 0

            ckpt_dir = os.path.join(
                os.path.split(options.vectors_path)[0], 'ckpt')
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            while (not (ckpt and ckpt.model_checkpoint_path)):
                logger.info("\t model and vectors not exist, waiting ...")
                time.sleep(options.eval_interval)
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)

            # Marker files used to coordinate with the process that writes
            # the vectors file.
            reading = options.vectors_path + ".reading_link_prediction_{}_{}".format(
                options.eval_edge_type[0], options.eval_edge_type[1])
            writing = options.vectors_path + ".writing"
            while (options.eval_online):
                cur_step = _linkpred_wait_for_new_ckpt(options, ckpt_dir,
                                                       last_step, reading,
                                                       writing)
                if cur_step is None:
                    return

                logger.info("\t eval ckpt-{}.......".format(cur_step))
                time_start = time.time()
                # loading features_matrix(already trained)
                load_features(options, net)
                os.remove(reading)  # synchrolock for multi-process
                logger.info("\t done for reading ...")
                logger.info('\t eval_workers: {}'.format(
                    options.eval_workers))
                logger.info('\t repeated_times: {}'.format(
                    options.repeated_times))
                logger.info('\t feature_operators: {}'.format(
                    options.feature_operators))
                logger.info('\t sample_size: {}'.format(options.sample_size))
                logger.info(
                    '\t repeat {} times for each train_ratio in {}'.format(
                        repeated_times, train_ratio_list))
                logger.info('\t total true edges size: {}'.format(
                    len(true_edges_list_by_repeat[0])))
                logger.info('\t total neg edges size: {}'.format(
                    len(neg_edges_list_by_repeat[0])))

                with open(
                        options.link_prediction_path +
                        '.{}'.format(cur_step), 'w') as fr:
                    fr.write('eval case: link_prediction ...\n')
                    fr.write('\t data_dir = {}\n'.format(options.data_dir))
                    fr.write('\t data_name = {}\n'.format(options.data_name))
                    fr.write('\t isdirected = {}\n'.format(
                        options.isdirected))
                    fr.write('\t eval_edge_type: {}\n'.format(
                        options.eval_edge_type))
                    fr.write('\t classifier: LogisticRegression\n')
                    fr.write('\t eval_online: {}\n'.format(
                        options.eval_online))
                    fr.write('\t eval_workers: {}\n'.format(
                        options.eval_workers))
                    fr.write('\t feature_operators: {}\n'.format(
                        options.feature_operators))
                    fr.write('\t sample_size: {}\n'.format(
                        options.sample_size))
                    fr.write('\t repeated_times: {}\n'.format(
                        options.repeated_times))
                    fr.write('\t total true edges size: {}\n'.format(
                        len(true_edges_list_by_repeat[0])))
                    fr.write('\t total neg edges size: {}\n'.format(
                        len(neg_edges_list_by_repeat[0])))
                    fr.write(
                        '\t repeat {} times for each train_ratio in {}\n'.
                        format(repeated_times, train_ratio_list))

                    if use_pool:
                        fr.write("\t using {} processes for evaling:\n".format(
                            len(train_ratios_per_worker)))
                        for idx, train_ratios in enumerate(
                                train_ratios_per_worker):
                            fr.write("\t process-{}: {}\n".format(
                                idx, train_ratios))
                        # Best-effort: retry a failed pool run exactly once.
                        # `Exception` (not a bare except) lets
                        # KeyboardInterrupt/SystemExit propagate.
                        try:
                            ret_list = _run_in_pool()
                        except Exception:
                            logger.warning(
                                "concurrent.futures.process failed, retry...")
                            time.sleep(10)
                            ret_list = _run_in_pool()
                    else:
                        ret_list = _classify_thread_body(
                            full_train_ratio_info_list)

                    fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(
                            time.time())), cur_step))
                    summary = tf.Summary()

                    # Group the AUC values by (train_ratio, operator).
                    ret_dict = {}
                    for train_ratio, op, auc in ret_list:
                        ret_dict.setdefault((train_ratio, op), []).append(auc)

                    for train_ratio in train_ratio_list:
                        for op in options.feature_operators:
                            fr.write(
                                '\n' + '-' * 20 + '\n' +
                                'train_ratio = {}, operator = {}\n'.format(
                                    train_ratio, op))
                            auc_list = ret_dict[(train_ratio, op)]
                            if len(auc_list) != repeated_times:
                                logger.warning(
                                    "warning: train_ratio={},operator={},, eval unmatched repeated_times: {} != {}"
                                    .format(train_ratio, op, len(auc_list),
                                            repeated_times))
                            mean_auc = sum(auc_list) / float(len(auc_list))
                            fr.write(
                                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                                .format(repeated_times, len(auc_list)))
                            fr.write('\t\t AUC = {}\n'.format(mean_auc))
                            fr.write('details:\n')
                            for repeat, auc in enumerate(auc_list, 1):
                                fr.write(
                                    '\t repeated {}/{}: AUC = {}\n'.format(
                                        repeat, len(auc_list), auc))
                            fr_total.write('%.4f ' % (mean_auc))
                            summary.value.add(tag='auc_{}_{}'.format(
                                train_ratio, op),
                                              simple_value=mean_auc)
                    fr.write(
                        '\n eval case: link_prediction completed in {}s\n'.
                        format(time.time() - time_start))

                fr_total.write('\n')
                fr_total.flush()
                summary_writer.add_summary(summary, cur_step)
                summary_writer.flush()
                logger.info(
                    'link_prediction completed in {}s\n================================='
                    .format(time.time() - time_start))

                # The checkpoint is ranked by the mean AUC of the last train
                # ratio combined with the last feature operator.
                cur_auc = np.mean(ret_dict[(train_ratio_list[-1],
                                            options.feature_operators[-1])])
                if cur_auc > best_auc:
                    best_auc = cur_auc
                    _linkpred_keep_best_ckpt(
                        ckpt_dir, link_prediction_dir, cur_step,
                        "best_auc(for train_ratio {} and operator {}): {}\n".
                        format(train_ratio_list[-1],
                               options.feature_operators[-1], best_auc))
                last_step = cur_step
        finally:
            summary_writer.close()
def eval_once(options):
    """Evaluate trained embeddings on link prediction once and write a report.

    The function builds the evaluation network and the "except" (training)
    network, loads the embedding vectors for node ids ``0..N-1`` (``N`` being
    the node count of the evaluation network), runs the evaluation and writes
    the mean and per-repeat MAP / precision@K values to
    ``options.link_prediction_path``.

    When ``options.sample_nodes > 0`` the evaluation is repeated
    ``options.repeated_times`` times, spread over up to
    ``options.eval_workers`` processes; otherwise a single evaluation over
    the whole network is performed.

    Args:
        options: parsed run configuration; the attributes read here are
            ``link_prediction_path``, ``eval_data_path``,
            ``except_data_path``, ``data_format``, ``isdirected``,
            ``vectors_path``, ``precK_max_index``, ``similarity_metric``,
            ``eval_online``, ``eval_interval``, ``sample_nodes``,
            ``sample_nodes_rule``, ``repeated_times``, ``eval_workers`` and
            ``always_rebuild``.

    Returns:
        None. Returns early (without evaluating) when
        ``utils.check_rebuild`` reports that nothing has to be rebuilt.
    """
    # Module-level state: set here so that the evaluation helpers
    # (_sample_thread_body / _eval) can read it.
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K

    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return

    # Every line collected here is logged immediately and written to the
    # report file later, so both outputs stay in sync.
    report_lines = [
        'eval case: link-prediction ...',
        '\t save_path: {}'.format(options.link_prediction_path),
        '\t eval_data_path: {}'.format(options.eval_data_path),
        '\t except_data_path: {}'.format(options.except_data_path),
        '\t data_format: {}'.format(options.data_format),
        '\t metrics: MAP and precise@K',
        '\t max_index for precise@K: {}'.format(options.precK_max_index),
        '\t similarity_metric: {}'.format(options.similarity_metric),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_interval: {}s'.format(options.eval_interval),
        '\t sample_nodes: {}'.format(options.sample_nodes),
        '\t sample_nodes_rule: {}'.format(options.sample_nodes_rule),
        '\t repeat {} times'.format(options.repeated_times),
        '\t eval_workers: {}'.format(options.eval_workers),
    ]
    for line in report_lines:
        logger.info(line)

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    eval_size_lines = [
        "eval_net_nodes_size = {}".format(eval_net_nodes_size),
        "eval_net_edges_size = {}".format(eval_net_edges_size),
    ]
    for line in eval_size_lines:
        logger.info(line)
    report_lines.extend(eval_size_lines)

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_size_lines = [
        "except_net_nodes_size = {}".format(net_except.get_nodes_size()),
        "except_net_edges_size = {}".format(net_except.get_edges_size()),
    ]
    for line in except_size_lines:
        logger.info(line)
    report_lines.extend(except_size_lines)

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info('\t reading embedding vectors completed in {}s'.format(
        time.time() - time_start))
    matrix_lines = [
        'total loaded nodes: {}'.format(np.size(features_matrix, axis=0)),
        'the embedding dimension: {}'.format(np.size(features_matrix, axis=1)),
    ]
    for line in matrix_lines:
        logger.info(line)
    report_lines.extend(matrix_lines)

    # The context manager closes the report even if the evaluation raises.
    with open(options.link_prediction_path, 'w') as fr:
        for line in report_lines:
            fr.write(line + '\n')

        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                logger.info("\t allocating repeat_times to workers ...")
                if options.repeated_times <= options.eval_workers:
                    times_per_worker = [1] * options.repeated_times
                else:
                    # Spread the remainder over the first `mod` workers.
                    div, mod = divmod(options.repeated_times,
                                      options.eval_workers)
                    times_per_worker = [
                        div + 1 if idx < mod else div
                        for idx in range(options.eval_workers)
                    ]
                assert sum(times_per_worker) == options.repeated_times, \
                    'workers allocating failed: %d != %d' % (
                        sum(times_per_worker), options.repeated_times)

                logger.info("\t using {} processes for evaling:".format(
                    len(times_per_worker)))
                for idx, rep_times in enumerate(times_per_worker):
                    logger.info("\t process-{}: repeat {} times".format(
                        idx, rep_times))

                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".format(
                            len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))

        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
        fr.write('\t\t MAP = {}\n'.format(mean_MAP))
        for k in range(options.precK_max_index):
            if k < len(mean_precisionK):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, mean_precisionK[k]))
            else:
                # Fewer precision values are available than requested.
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

        fr.write('details:\n')
        for repeat, ret in enumerate(ret_list, 1):
            fr.write('\t repeated {}/{}:\n'.format(repeat, len(ret_list)))
            MAP = ret[0]
            precisionK_list = ret[1]
            fr.write('\t\t MAP = {}\n'.format(MAP))
            for k in range(options.precK_max_index):
                if k < len(precisionK_list):
                    fr.write('\t\t precisionK_{} = {}\n'.format(
                        k + 1, precisionK_list[k]))
                else:
                    fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))

    logger.info('eval case: link_prediction completed in {}s.'.format(
        time.time() - time_start))
    return
def eval_online(options):
    """Evaluate checkpoints on node classification while training runs.

    The function polls the ``ckpt`` directory next to
    ``options.vectors_path``. For every checkpoint step newer than the last
    evaluated one it loads the embedding vectors, runs the classification
    evaluation for every train ratio (``options.repeated_times`` repeats
    each, optionally spread over ``options.eval_workers`` processes), writes
    a per-step report to ``<classify_path>.<step>``, appends a summary row
    to ``options.classify_path``, emits TensorFlow summaries, and copies the
    checkpoint files of the best-scoring step to ``model.ckpt-best.*`` in the
    classify directory.

    Access to the vectors file is coordinated through marker files:
    ``<vectors_path>.reading_classify`` is created while this function reads
    the vectors, and the file is not read while ``<vectors_path>.writing``
    exists.

    Args:
        options: parsed run configuration; the attributes read here are
            ``classify_path``, ``vectors_path``, ``label_path``,
            ``label_size``, ``train_ratio``, ``repeated_times``,
            ``eval_workers``, ``eval_online``, ``eval_interval`` and
            ``always_rebuild``.

    Returns:
        None. Returns early when ``utils.check_rebuild`` says nothing has to
        be rebuilt, and leaves the polling loop once no new checkpoint is
        available and a ``RUN_SUCCESS`` file exists next to the vectors file.
    """
    # Module-level state read by the classification worker function.
    global features_matrix, labels_matrix

    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)

    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # split ratio: a single configured ratio, or 0.9 down to 0.1
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # One entry per (train_ratio, repeat) evaluation job.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    use_workers = options.eval_workers > 1 and len(train_ratio_fulllist) > 1
    if use_workers:
        # speed up by using multi-process
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            # Leftover jobs are appended to the workers starting from the
            # last one.
            leftovers = train_ratio_fulllist[div * options.eval_workers:]
            for idx, train_ratio in enumerate(leftovers):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    success_flag = os.path.join(vectors_dir, "RUN_SUCCESS")
    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"

    fr_total = open(options.classify_path, 'w')
    summary_writer = None
    # try/finally: the polling loop below is left through `return`, so the
    # summary file and the summary writer must be closed here.
    try:
        fr_total.write('eval case: classify...\n')
        fr_total.write('\t save_dir: {}\n'.format(classify_dir))
        fr_total.write('\t classifier: LogisticRegression\n')
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr_total.write('\t total labels size: {}\n'.format(options.label_size))
        fr_total.write(
            '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
        )
        fr_total.write(
            'finish_time\tckpt\t\t0.1\t0.2\t0.3\t0.4\t0.5\t0.6\t0.7\t0.8\t0.9\n')

        time_start = time.time()
        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        id_list_totoal, labels_list_total = utils.get_labeled_data(
            options.label_path)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
        # Register every tag with value 0 at step 0.
        summary = tf.Summary()
        for train_ratio in train_ratio_list:
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=0.)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_micro = 0
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait until a newer checkpoint exists and the vectors file is
            # present and not being written.
            while True:
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                cur_step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                if cur_step <= last_step or (not os.path.exists(
                        options.vectors_path)) or os.path.exists(writing):
                    if os.path.exists(success_flag):
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                # declare; close at once so no file handle is leaked
                open(reading, "w").close()
                time.sleep(30)
                # Re-check after the grace period.
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                cur_step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                if cur_step <= last_step or (not os.path.exists(
                        options.vectors_path)) or os.path.exists(writing):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path),
                    id_list_totoal, labels_list_total)
            finally:
                # synchrolock for multi-process: always drop the reading flag,
                # even when loading fails.
                os.remove(reading)
            logger.info("\t done for reading ...")
            # `classes` is passed by keyword: recent scikit-learn releases
            # reject it as a positional argument.
            mlb = MultiLabelBinarizer(classes=range(options.label_size))
            labels_matrix = mlb.fit_transform(labels_list)
            logger.info('\t reading embedding vectors completed in {}s'.format(
                time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(options.label_size))

            # classify
            with open(options.classify_path + '.{}'.format(cur_step),
                      'w') as fr:
                fr.write('eval case: classify...\n')
                fr.write('\t classifier: LogisticRegression\n')
                fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
                fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
                    repeated_times, train_ratio_list))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix[:, i])))

                if use_workers:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(train_ratios_per_worker)))
                    for idx, train_ratios in enumerate(train_ratios_per_worker):
                        fr.write("\t process-{}: {}\n".format(idx, train_ratios))
                    ret_list = []  # (train_ratio, macro, micro)
                    with ProcessPoolExecutor(
                            max_workers=options.eval_workers) as executor:
                        for ret in executor.map(_classify_thread_body,
                                                train_ratios_per_worker):
                            ret_list.extend(ret)
                else:
                    ret_list = _classify_thread_body(train_ratio_fulllist)

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
                summary = tf.Summary()

                # train_ratio -> ([macro scores], [micro scores])
                ret_dict = {}
                for ret in ret_list:
                    macro_scores, micro_scores = ret_dict.setdefault(
                        ret[0], ([], []))
                    macro_scores.append(ret[1])
                    micro_scores.append(ret[2])

                for train_ratio, macro_micro in sorted(
                        ret_dict.items(), key=lambda item: item[0]):
                    fr.write('\n' + '-' * 20 + '\n' +
                             'train_ratio = {}\n'.format(train_ratio))
                    Macro_F1_list = macro_micro[0]
                    Micro_F1_list = macro_micro[1]
                    if len(Macro_F1_list) != repeated_times:
                        logger.warning(
                            "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                            .format(train_ratio, len(Macro_F1_list),
                                    repeated_times))
                    mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
                    mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(repeated_times, len(Macro_F1_list)))
                    fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
                    fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
                    fr.write('details:\n')
                    for repeat, (macro, micro) in enumerate(
                            zip(Macro_F1_list, Micro_F1_list), 1):
                        fr.write(
                            '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.
                            format(repeat, len(Macro_F1_list), macro, micro))
                    fr_total.write('%.4f, %.4f ' % (mean_Macro_F1, mean_Micro_F1))
                    summary.value.add(tag='macro_train_{}'.format(train_ratio),
                                      simple_value=mean_Macro_F1)
                    summary.value.add(tag='micro_train_{}'.format(train_ratio),
                                      simple_value=mean_Micro_F1)
                fr.write('\neval case: classify completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('\n')
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'classify completed in {}s\n================================='.
                format(time.time() - time_start))

            # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
            # mean_Micro_F1 still holds the value of the largest train ratio
            # because the loop above iterates the ratios in ascending order.
            if mean_Micro_F1 > best_micro:
                best_micro = mean_Micro_F1
                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(classify_dir, 'best_ckpt.info')
                # Overwrite the info when the checkpoint can be copied;
                # otherwise append a note that model.ckpt-best is stale.
                with open(best_info_path,
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write(
                        "best_micro(for ratio 0.9): {}\n".format(best_micro))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckptIsExists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        sourceFile = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        targetFile = os.path.join(
                            classify_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(targetFile):
                            os.remove(targetFile)
                        shutil.copy(sourceFile, targetFile)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
def build_walk_corpus(options):
    """Build per-node walk corpora for a sample of nodes and write them to disk.

    The function constructs the network and the global ``walker`` for the
    configured model, records the walk settings in ``walks.info`` next to
    ``options.corpus_store_path``, selects the start nodes, and for every
    start node dispatches ``_construct_walk_corpus_and_write_singprocess``
    to a process pool, splitting ``options.walk_times`` across
    ``options.walk_workers`` workers.

    Args:
        options: parsed run configuration; the attributes read here are
            ``corpus_store_path``, ``model``, ``data_dir``, ``data_name``,
            ``isdirected``, ``walk_times``, ``walk_length``,
            ``walk_workers``, ``walk_to_memory``, ``not_store_corpus``,
            ``seed``, ``alpha``, ``window_size``, ``sample_size``,
            ``always_rebuild`` and, depending on the model,
            ``using_metapath``, ``metapath_path`` and ``history_position``.

    Returns:
        None. Returns early when ``utils.check_rebuild`` says nothing has to
        be rebuilt. Calls ``sys.exit()`` for an unknown model or an unknown
        ``using_metapath`` value.
    """
    # Module-level walker; presumably read by the worker processes -- confirm.
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return

    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t seed = {}'.format(options.seed))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t window_size = {}'.format(options.window_size))
    logger.info('\t sample_size = {}'.format(options.sample_size))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # Create the output directory BEFORE writing walks.info into it;
    # otherwise the open() below fails on a fresh run.
    corpus_store_dir = os.path.split(options.corpus_store_path)[0]
    if corpus_store_dir and not os.path.exists(corpus_store_dir):
        os.makedirs(corpus_store_dir)

    with open(os.path.join(corpus_store_dir, 'walks.info'), 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_walks.write('\t data_name = {}\n'.format(options.data_name))
        fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
        fr_walks.write('\t random_walker = {}\n'.format(random_walker))
        fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
        fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
        fr_walks.write('\t max walk workers = {}\n'.format(
            options.walk_workers))
        fr_walks.write('\t seed = {}\n'.format(options.seed))
        fr_walks.write('\t alpha = {}\n'.format(options.alpha))
        fr_walks.write('\t window_size = {}\n'.format(options.window_size))
        fr_walks.write('\t sample_size = {}\n'.format(options.sample_size))
        fr_walks.write('\t walk to memory = {}\n'.format(
            str(options.walk_to_memory)))
        if options.walk_to_memory:
            fr_walks.write('\t donot store corpus = {}\n'.format(
                str(options.not_store_corpus)))
            if not options.not_store_corpus:
                fr_walks.write('\t corpus store path = {}\n'.format(
                    options.corpus_store_path))
        else:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))

    if options.model == "DeepWalk":
        # "DeepWalk" is accepted above, so it needs a walker as well;
        # without this branch `walker.nodes_size` below fails.
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    logger.info(
        'Corpus bulid: walking and computing (using %d workers for multi-process)...'
        % options.walk_workers)
    time_start = time.time()

    # Split walk_times across the workers; the first `mod` workers take one
    # extra round.
    if options.walk_times <= options.walk_workers:
        times_per_worker = [1] * options.walk_times
    else:
        div, mod = divmod(options.walk_times, options.walk_workers)
        times_per_worker = [
            div + 1 if idx < mod else div
            for idx in range(options.walk_workers)
        ]
    assert sum(times_per_worker) == options.walk_times, \
        'workers allocating failed: %d != %d' % (sum(times_per_worker),
                                                 options.walk_times)

    # Seeded shuffle, then keep the first sample_size nodes as start nodes.
    nodes_total = list(range(walker.nodes_size))
    sp_random = random.Random(options.seed)
    sp_random.shuffle(nodes_total)
    nodes_total = nodes_total[0:options.sample_size]
    # NOTE(review): four hard-coded node ids are always put in front of the
    # sample (resulting order: 8798, 8354, 9891, 8407, ...). They look
    # dataset-specific -- confirm they are valid ids for the network in use.
    for forced_node in (8407, 9891, 8354, 8798):
        nodes_total.insert(0, forced_node)

    for node in nodes_total:
        # Each worker gets the inclusive range [begin + 1, begin + cnt] of
        # walk rounds for this node.
        args_list = []
        begin = 0
        for cnt in times_per_worker:
            args_list.append((corpus_store_dir, node, begin + 1, begin + cnt,
                              options.window_size))
            begin += cnt
        with ProcessPoolExecutor(max_workers=options.walk_workers) as executor:
            # NOTE(review): the result iterator is never consumed, so an
            # exception raised inside a worker is not re-raised here.
            executor.map(_construct_walk_corpus_and_write_singprocess,
                         args_list)

    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             time_start))
    del walker
    gc.collect()
    return
def build_walk_corpus(options):
    """Build the random-walk corpus for the configured model.

    The function constructs the network and the global ``walker``, records
    the walk settings in ``walks.info`` next to
    ``options.corpus_store_path`` and then either walks into memory
    (optionally storing the corpus to disk) or walks straight to files.

    Args:
        options: parsed run configuration; the attributes read here are
            ``corpus_store_path``, ``model``, ``data_dir``, ``data_name``,
            ``isdirected``, ``walk_times``, ``walk_length``,
            ``walk_workers``, ``walk_to_memory``, ``not_store_corpus``,
            ``alpha``, ``always_rebuild`` and, depending on the chosen path,
            ``using_metapath``, ``metapath_path``, ``history_position``,
            ``headflag_of_index_file``, ``task`` and ``load_from_memory``.

    Returns:
        The walk corpus, or None. None is returned when
        ``utils.check_rebuild`` says nothing has to be rebuilt, and when
        walking to files without "train" in ``options.task``. Calls
        ``sys.exit()`` for an unknown model or ``using_metapath`` value.
    """
    # Module-level walker; presumably read by the walk helpers -- confirm.
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return

    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t alpha = {}'.format(options.alpha))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # The context manager closes walks.info even if a write fails.
    walks_info_path = os.path.join(
        os.path.split(options.corpus_store_path)[0], 'walks.info')
    with open(walks_info_path, 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_walks.write('\t data_name = {}\n'.format(options.data_name))
        fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
        fr_walks.write('\t random_walker = {}\n'.format(random_walker))
        fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
        fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
        fr_walks.write('\t max walk workers = {}\n'.format(
            options.walk_workers))
        fr_walks.write('\t walk to memory = {}\n'.format(
            str(options.walk_to_memory)))
        if options.walk_to_memory:
            fr_walks.write('\t donot store corpus = {}\n'.format(
                str(options.not_store_corpus)))
            if not options.not_store_corpus:
                fr_walks.write('\t corpus store path = {}\n'.format(
                    options.corpus_store_path))
        else:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))

    if options.model == "DeepWalk":
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        # The corpus is only loaded back when a training task follows.
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)

    # Drop the global walker once walking is finished.
    del walker
    gc.collect()
    return walk_corpus
def eval_once(options):
    """Run the visualization evaluation once and write a text report.

    The function loads the labeled node ids, fetches their embedding
    vectors, plots them through ``plot_embedding_in_2D`` and writes the
    settings, per-label counts, figure path and the value returned by the
    plot helper (reported as ``clustering_center_distance_sim``) to
    ``options.visualization_path``.

    Args:
        options: parsed run configuration; the attributes read here are
            ``visualization_path``, ``data_dir``, ``data_name``,
            ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online``, ``vectors_path`` and ``always_rebuild``.

    Returns:
        None. Returns early when ``utils.check_rebuild`` says nothing has to
        be rebuilt.
    """
    if not utils.check_rebuild(options.visualization_path,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return

    # Settings shared by the log and the report file. The save_path entry
    # carries its own trailing newline, which yields a blank line in the file.
    setting_lines = [
        'eval case: visualization...',
        '\t data_dir = {}'.format(options.data_dir),
        '\t data_name = {}'.format(options.data_name),
        '\t isdirected = {}'.format(options.isdirected),
        '\t label_path = {}'.format(options.label_path),
        '\t label_size = {}'.format(options.label_size),
        '\t eval_node_type: {}'.format(options.eval_node_type),
        '\t save_path: {}\n'.format(options.visualization_path),
        '\t method: t-SNE',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t marker_size: {}'.format(options.marker_size),
        '\t eval_online: {}'.format(options.eval_online),
    ]
    for line in setting_lines:
        logger.info(line)

    # get embedding vectors and markersize
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path,
        type=options.eval_node_type,
        multilabel_rule=options.multilabel_rule,
        type_filepath=os.path.join(options.data_dir,
                                   options.data_name + ".nodes"))
    id_list, features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of every node is used for plotting.
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(
        time.time() - time_start))

    stat_lines = [
        '\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)),
        '\t the labels data embedding_dimension: {}'.format(
            np.size(features_matrix, axis=1)),
        '\t total labels size: {}'.format(options.label_size),
    ]
    stat_lines.extend(
        '\t\t label {}: {}'.format(i, np.sum(labels_matrix == i))
        for i in range(options.label_size))
    for line in stat_lines:
        logger.info(line)

    # The context manager closes the report even if plotting raises.
    with open(options.visualization_path, 'w') as fr:
        for line in setting_lines + stat_lines:
            fr.write(line + '\n')

        figure_name = "visualization_" + str(np.size(features_matrix, axis=1))
        figure_path = os.path.join(
            os.path.split(options.visualization_path)[0], figure_name)
        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)

        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
        fr.write(
            '\neval case: visualization completed in {}s\n ======================'
            .format(time.time() - time_start))

    logger.info(
        'eval case: visualization completed in {}s\n ======================'.
        format(time.time() - time_start))
def eval_online(options):
    """Evaluate the visualization quality of every new checkpoint while training runs.

    The function polls ``<dir of options.vectors_path>/ckpt`` every
    ``options.eval_interval`` seconds.  Whenever a checkpoint with a larger
    step than the last evaluated one is found (and the vectors file exists and
    no ``<vectors_path>.writing`` marker is present) it

    * creates a ``<vectors_path>.reading_visualization_<eval_node_type>``
      marker file, waits 30 seconds and re-checks the conditions,
    * loads the vectors, removes the marker, plots them with
      ``plot_embedding_in_2D`` and writes a per-step report to
      ``<options.visualization_path>.<step>``,
    * appends the score (CCD) to ``options.visualization_path`` and to a
      TensorFlow summary in the report directory,
    * when the score beats the best one so far, records it in
      ``best_ckpt.info`` and copies the checkpoint files to
      ``model.ckpt-best.*`` in the report directory.

    The loop ends when ``options.eval_online`` is falsy or when no new
    checkpoint is available and a ``RUN_SUCCESS`` file exists next to the
    vectors file.

    Args:
        options: parsed run options (paths, label settings, ``eval_interval``,
            ``eval_online``, ``always_rebuild`` ...).

    Returns:
        None.
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir, descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)

    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_dir: {}\n'.format(visual_dir))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t reading labeled data from file {}'.format(options.label_path))

    # get embedding vectors and markersize
    time_start = time.time()
    id_list_totoal, labels_list_totoal = utils.get_labeled_data(
        options.label_path,
        type=options.eval_node_type,
        multilabel_rule=options.multilabel_rule,
        type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(len(id_list_totoal)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"

    def _current_step():
        # The step is the number after the last '-' of the checkpoint file name.
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(os.path.basename(state.model_checkpoint_path).split('-')[-1])

    fr_total = open(options.visualization_path, 'w')
    summary_writer = None
    # try/finally so that the summary file and the event writer are closed on
    # every exit path, including the early return on RUN_SUCCESS.
    try:
        fr_total.write('eval case: visualization...\n')
        fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_total.write('\t data_name = {}\n'.format(options.data_name))
        fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
        fr_total.write('\t label_path = {}\n'.format(options.label_path))
        fr_total.write('\t label_size = {}\n'.format(options.label_size))
        fr_total.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr_total.write('\t save_dir: {}\n\n'.format(visual_dir))
        fr_total.write('\t method: t-SNE\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr_total.write('\t marker_size: {}\n'.format(options.marker_size))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr_total.write('\t total labeled data size: {}\n'.format(len(id_list_totoal)))
        fr_total.write('\t total labels size: {}\n'.format(options.label_size))
        fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                       '=============================================================\n')
        fr_total.write('finish_time\tckpt\tCCD\n')

        last_step = 0
        summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='CCD', simple_value=0.)
        summary_writer.add_summary(summary, last_step)
        best_CCD = 0

        # Wait until training has produced a first checkpoint.
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("model and vectors not exist, waiting...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait for a new checkpoint and claim the vectors file for reading.
            while True:
                cur_step = _current_step()
                if cur_step <= last_step or (not os.path.exists(options.vectors_path)) \
                        or os.path.exists(writing):
                    if os.path.exists(os.path.join(vectors_dir, "RUN_SUCCESS")):
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                with open(reading, "w"):  # declare (create the marker, keep no handle open)
                    pass
                time.sleep(30)
                cur_step = _current_step()
                if cur_step <= last_step or (not os.path.exists(options.vectors_path)) \
                        or os.path.exists(writing):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
            try:
                id_list, features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path),
                    id_list_totoal, labels_list_totoal)
            finally:
                # synchrolock for multi-process; always released so a failed
                # read does not leave a stale marker behind.
                os.remove(reading)
            logger.info("\t done for reading ...")
            labels_matrix = np.array([item[0] for item in labels_list])
            data_size = np.size(features_matrix, axis=0)
            label_counts = [np.sum(labels_matrix == i) for i in range(options.label_size)]
            logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(data_size))
            logger.info('\t total labels size: {}'.format(options.label_size))
            for i, count in enumerate(label_counts):
                logger.info('\t\t label {}: {}'.format(i, count))

            # visualization
            figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) \
                + '.{}'.format(cur_step)
            figure_path = os.path.join(visual_dir, figure_name)
            with open(options.visualization_path + '.{}'.format(cur_step), 'w') as fr:
                fr.write('eval case: visualization...\n')
                fr.write('\t data_dir = {}\n'.format(options.data_dir))
                fr.write('\t data_name = {}\n'.format(options.data_name))
                fr.write('\t isdirected = {}\n'.format(options.isdirected))
                fr.write('\t label_path = {}\n'.format(options.label_path))
                fr.write('\t label_size = {}\n'.format(options.label_size))
                fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
                fr.write('\t method: t-SNE\n')
                fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
                fr.write('\t marker_size: {}\n'.format(options.marker_size))
                fr.write('\t eval_online: {}\n'.format(options.eval_online))
                fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
                fr.write('\t total labeled data size: {}\n'.format(data_size))
                fr.write('\t total labels size: {}\n'.format(options.label_size))
                for i, count in enumerate(label_counts):
                    fr.write('\t\t label {}: {}\n'.format(i, count))

                fr_total.write('%s ckpt-%-9d: ' % (
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
                summary = tf.Summary()

                CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                           features_matrix=features_matrix,
                                           labels_matrix=labels_matrix,
                                           label_size=options.label_size,
                                           figure_path=figure_path)

                fr.write('\n figure_path: {}\n'.format(figure_path))
                fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
                fr.write('\neval case: visualization completed in {}s\n ======================'.format(
                    time.time() - time_start))

            fr_total.write('%.4f\n' % CCD)
            fr_total.flush()
            summary.value.add(tag='CCD', simple_value=CCD)
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info('visualization completed in {}s\n================================='.format(
                time.time() - time_start))

            # Keep a copy of the checkpoint with the best CCD seen so far.
            if CCD > best_CCD:
                best_CCD = CCD
                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                # The info file is overwritten when the checkpoint can be
                # copied, otherwise a note is appended to the previous content.
                with open(os.path.join(visual_dir, 'best_ckpt.info'),
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                                      "the current best_ckpt model is loss, but the result is:\n")
                    fr_best.write("best_CCD: {}\n".format(best_CCD))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
                if ckptIsExists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
                        targetFile = os.path.join(visual_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(targetFile):
                            os.remove(targetFile)
                        shutil.copy(sourceFile, targetFile)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
    return
def build_walk_corpus(options, net=None):
    """Generate the random-walk corpus for DeepWalk / Node2Vec.

    The walk settings are logged and also written to ``walks.info`` in the
    directory of ``options.corpus_store_path``.  A module-level ``walker`` is
    created for the duration of the walk and deleted afterwards.

    Args:
        options: parsed run options.  ``options.model`` must be ``"DeepWalk"``
            (walker ``"uniform"``) or ``"Node2Vec"`` (walker ``"bias"``);
            any other value logs an error and calls ``sys.exit()``.
        net: an already constructed network; when ``None`` it is built with
            ``network.construct_network(options)``.

    Returns:
        The walk corpus: the in-memory sentences when
        ``options.walk_to_memory`` is set; otherwise, when ``"train"`` is in
        ``options.task``, either the loaded walks or a ``WalksCorpus`` over the
        written files.  ``None`` when nothing has to be rebuilt or when walking
        to files without a train task.
    """
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path, descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return

    if options.model == "DeepWalk":
        random_walker = "uniform"
    elif options.model == "Node2Vec":
        random_walker = "bias"
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." % options.model)
        sys.exit()

    if net is None:
        net = network.construct_network(options)

    # The same settings are reported to the logger and to walks.info, so the
    # lines are built once.
    info_lines = [
        '\t random_walker = {}'.format(random_walker),
        '\t walk times = {}'.format(options.walk_times),
        '\t walk length = {}'.format(options.walk_length),
    ]
    if random_walker == "uniform":
        info_lines.append('\t walk restart = {}'.format(options.walk_restart))
    elif random_walker == "bias":
        info_lines.append('\t return_parameter (p) = {}'.format(options.p))
        info_lines.append('\t in-out_parameter (q) = {}'.format(options.q))
    info_lines.append('\t max walk workers = {}'.format(options.walk_workers))
    info_lines.append('\t walk to memory = {}'.format(str(options.walk_to_memory)))
    if options.walk_to_memory:
        info_lines.append('\t donot store corpus = {}'.format(str(options.not_store_corpus)))
        if not options.not_store_corpus:
            info_lines.append('\t corpus store path = {}'.format(options.corpus_store_path))
    else:
        info_lines.append('\t corpus store path = {}'.format(options.corpus_store_path))

    logger.info('Corpus bulid: walk info:')
    for line in info_lines:
        logger.info(line)

    walks_info_path = os.path.join(os.path.split(options.corpus_store_path)[0], 'walks.info')
    with open(walks_info_path, 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        for line in info_lines:
            fr_walks.write(line + '\n')

    walker = Walker(net, random_walker=random_walker, walk_length=options.walk_length,
                    p=options.p, q=options.q)
    if random_walker == "bias":
        # walker.preprocess_transition_probs(options.walk_workers)
        walker.preprocess_transition_probs(net_info_path=options.net_info_path)

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(options.walk_times,
                                                  max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path, walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)

    # Drop the module-level walker so its memory can be reclaimed.
    del walker
    gc.collect()
    return walk_corpus
def eval_online(options):
    """Evaluate link prediction (MAP and precision@K) for every new checkpoint.

    The eval network (``options.eval_data_path``) and the except/train network
    (``options.except_data_path``) are built once and stored, together with the
    sampling settings, in module-level globals that the worker functions
    ``_sample_thread_body`` / ``_eval`` are expected to read.

    The function then polls ``<dir of options.vectors_path>/ckpt`` every
    ``options.eval_interval`` seconds.  For every new checkpoint step (vectors
    file present, no ``<vectors_path>.writing`` marker) it

    * creates ``<vectors_path>.reading_link_prediction``, waits 30 seconds and
      re-checks the conditions,
    * loads the vectors into the global ``features_matrix`` and removes the
      marker,
    * runs the evaluation (optionally repeated and spread over
      ``options.eval_workers`` processes) and writes a per-step report to
      ``<options.link_prediction_path>.<step>``,
    * appends the mean MAP and the precision at 1, 10, 20, 50, 100, ... (up to
      ``options.precK_max_index``) to ``options.link_prediction_path`` and to
      a TensorFlow summary,
    * when the mean MAP beats the best one so far, records it in
      ``best_ckpt.info`` and copies the checkpoint files to
      ``model.ckpt-best.*`` in the report directory.

    The loop ends when ``options.eval_online`` is falsy or when no new
    checkpoint is available and a ``RUN_SUCCESS`` file exists next to the
    vectors file.

    Args:
        options: parsed run options.

    Returns:
        None.
    """
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K

    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir, descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)

    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # K values reported in the summary: 1, 10, 20, 50, 100, 200, 500, ...
    # limited by precK_max_index.
    metric_prec_k_list = [1]
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10

    use_multiprocess = (options.sample_nodes > 0 and options.eval_workers > 1
                        and options.repeated_times > 1)
    if use_multiprocess:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(times_per_worker) == options.repeated_times, \
            'workers allocating failed: %d != %d' % (sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(idx, rep_times))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"

    def _current_step():
        # The step is the number after the last '-' of the checkpoint file name.
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(os.path.basename(state.model_checkpoint_path).split('-')[-1])

    fr_total = open(options.link_prediction_path, 'w')
    summary_writer = None
    # try/finally so that the summary file and the event writer are closed on
    # every exit path, including the early return on RUN_SUCCESS.
    try:
        fr_total.write('eval case: link-prediction ...\n')
        fr_total.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr_total.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr_total.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr_total.write('\t data_format: {}\n'.format(options.data_format))
        fr_total.write('\t metrics: MAP and precise@K\n')
        fr_total.write('\t max_index for precise@K: {}\n'.format(options.precK_max_index))
        fr_total.write('\t similarity_metric: {}\n'.format(options.similarity_metric))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr_total.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr_total.write('\t sample_nodes_rule: {}\n'.format(options.sample_nodes_rule))
        fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr_total.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr_total.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr_total.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr_total.write(
            '\t results:\n=============================================================\n')
        fr_total.write('finish_time\tckpt\tMAP\t')
        for v in metric_prec_k_list:
            fr_total.write('\tPr@{}'.format(v))
        fr_total.write("\n")

        last_step = 0
        summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='MAP', simple_value=0.)
        for v in metric_prec_k_list:
            summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
        summary_writer.add_summary(summary, last_step)
        best_MAP = 0

        # Wait until training has produced a first checkpoint.
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait for a new checkpoint and claim the vectors file for reading.
            while True:
                cur_step = _current_step()
                if cur_step <= last_step or (not os.path.exists(options.vectors_path)) \
                        or os.path.exists(writing):
                    if os.path.exists(os.path.join(vectors_dir, "RUN_SUCCESS")):
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                with open(reading, "w"):  # declare (create the marker, keep no handle open)
                    pass
                time.sleep(30)
                cur_step = _current_step()
                if cur_step <= last_step or (not os.path.exists(options.vectors_path)) \
                        or os.path.exists(writing):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            # loading features_matrix(already trained)
            logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
            time_start = time.time()
            try:
                features_matrix = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list)
            finally:
                # Always release the marker so a failed read does not leave a
                # stale one behind.
                os.remove(reading)
            logger.info("\t done for reading ...")
            logger.info('\t reading embedding vectors completed in {}s'.format(
                time.time() - time_start))
            logger.info('total loaded nodes: {}'.format(np.size(features_matrix, axis=0)))
            logger.info('the embedding dimension: {}'.format(np.size(features_matrix, axis=1)))

            with open(options.link_prediction_path + '.{}'.format(cur_step), 'w') as fr:
                fr.write('eval case: link-prediction ...\n')
                fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
                fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
                fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
                fr.write('\t data_format: {}\n'.format(options.data_format))
                fr.write('\t metrics: MAP and precise@K\n')
                fr.write('\t max_index for precise@K: {}\n'.format(options.precK_max_index))
                fr.write('\t similarity_metric: {}\n'.format(options.similarity_metric))
                fr.write('\t eval_online: {}\n'.format(options.eval_online))
                fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
                fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
                fr.write('\t sample_nodes_rule: {}\n'.format(options.sample_nodes_rule))
                fr.write('\t repeat {} times\n'.format(options.repeated_times))
                fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
                fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
                fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
                fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
                fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
                fr.write('total loaded nodes: {}\n'.format(np.size(features_matrix, axis=0)))
                fr.write('the embedding dimension: {}\n'.format(np.size(features_matrix, axis=1)))

                if options.sample_nodes > 0:
                    if use_multiprocess:
                        # speed up by using multi-process
                        ret_list = []  # [[MAP, precisionK_list], ... ]
                        with ProcessPoolExecutor(max_workers=options.eval_workers) as executor:
                            for ret in executor.map(_sample_thread_body, times_per_worker):
                                ret_list.extend(ret)
                        if len(ret_list) != options.repeated_times:
                            logger.warning(
                                "warning: eval unmatched repeated_times: {} != {}".format(
                                    len(ret_list), options.repeated_times))
                    else:
                        ret_list = _sample_thread_body(options.repeated_times)
                else:
                    # no sampling, no repeat!
                    ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

                fr_total.write('%s ckpt-%-9d: ' % (
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
                summary = tf.Summary()

                if options.sample_nodes > 0:
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(options.repeated_times, len(ret_list)))
                else:
                    fr.write(
                        'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                        .format(options.sample_nodes, len(ret_list)))

                mean_MAP = np.mean([ret[0] for ret in ret_list])
                mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
                fr.write('\t\t MAP = {}\n'.format(mean_MAP))
                for k in range(options.precK_max_index):
                    if k < len(mean_precisionK):
                        fr.write('\t\t precisionK_{} = {}\n'.format(k + 1, mean_precisionK[k]))
                    else:
                        fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
                fr.write('details:\n')
                for repeat, ret in enumerate(ret_list):
                    fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
                    MAP = ret[0]
                    precisionK_list = ret[1]
                    fr.write('\t\t MAP = {}\n'.format(MAP))
                    for k in range(options.precK_max_index):
                        if k < len(precisionK_list):
                            fr.write('\t\t precisionK_{} = {}\n'.format(k + 1, precisionK_list[k]))
                        else:
                            fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
                fr.write('\neval case: link_prediction completed in {}s.'.format(
                    time.time() - time_start))

            fr_total.write('%.4f' % mean_MAP)
            summary.value.add(tag='MAP', simple_value=mean_MAP)
            for v in metric_prec_k_list:
                fr_total.write('\t%.4f' % mean_precisionK[v - 1])
                summary.value.add(tag='Pr_{}'.format(v), simple_value=mean_precisionK[v - 1])
            fr_total.write("\n")
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'eval case: ret_list completed in {}s.\n================================='
                .format(time.time() - time_start))

            # Keep a copy of the checkpoint with the best mean MAP seen so far.
            if mean_MAP > best_MAP:
                best_MAP = mean_MAP
                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                # The info file is overwritten when the checkpoint can be
                # copied, otherwise a note is appended to the previous content.
                with open(os.path.join(link_prediction_dir, 'best_ckpt.info'),
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n")
                    fr_best.write("best_MAP: {}\n".format(best_MAP))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
                if ckptIsExists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
                        targetFile = os.path.join(link_prediction_dir,
                                                  'model.ckpt-best.%s' % suffix)
                        if os.path.exists(targetFile):
                            os.remove(targetFile)
                        shutil.copy(sourceFile, targetFile)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
def eval_once(options):
    """Run the node-classification evaluation once and write a text report.

    Labeled nodes are read from ``options.label_path`` and their vectors from
    ``options.vectors_path``; both are stored in the module-level globals
    ``features_matrix`` and ``labels_matrix`` (the latter binarized over
    ``range(options.label_size)``), which ``_classify_thread_body`` is
    expected to read.  Every train ratio is evaluated
    ``options.repeated_times`` times, optionally spread over
    ``options.eval_workers`` processes, and the mean and per-repeat
    Macro/Micro F1 values are written to ``options.classify_path``.

    Nothing is done when ``utils.check_rebuild`` reports that the report does
    not need to be rebuilt.

    Args:
        options: parsed run options.  When ``options.train_ratio`` is > 0 only
            that ratio is evaluated, otherwise 0.9, 0.8, ..., 0.1.

    Returns:
        None.
    """
    global features_matrix, labels_matrix

    if not utils.check_rebuild(options.classify_path, descrip='classify',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t reading labeled data from file {}'.format(options.label_path))

    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # `classes` is passed by keyword: it is keyword-only in current
    # scikit-learn releases and accepted by keyword in older ones.
    mlb = MultiLabelBinarizer(classes=range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # repeated 10times
    repeated_times = options.repeated_times

    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # One entry per single classification run.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    # classify; the context manager closes the report even when a worker fails.
    with open(options.classify_path, 'w') as fr:
        fr.write('eval case: classify...\n')
        fr.write('\t save_path: {}\n'.format(options.classify_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix[:, i])))

        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            # speed up by using multi-process
            if len(train_ratio_fulllist) <= options.eval_workers:
                train_ratios_per_worker = [[train_ratio] for train_ratio in train_ratio_fulllist]
            else:
                div = len(train_ratio_fulllist) // options.eval_workers
                train_ratios_per_worker = [
                    train_ratio_fulllist[div * i:div * (i + 1)]
                    for i in range(options.eval_workers)
                ]
                # The leftover runs are handed out one each, starting from the
                # last worker.
                for idx, train_ratio in enumerate(
                        train_ratio_fulllist[div * options.eval_workers:]):
                    train_ratios_per_worker[len(train_ratios_per_worker) - 1 - idx].append(
                        train_ratio)
            logger.info("\t using {} processes for evaling:".format(len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body, train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)

        # Group the results by train ratio: ratio -> [macro_list, micro_list].
        ret_dict = {}
        for ret in ret_list:
            if ret[0] in ret_dict:
                ret_dict[ret[0]][0].append(ret[1])
                ret_dict[ret[0]][1].append(ret[2])
            else:
                ret_dict[ret[0]] = [[ret[1]], [ret[2]]]

        for train_ratio, macro_micro in sorted(ret_dict.items(), key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' + 'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list = macro_micro[0]
            Micro_F1_list = macro_micro[1]
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat, (macro_f1, micro_f1) in enumerate(zip(Macro_F1_list, Micro_F1_list)):
                fr.write('\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                    repeat + 1, len(Macro_F1_list), macro_f1, micro_f1))
        fr.write('\neval case: classify completed in {}s'.format(time.time() - time_start))

    logger.info('eval case: classify completed in {}s'.format(time.time() - time_start))
def generate_train_data(corpus_store_path, headflag_of_index_file, train_workers,
                        window_size, idx_vocab_freq_file, sens=None,
                        always_rebuild=False):
    """Build the training examples from walk files or in-memory sentences.

    Args:
        corpus_store_path: path of the corpus.  When its first line equals
            ``headflag_of_index_file`` the file is treated as an index whose
            ``FILE:`` lines name the real corpus files; otherwise the file
            itself is the corpus.  Only read when ``sens`` is ``None``.
        headflag_of_index_file: head line that marks an index file.
        train_workers: maximum number of processes used to scan the files.
        window_size: window size forwarded to the example generators.
        idx_vocab_freq_file: file holding one ``idx node freq`` triple per
            line (space separated).  It is rebuilt through ``scan_vocabs``
            when ``utils.check_rebuild`` says so, otherwise it is loaded.
        sens: in-memory sentences; when given, no corpus file is read.
        always_rebuild: forwarded to ``utils.check_rebuild``.

    Returns:
        ``(data, labels, idx2vocab, nodes_frequencies)`` where ``data`` and
        ``labels`` are the concatenated arrays returned by the example
        generators.
    """
    corpusfiles_list = []
    if sens is None:
        # check index file
        with open(corpus_store_path, 'r') as f:
            headline = f.readline().strip()
            if headline == headflag_of_index_file:
                logger.info('generate training examples from corpus files: ')
                for line in f:
                    line = line.strip()
                    if line[0:5] == 'FILE:':
                        corpus_file = line[6:]
                        if os.path.exists(corpus_file):
                            logger.info('corpus file: {}'.format(corpus_file))
                            corpusfiles_list.append(corpus_file)
                        else:
                            logger.warning(
                                'cannot find corpus file: {}, skiped.'.format(corpus_file))
            else:
                corpusfiles_list.append(corpus_store_path)
                logger.info(
                    'generate training examples from file: {}...'.format(corpusfiles_list))
    else:
        logger.info('generate training examples from memory sentences...')

    # generate train data
    if utils.check_rebuild(idx_vocab_freq_file, descrip='vocab and frequencies',
                           always_rebuild=always_rebuild):
        logger.info('get vocabs ...')
        time_start = time.time()
        if sens is None:
            vocabs = scan_files_using_multiprocess(corpusfiles_list,
                                                   max_num_workers=train_workers,
                                                   func=get_vocabs_from_files)
        else:
            vocabs = [get_vocabs_from_sentences(sens)]
        logger.info('get vocabs completed in {}s'.format(time.time() - time_start))
        logger.info('get frequencies ...')
        time_start = time.time()
        vocab2idx, idx2vocab, nodes_frequencies = scan_vocabs(vocabs, idx_vocab_freq_file)
        logger.info('get frequencies completed in {}s'.format(time.time() - time_start))
    else:
        logger.info("get vocab and frequencies from file: {}".format(idx_vocab_freq_file))
        time_start = time.time()
        vocab2idx = {}
        idx2vocab = []
        nodes_frequencies = []
        # count = 0
        with open(idx_vocab_freq_file) as f_vocab:
            for line in f_vocab:
                line = line.strip()
                if not line:
                    # tolerate blank (e.g. trailing) lines
                    continue
                linelist = line.split(' ')
                idx = int(linelist[0])
                node = int(linelist[1])
                freq = float(linelist[2])
                vocab2idx[node] = idx
                # assert count == idx, "error, %d != %d" %(count,idx)
                # count += 1
                # idx2vocab is the inverse of vocab2idx, so it stores the node
                # (the previous code appended idx, which made the table the
                # identity).  NOTE(review): assumes the file lists idx in
                # ascending order starting at 0 -- confirm against scan_vocabs.
                idx2vocab.append(node)
                nodes_frequencies.append(freq)
        logger.info(
            'get vocab and frequencies completed in {}s'.format(time.time() - time_start))

    logger.info('get training examples ...')
    time_start = time.time()
    if sens is None:
        rets = scan_files_using_multiprocess(corpusfiles_list,
                                             max_num_workers=train_workers,
                                             func=get_examples_from_files,
                                             args=(vocab2idx, window_size))
    else:
        rets = [get_examples_from_sentences(sens, vocab2idx, window_size)]
    logger.info('get training examples completed in {}s'.format(time.time() - time_start))

    data = np.concatenate([item[0] for item in rets])
    labels = np.concatenate([item[1] for item in rets])
    # logger.info('total nodes: {}, total examples: {}'.format(len(nodes_frequencies), len(data)))
    # dataset = DataSet(data=data, labels=labels, shuffled= not options.unshuffled)
    return data, labels, idx2vocab, nodes_frequencies
def _split_train_ratio_infos(train_ratio_infos, num_workers):
    """Split the work items into at most ``num_workers`` contiguous chunks.

    With no more items than workers every item gets its own chunk; otherwise
    each worker receives ``len // num_workers`` items and the remainder is
    handed out one by one to the first chunks.
    """
    if len(train_ratio_infos) <= num_workers:
        return [[info] for info in train_ratio_infos]
    div = len(train_ratio_infos) // num_workers
    chunks = [
        train_ratio_infos[div * i:div * (i + 1)] for i in range(num_workers)
    ]
    for idx, info in enumerate(train_ratio_infos[div * num_workers:]):
        chunks[idx].append(info)
    return chunks


def _classify_in_process_pool(train_ratios_per_worker, max_workers):
    """Run ``_classify_thread_body`` on every chunk in a process pool and merge the results."""
    ret_list = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for ret in executor.map(_classify_thread_body,
                                train_ratios_per_worker):
            ret_list.extend(ret)
    return ret_list


def eval_once(options, net):
    """Run the link-prediction evaluation once and write a report file.

    Skips everything when ``utils.check_rebuild`` reports that
    ``options.link_prediction_path`` does not need rebuilding. Otherwise it
    loads features and edges, evaluates every (repeat, operator, train_ratio)
    combination through ``_classify_thread_body`` (in a process pool when
    ``options.eval_workers > 1`` and there is more than one combination), and
    writes the mean and per-repeat AUC values to
    ``options.link_prediction_path``.

    Args:
        options: configuration object; the attributes read are the ones
            logged in the header below, plus ``train_ratio`` and
            ``always_rebuild``.
        net: forwarded unchanged to ``load_features`` and ``load_edges``.

    Returns:
        None.
    """
    # NOTE(review): load_features / load_edges presumably populate these
    # module-level globals -- confirm against their definitions.
    global features_dict, true_edges_list_by_repeat, neg_edges_list_by_repeat
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: link_prediction...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t eval_edge_type: {}'.format(options.eval_edge_type))
    logger.info('\t save_path: {}\n'.format(options.link_prediction_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeated_times: {}'.format(options.repeated_times))
    logger.info('\t feature_operators: {}'.format(options.feature_operators))
    logger.info('\t sample_size: {}'.format(options.sample_size))

    time_start = time.time()
    load_features(options, net)
    load_edges(options, net)
    logger.info('\t total true edges size: {}'.format(
        len(true_edges_list_by_repeat[0])))
    logger.info('\t total neg edges size: {}'.format(
        len(neg_edges_list_by_repeat[0])))

    repeated_times = options.repeated_times
    # A positive train_ratio selects a single split; otherwise sweep a grid.
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [0.01, 0.05] + [v / 10.0 for v in range(1, 10)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # The context manager closes the report even if evaluation raises.
    with open(options.link_prediction_path, 'w') as fr:
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t eval_edge_type: {}\n'.format(options.eval_edge_type))
        fr.write('\t save_path: {}\n\n'.format(options.link_prediction_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t feature_operators: {}\n'.format(
            options.feature_operators))
        fr.write('\t repeated_times: {}\n'.format(options.repeated_times))
        fr.write('\t sample_size: {}\n'.format(options.sample_size))
        fr.write('\t total true edges size: {}\n'.format(
            len(true_edges_list_by_repeat[0])))
        fr.write('\t total neg edges size: {}\n'.format(
            len(neg_edges_list_by_repeat[0])))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))

        # One work item per (repeat, operator, train_ratio) combination.
        full_train_ratio_info_list = [
            (repeat, op, train_ratio)
            for repeat in range(repeated_times)
            for op in options.feature_operators
            for train_ratio in train_ratio_list
        ]

        if options.eval_workers > 1 and len(full_train_ratio_info_list) > 1:
            # Speed up by using multiple processes.
            train_ratios_per_worker = _split_train_ratio_infos(
                full_train_ratio_info_list, options.eval_workers)
            logger.info("\t using {} processes for evaling:".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))
            try:
                ret_list = _classify_in_process_pool(train_ratios_per_worker,
                                                     options.eval_workers)
            except Exception:
                # Retry once after a pause; Exception (not a bare except)
                # lets KeyboardInterrupt / SystemExit propagate.
                logger.warning("concurrent.futures.process failed, retry...")
                time.sleep(10)
                ret_list = _classify_in_process_pool(train_ratios_per_worker,
                                                     options.eval_workers)
        else:
            ret_list = _classify_thread_body(full_train_ratio_info_list)

        # Group AUC values by (train_ratio, operator).
        ret_dict = {}
        for train_ratio, op, auc in ret_list:
            ret_dict.setdefault((train_ratio, op), []).append(auc)

        for train_ratio in train_ratio_list:
            for op in options.feature_operators:
                fr.write('\n' + '-' * 20 + '\n' +
                         'train_ratio = {}, operator = {}\n'.format(
                             train_ratio, op))
                auc_list = ret_dict[(train_ratio, op)]
                if len(auc_list) != repeated_times:
                    logger.warning(
                        "warning: train_ratio={},operator={},, eval unmatched repeated_times: {} != {}"
                        .format(train_ratio, op, len(auc_list),
                                repeated_times))
                mean_auc = sum(auc_list) / float(len(auc_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(repeated_times, len(auc_list)))
                fr.write('\t\t AUC = {}\n'.format(mean_auc))
                fr.write('details:\n')
                for repeat, auc in enumerate(auc_list, start=1):
                    fr.write('\t repeated {}/{}: AUC = {}\n'.format(
                        repeat, len(auc_list), auc))

        fr.write('\neval case: link_prediction completed in {}s'.format(
            time.time() - time_start))

    logger.info('eval case: link_prediction completed in {}s'.format(
        time.time() - time_start))